From 455317481bed5e101d8dd3e560e31b01f93d5cd6 Mon Sep 17 00:00:00 2001
From: tiffanychu90
Date: Thu, 19 Sep 2024 16:42:53 +0000
Subject: [PATCH 01/11] run gtfs_funnel, hqta, open data for sep2024, remove
 private data suppression from hqta polygons because it's inherited from hqta
 points

---
 _shared_utils/shared_utils/rt_dates.py | 1 +
 gtfs_funnel/logs/download_data.log | 17 +++++++++++++++++
 gtfs_funnel/logs/download_vp_v2.log | 11 +++++++++++
 gtfs_funnel/logs/vp_preprocessing.log | 11 +++++++++++
 gtfs_funnel/update_vars.py | 2 +-
 .../D2_assemble_hqta_polygons.py | 9 +++------
 .../logs/hqta_processing.log | 8 ++++++++
 high_quality_transit_areas/update_vars.py | 2 +-
 open_data/create_stops_data.py | 2 +-
 open_data/update_vars.py | 2 +-
 10 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/_shared_utils/shared_utils/rt_dates.py b/_shared_utils/shared_utils/rt_dates.py
index 98691d675..bcafbfb74 100644
--- a/_shared_utils/shared_utils/rt_dates.py
+++ b/_shared_utils/shared_utils/rt_dates.py
@@ -63,6 +63,7 @@
     "jun2024": "2024-06-12",
     "jul2024": "2024-07-17",
     "aug2024": "2024-08-14",
+    "sep2024": "2024-09-18",
 }
 
 y2023_dates = [
diff --git a/gtfs_funnel/logs/download_data.log b/gtfs_funnel/logs/download_data.log
index 38fd4153f..938fbc549 100644
--- a/gtfs_funnel/logs/download_data.log
+++ b/gtfs_funnel/logs/download_data.log
@@ -516,3 +516,20 @@
 2024-08-15 09:09:27.480 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
 2024-08-15 09:11:56.577 | INFO | __main__:download_one_day:56 - execution time: 0:02:30.991910
 2024-08-15 10:30:38.864 | INFO | __main__:download_one_year:35 - execution time: 0:00:25.978363
+2024-09-19 08:13:46.511 | INFO | __main__:download_one_day:45 - Analysis date: 2024-09-18
+2024-09-19 08:13:49.222 | INFO | __main__:download_one_day:52 - # operators to run: 221
+2024-09-19 08:13:49.223 | INFO | __main__:download_one_day:56 - *********** Download trips data ***********
+2024-09-19 08:14:16.573 | INFO | __main__:download_one_day:86 - execution time: 0:00:30.061230
+2024-09-19 08:14:35.388 | INFO | __main__:download_one_day:22 - Analysis date: 2024-09-18
+2024-09-19 08:14:37.294 | INFO | __main__:download_one_day:29 - # operators to run: 221
+2024-09-19 08:14:37.294 | INFO | __main__:download_one_day:33 - *********** Download stops data ***********
+2024-09-19 08:14:47.392 | INFO | __main__:download_one_day:64 - execution time: 0:00:12.003376
+2024-09-19 08:15:03.834 | INFO | __main__:download_one_day:22 - Analysis date: 2024-09-18
+2024-09-19 08:15:05.784 | INFO | __main__:download_one_day:29 - # operators to run: 221
+2024-09-19 08:15:05.785 | INFO | __main__:download_one_day:33 - *********** Download routelines data ***********
+2024-09-19 08:16:57.558 | INFO | __main__:download_one_day:63 - execution time: 0:01:53.723521
+2024-09-19 08:17:14.221 | INFO | __main__:download_one_day:21 - Analysis date: 2024-09-18
+2024-09-19 08:17:15.854 | INFO | __main__:download_one_day:29 - # operators to run: 190
+2024-09-19 08:17:15.855 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
+2024-09-19 08:19:06.258 | INFO | __main__:download_one_day:56 - execution time: 0:01:52.036660
+2024-09-19 09:28:35.882 | INFO | __main__:download_one_year:35 - execution time: 0:00:45.388883
diff --git a/gtfs_funnel/logs/download_vp_v2.log b/gtfs_funnel/logs/download_vp_v2.log
index 987846da9..e81bf94b5 100644
--- a/gtfs_funnel/logs/download_vp_v2.log
+++ b/gtfs_funnel/logs/download_vp_v2.log
@@ -339,3 +339,14 @@
 2024-08-15 09:29:03.589 | INFO | __main__::112 - export concatenated vp: 0:04:16.418987
 2024-08-15 09:34:04.743 | INFO | __main__::134 - remove batched parquets
 2024-08-15 09:34:04.745 | INFO | __main__::137 - execution time: 0:09:26.469734
+2024-09-19 08:19:35.573 | INFO | __main__::148 - Analysis date: 2024-09-18
+2024-09-19 08:21:52.859 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:02:17.254015
+2024-09-19 08:23:01.583 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:01:08.722700
+2024-09-19 08:26:57.364 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:03:55.780573
+2024-09-19 08:28:55.328 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:01:57.952237
+2024-09-19 08:28:55.328 | INFO | __main__::155 - execution time: 0:09:19.722825
+2024-09-19 08:29:19.967 | INFO | __main__::97 - Analysis date: 2024-09-18
+2024-09-19 08:29:38.182 | INFO | __main__::105 - concat and filter batched data: 0:00:18.208902
+2024-09-19 08:33:43.251 | INFO | __main__::112 - export concatenated vp: 0:04:05.069147
+2024-09-19 08:37:30.865 | INFO | __main__::134 - remove batched parquets
+2024-09-19 08:37:30.865 | INFO | __main__::137 - execution time: 0:08:10.892310
diff --git a/gtfs_funnel/logs/vp_preprocessing.log b/gtfs_funnel/logs/vp_preprocessing.log
index ccb836743..7b9dddf71 100644
--- a/gtfs_funnel/logs/vp_preprocessing.log
+++ b/gtfs_funnel/logs/vp_preprocessing.log
@@ -200,3 +200,14 @@
 2024-08-15 10:05:01.848 | INFO | __main__::235 - vp with dwell time 2024-08-14: 0:07:09.680694
 2024-08-15 10:13:16.657 | INFO | __main__::120 - 2024-08-14: condense vp for trip 0:07:51.642337
 2024-08-15 10:24:50.802 | INFO | __main__::128 - 2024-08-14: prepare vp to use in nearest neighbor: 0:11:34.144491
+2024-09-19 08:46:17.298 | INFO | __main__::169 - 2024-09-18: pare down vp: 0:02:12.746302
+2024-09-19 08:51:10.542 | INFO | __main__:attach_prior_vp_add_direction:90 - persist vp gddf: 0:04:35.313281
+2024-09-19 08:55:04.346 | INFO | __main__:attach_prior_vp_add_direction:122 - np vectorize arrays for direction: 0:03:53.804190
+2024-09-19 08:55:11.908 | INFO | __main__::194 - 2024-09-18: export vp direction: 0:08:36.678934
+2024-09-19 08:56:33.980 | INFO | __main__::200 - 2024-09-18: export usable vp with direction: 0:01:22.071985
+2024-09-19 08:56:33.981 | INFO | __main__::203 - 2024-09-18: vp_direction script execution time: 0:09:58.750919
+2024-09-19 09:01:58.870 | INFO | __main__::212 - compute dwell df: 0:04:44.983561
+2024-09-19 09:03:13.198 | INFO | __main__::234 - merge with original and export: 0:01:14.327719
+2024-09-19 09:03:13.200 | INFO | __main__::235 - vp with dwell time 2024-09-18: 0:05:59.311280
+2024-09-19 09:08:43.742 | INFO | __main__::120 - 2024-09-18: condense vp for trip 0:05:09.575132
+2024-09-19 09:20:16.936 | INFO | __main__::128 - 2024-09-18: prepare vp to use in nearest neighbor: 0:11:33.194871
diff --git a/gtfs_funnel/update_vars.py b/gtfs_funnel/update_vars.py
index cf98ac2c1..e59bd1b13 100644
--- a/gtfs_funnel/update_vars.py
+++ b/gtfs_funnel/update_vars.py
@@ -11,7 +11,7 @@
 )
 
-analysis_date_list = [rt_dates.DATES["aug2024"]]
+analysis_date_list = [rt_dates.DATES["sep2024"]]
 
 GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
diff --git a/high_quality_transit_areas/D2_assemble_hqta_polygons.py b/high_quality_transit_areas/D2_assemble_hqta_polygons.py
index 7d68be922..1270c630a 100644
--- a/high_quality_transit_areas/D2_assemble_hqta_polygons.py
+++ b/high_quality_transit_areas/D2_assemble_hqta_polygons.py
@@ -14,8 +14,7 @@
 import C1_prep_pairwise_intersections as prep_clip
 import D1_assemble_hqta_points as assemble_hqta_points
 from calitp_data_analysis import utils, geography_utils
-from D1_assemble_hqta_points import (EXPORT_PATH, add_route_info)
-from shared_utils import gtfs_utils_v2
+from D1_assemble_hqta_points import EXPORT_PATH, add_route_info
 from update_vars import GCS_FILE_PATH, analysis_date, PROJECT_CRS
 
 catalog = intake.open_catalog("*.yml")
@@ -108,9 +107,7 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
     """
     Drop extra columns, get sorting done.
     Used to drop bad stops, but these all look ok.
-    """
-    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()
-    
+    """
     keep_cols = [
         "agency_primary", "agency_secondary",
         "hqta_type", "hqta_details", "route_id",
@@ -121,7 +118,7 @@
 
     # Drop bad stops, subset columns
     gdf2 = (
-        gdf[gdf.schedule_gtfs_dataset_key.isin(public_feeds)][keep_cols]
+        gdf[keep_cols]
        .drop_duplicates()
        .sort_values(["hqta_type", "agency_primary",
                      "agency_secondary",
diff --git a/high_quality_transit_areas/logs/hqta_processing.log b/high_quality_transit_areas/logs/hqta_processing.log
index 42d1a4082..66aa62c21 100644
--- a/high_quality_transit_areas/logs/hqta_processing.log
+++ b/high_quality_transit_areas/logs/hqta_processing.log
@@ -134,3 +134,11 @@
 2024-08-15 10:13:09.237 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-08-14 execution time: 0:00:39.499192
 2024-08-15 10:14:06.867 | INFO | __main__::297 - D1_assemble_hqta_points 2024-08-14 execution time: 0:00:34.144908
 2024-08-15 10:15:08.381 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-08-14 execution time: 0:00:37.945649
+2024-09-19 09:22:34.969 | INFO | __main__::354 - A1_rail_ferry_brt_stops 2024-09-18 execution time: 0:01:17.399227
+2024-09-19 09:28:56.027 | INFO | __main__::249 - B1_create_hqta_segments execution time: 0:05:58.743890
+2024-09-19 09:30:06.095 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-09-18 execution time: 0:00:49.770142
+2024-09-19 09:30:32.509 | INFO | __main__::142 - C1_prep_pairwise_intersections 2024-09-18 execution time: 0:00:08.451303
+2024-09-19 09:31:37.803 | INFO | __main__::125 - C2_find_intersections 2024-09-18 execution time: 0:00:37.681126
+2024-09-19 09:32:36.082 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-09-18 execution time: 0:00:37.486499
+2024-09-19 09:33:22.863 | INFO | __main__::307 - D1_assemble_hqta_points 2024-09-18 execution time: 0:00:27.668799
+2024-09-19 09:36:35.489 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-09-18 execution time: 0:00:26.678607
diff --git a/high_quality_transit_areas/update_vars.py b/high_quality_transit_areas/update_vars.py
index 46046c8be..5de2605d6 100644
--- a/high_quality_transit_areas/update_vars.py
+++ b/high_quality_transit_areas/update_vars.py
@@ -1,6 +1,6 @@
 from shared_utils import rt_dates
 
-analysis_date = rt_dates.DATES["aug2024"]
+analysis_date = rt_dates.DATES["sep2024"]
 
 GCS_FILE_PATH = ("gs://calitp-analytics-data/data-analyses/"
                  "high_quality_transit_areas/")
diff --git a/open_data/create_stops_data.py b/open_data/create_stops_data.py
index 0a4fccea1..aa43e3442 100644
--- a/open_data/create_stops_data.py
+++ b/open_data/create_stops_data.py
@@ -105,7 +105,7 @@ def create_stops_file_for_export(date: str) -> gpd.GeoDataFrame:
     stop_times = helpers.import_scheduled_stop_times(
         date,
         columns = prep_traffic_ops.keep_stop_time_cols,
-        get_panda = True
+        get_pandas = True
     )
 
     stops_assembled = attach_route_info_to_stops(stops, trips, stop_times)
diff --git a/open_data/update_vars.py b/open_data/update_vars.py
index b68e3d2bb..b8364cd71 100644
--- a/open_data/update_vars.py
+++ b/open_data/update_vars.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from shared_utils import rt_dates
 
-analysis_date = rt_dates.DATES["aug2024"]
+analysis_date = rt_dates.DATES["sep2024"]
 
 GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
 COMPILED_CACHED_VIEWS = f"{GCS_FILE_PATH}rt_delay/compiled_cached_views/"

From 9726684a062a2a4d4dc2efa348d8a9fd822a604e Mon Sep 17 00:00:00 2001
From: tiffanychu90
Date: Thu, 19 Sep 2024 17:33:23 +0000
Subject: [PATCH 02/11] initial publish of operators by most recent date

---
 gtfs_funnel/Makefile | 3 +-
 gtfs_funnel/published_operators.yml | 235 ++++++++++++++++++++++++++++
 gtfs_funnel/track_publish_dates.py | 85 ++++++++++
 gtfs_funnel/update_vars.py | 4 +-
 4 files changed, 325 insertions(+), 2 deletions(-)
 create mode 100644 gtfs_funnel/published_operators.yml
 create mode 100644 gtfs_funnel/track_publish_dates.py

diff --git a/gtfs_funnel/Makefile b/gtfs_funnel/Makefile
index 1389e9011..6fb981b36 100644
--- a/gtfs_funnel/Makefile
+++ b/gtfs_funnel/Makefile
@@ -30,7 +30,8 @@ route_typologies_data:
 # Clean route names for displaying across time
 timeseries_preprocessing:
 	python clean_route_naming.py
-
+	python track_publish_dates.py
+
 # monthly scheduled service, download after the end of each month
 monthly_scheduled_data:
 	python download_monthly_service.py
diff --git a/gtfs_funnel/published_operators.yml b/gtfs_funnel/published_operators.yml
new file mode 100644
index 000000000..a29fa245e
--- /dev/null
+++ b/gtfs_funnel/published_operators.yml
@@ -0,0 +1,235 @@
+2024-09-18:
+ - Alhambra Schedule
+ - Amador Schedule
+ - Anaheim Resort Schedule
+ - Anaheim Resort Schedule v2
+ - Antelope Valley Transit Authority Schedule
+ - Arcadia Schedule
+ - Arvin Schedule
+ - Auburn Schedule
+ - B-Line Schedule
+ - Baldwin Park Schedule
+ - Banning Pass Schedule
+ - Bay Area 511 AC Transit Schedule
+ - Bay Area 511 ACE Schedule
+ - Bay Area 511 Angel Island-Tiburon Ferry Schedule
+ - Bay Area 511 BART Schedule
+ - Bay Area 511 Caltrain Schedule
+ - Bay Area 511 Capitol Corridor Schedule
+ - Bay Area 511 Commute.org Schedule
+ - Bay Area 511 County Connection Schedule
+ - Bay Area 511 Dumbarton Express Schedule
+ - Bay Area 511 Emery Go-Round Schedule
+ - Bay Area 511 Fairfield and Suisun Transit Schedule
+ - Bay Area 511 Golden Gate Ferry Schedule
+ - Bay Area 511 Golden Gate Transit Schedule
+ - Bay Area 511 MVGO Schedule
+ - Bay Area 511 Marin Schedule
+ - Bay Area 511 Mission Bay Schedule
+ - Bay Area 511 Muni Schedule
+ - Bay Area 511 Petaluma Schedule
+ - Bay Area 511 Rio Vista Delta Breeze Schedule
+ - Bay Area 511 SFO AirTrain Schedule
+ - Bay Area 511 SamTrans Schedule
+ - Bay Area 511 San Francisco Bay Ferry Schedule
+ - Bay Area 511 Santa Clara Transit Schedule
+ - Bay Area 511 Santa Rosa CityBus Schedule
+ - Bay Area 511 SolTrans Schedule
+ - Bay Area 511 Sonoma County Transit Schedule
+ - Bay Area 511 Sonoma-Marin Area Rail Transit Schedule
+ - Bay Area 511 South San Francisco Shuttle Schedule
+ - Bay Area 511 Treasure Island Ferry Schedule
+ - Bay Area 511 Tri Delta Schedule
+ - Bay Area 511 Tri-Valley Wheels Schedule
+ - Bay Area 511 Union City Transit Schedule
+ - Bay Area 511 Vacaville City Coach Schedule
+ - Bay Area 511 Vine Transit Schedule
+ - Bay Area 511 WestCAT Schedule
+ - Beach Cities GMV Schedule
+ - Bear Schedule
+ - Beaumont Pass Schedule
+ - Bell Gardens Schedule
+ - Bellflower Bus Schedule
+ - Big Blue Bus Schedule
+ - Big Blue Bus Swiftly Schedule
+ - BruinBus Schedule
+ - Burbank Schedule
+ - Calabasas Schedule
+ - Calaveras Schedule
+ - Cerritos on Wheels Schedule
+ - Cerritos on Wheels Website Schedule
+ - Clean Air Express Schedule
+ - Clovis Schedule
+ - Commerce Schedule
+ - Corona Schedule
+ - County Express Schedule
+ - Cudahy Schedule
+ - Culver City Schedule
+ - Curry Public Transit Schedule
+ - Dana Point Trolley Schedule
+ - Delano Schedule
+ - Desert Roadrunner GMV Schedule
+ - Desert Roadrunner Schedule
+ - DowneyLINK GMV Schedule
+ - Eastern Sierra Schedule
+ - El Dorado Schedule
+ - El Monte Schedule
+ - Elk Grove Schedule
+ - Flixbus Schedule
+ - Foothill Schedule
+ - Fresno County Schedule
+ - Fresno Schedule
+ - G Trans Schedule
+ - GET Schedule
+ - Get Around Town Express Schedule
+ - Glendale Schedule
+ - Glendora Schedule
+ - Glenn Schedule
+ - Go West Schedule
+ - Grapeline Schedule
+ - Guadalupe Flyer Schedule
+ - Havasu Landing Ferry Schedule
+ - Humboldt Schedule
+ - Huntington Schedule
+ - Imperial Valley Transit Schedule
+ - Inglewood Schedule
+ - Irvine CONNECT Schedule
+ - Kern Schedule
+ - Kings Schedule
+ - LA DOT Schedule
+ - LA Metro Bus Schedule
+ - LA Metro Rail Schedule
+ - LADPW Schedule
+ - LAX FlyAway Schedule
+ - LAX Flyaway Bus Schedule
+ - LAX Shuttles Schedule
+ - La Campana Schedule
+ - La Puente Schedule
+ - Laguna Beach Schedule
+ - Lake Schedule
+ - Lassen Schedule
+ - Lawndale Beat GMV Schedule
+ - Lawndale Schedule
+ - Lompoc Schedule
+ - Long Beach Schedule
+ - Lynwood Schedule IPS
+ - MV Shuttle Schedule
+ - Madera County Connection Schedule
+ - Madera Metro Schedule
+ - Mariposa Grove Shuttle Schedule
+ - Maywood Schedule
+ - Mendocino Schedule
+ - Merced GMV Schedule
+ - Merced Schedule
+ - Metrolink Schedule
+ - Montebello Schedule
+ - Monterey Salinas Schedule
+ - Morongo Basin Schedule
+ - Morro Bay Cal-ITP Schedule
+ - Mountain Transit GMV Schedule
+ - Mountain Transit Schedule
+ - Needles Schedule
+ - Nevada County Schedule
+ - North County Schedule
+ - Norwalk Avail Schedule
+ - OCTA Schedule
+ - OmniTrans Schedule
+ - Oregon POINT
+ - Palos Verdes PTA Schedule
+ - Pasadena Schedule
+ - Placer Schedule
+ - Plumas Schedule
+ - PresidiGo Schedule
+ - Redding Schedule
+ - Redwood Coast Schedule
+ - Riverside Schedule
+ - Rosemead Passio Schedule
+ - Roseville Schedule
+ - Roseville Transit GMV Schedule
+ - SBMTD Schedule
+ - SLO Schedule
+ - SLORTA Schedule
+ - Sage Stage Schedule
+ - San Clemente Trolley Schedule
+ - San Diego Schedule
+ - San Fernando Schedule
+ - San Joaquin Schedule
+ - San Juan Capistrano Trolley Schedule
+ - Santa Clarita Schedule
+ - Santa Maria Schedule
+ - Santa Ynez Mecatran Schedule
+ - Sierra Madre Schedule
+ - Siskiyou Schedule
+ - South County Transit Link Schedule
+ - South San Francisco Schedule
+ - Spirit Bus Passio Schedule
+ - StanRTA Schedule
+ - Stanford Schedule
+ - SunLine Avail Schedule
+ - 'TART, North Lake Tahoe Schedule'
+ - TCRTA TripShot Schedule
+ - Tahoe Transportation District GMV Schedule
+ - Tahoe Transportation District Schedule
+ - Tehama Schedule
+ - Torrance Schedule
+ - Tracy Schedule
+ - Trinity Schedule
+ - Tuolumne Remix Schedule
+ - Turlock Schedule
+ - UCSC Schedule
+ - Unitrans Schedule
+ - VCTC GMV Schedule
+ - Victor Valley GMV Schedule
+ - Victor Valley Schedule
+ - Visalia Schedule
+ - WeHo Schedule
+ - YARTS Schedule
+ - Yolobus Schedule
+ - Yosemite Valley Shuttle Schedule
+ - Yuba-Sutter Schedule
+ - Yuma Schedule
+ - eTrans Schedule
+2024-08-14:
+ - Santa Cruz Schedule
+2024-06-12:
+ - Anteater Express Schedule
+ - Lassen Flex
+ - Lynwood Schedule
+ - Manteca Schedule
+2024-05-22:
+ - El Segundo Schedule
+ - Redwood Coast Schedulel
+2024-04-17:
+ - Sacramento Schedule
+2024-03-13:
+ - Avalon Schedule
+2024-02-14:
+ - Rosemead Schedule
+2023-12-13:
+ - DowneyLINK Schedule
+ - Humboldt Flex
+ - Laguna Beach Flex
+ - Manteca Flex
+ - Placer Flex
+ - San Joaquin Flex
+ - Spirit Bus Schedule
+ - StanRTA Flex
+ - TART Flex
+ - Thousand Oaks Flex
+ - Tracy Flex
+ - Turlock Flex
+ - Union City Flex
+ - VCTC Flex
+ - WestCAT Flex
+2023-11-15:
+ - Amtrak Schedule
+ - Mission Bay Schedule
+2023-08-15:
+ - Blossom Express Schedule
+ - Eastern Sierra Flex
+2023-06-14:
+ - Tuolumne Schedule
+2023-04-12:
+ - Guadalupe Flex
+2023-03-15:
+ - TIME GMV Schedule
diff --git a/gtfs_funnel/track_publish_dates.py b/gtfs_funnel/track_publish_dates.py
new file mode 100644
index 000000000..4cf6d90e1
--- /dev/null
+++ b/gtfs_funnel/track_publish_dates.py
@@ -0,0 +1,85 @@
+"""
+Grab all the operators by service date from
+saved scheduled_trips tables from GCS.
+
+Create a yaml that tells us the most recent
+date available for each operator (schedule_gtfs_dataset_name).
+"""
+import pandas as pd
+import pyaml # use pyaml because it gets us prettier indents than yaml
+
+from pathlib import Path
+from typing import Union
+
+from shared_utils import rt_dates
+from segment_speed_utils import time_series_utils
+
+def filter_to_recent_date(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    By schedule_gtfs_dataset_name, keep the most recent
+    service_date that shows up in scheduled trips.
+    """
+    df2 = (df.groupby("name", group_keys=False)
+           .service_date
+           .max()
+           .reset_index()
+           .sort_values(["service_date", "name"], ascending=[False, True])
+           .reset_index(drop=True)
+           .astype({"service_date": "str"})
+          )
+    return df2
+
+def export_results_yml(
+    df: pd.DataFrame,
+    export_yaml: Union[str, Path]
+):
+    """
+    Save out our results from df.
+    Convert df into a dictionary and save out dictionary results as yaml.
+    """
+    # TODO: check this list manually and there will be some
+    # operator names that have more recent names that we are keeping,
+    # so we can remove these from our yaml
+    exclude_me = [
+        "TIME GMV"
+    ]
+
+    df2 = df[~df.name.isin(exclude_me)]
+
+    my_dict = {
+        **{
+            date_key: df2[df2.service_date==date_key].name.tolist()
+            for date_key in df2.service_date.unique()
+        }
+    }
+
+    # sort_keys=False to prevent alphabetical sort (earliest date first)
+    # because we want to maintain our results and yaml with most recent date first
+    output = pyaml.dump(my_dict, sort_keys=False)
+
+    with open(export_yaml, "w") as f:
+        f.write(output)
+
+    print(f"{export_yaml} exported")
+
+    return
+
+
+if __name__ == "__main__":
+
+    from update_vars import (GTFS_DATA_DICT,
+                             COMPILED_CACHED_VIEWS,
+                             PUBLISHED_OPERATORS_YAML)
+
+    TABLE = GTFS_DATA_DICT.schedule_downloads.trips
+
+    operators = time_series_utils.concatenate_datasets_across_dates(
+        COMPILED_CACHED_VIEWS,
+        TABLE,
+        rt_dates.y2024_dates + rt_dates.y2023_dates,
+        data_type = "df",
+        get_pandas = True,
+        columns = ["name"]
+    ).drop_duplicates().pipe(filter_to_recent_date)
+
+    export_results_yml(operators, PUBLISHED_OPERATORS_YAML)
\ No newline at end of file
diff --git a/gtfs_funnel/update_vars.py b/gtfs_funnel/update_vars.py
index e59bd1b13..02828ae30 100644
--- a/gtfs_funnel/update_vars.py
+++ b/gtfs_funnel/update_vars.py
@@ -21,4 +21,6 @@
 SCHED_GCS = GTFS_DATA_DICT.gcs_paths.SCHED_GCS
 SHARED_GCS = GTFS_DATA_DICT.gcs_paths.SHARED_GCS
 
-ntd_latest_year = 2022
\ No newline at end of file
+ntd_latest_year = 2022
+
+PUBLISHED_OPERATORS_YAML = "published_operators.yml"
\ No newline at end of file

From 9d4c93ff30b85ee0acc30a1cd9ff6c8984d1c40e Mon Sep 17 00:00:00 2001
From: tiffanychu90
Date: Thu, 19 Sep 2024 19:34:04 +0000
Subject: [PATCH 03/11] add export path to update_vars for hqta

---
 .../check2_hq_corridors.ipynb | 6 ++---
 .../check3_hqta_points.ipynb | 26 +++++++++++++++++--
 high_quality_transit_areas/update_vars.py | 3 ++-
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/high_quality_transit_areas/check2_hq_corridors.ipynb b/high_quality_transit_areas/check2_hq_corridors.ipynb
index c4440b9b0..eba631e67 100644
--- a/high_quality_transit_areas/check2_hq_corridors.ipynb
+++ b/high_quality_transit_areas/check2_hq_corridors.ipynb
@@ -10,7 +10,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "ecdd335a-be94-4a11-aaca-24a43a3b9756",
    "metadata": {},
    "outputs": [],
@@ -34,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "4fa137db-08d5-4822-9bdd-46919ee0da7f",
    "metadata": {},
    "outputs": [],
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "1e383a39-3810-41f6-a366-985126b335db",
    "metadata": {},
    "outputs": [],
diff --git a/high_quality_transit_areas/check3_hqta_points.ipynb b/high_quality_transit_areas/check3_hqta_points.ipynb
index 42e7d19f6..7869c49f4 100644
--- a/high_quality_transit_areas/check3_hqta_points.ipynb
+++ b/high_quality_transit_areas/check3_hqta_points.ipynb
@@ -52,7 +52,7 @@
    "    if \"service_date\" in gdf.columns:\n",
    "        gdf = gdf.drop(columns = \"service_date\")\n",
    "    \n",
-   "    m = gdf.explore(plot_col, categorical=True, tiles = TILES)\n",
+   "    m = gdf.explore(plot_col, categorical=True, tiles = TILES, legend=True)\n",
    "    \n",
    "    display(m)"
   ]
@@ -171,7 +171,29 @@
    "id": "41dcbec2-16a9-4d56-9f30-9b2e83bd2741",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
"gdf[gdf.agency_primary.str.contains(\"Santa Monica\")].base64_url_primary.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ace4244c-b351-4259-b8bc-75e1d0105a58", + "metadata": {}, + "outputs": [], + "source": [ + "gdf[gdf.agency_primary.str.contains(\"Santa Monica\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2029272f-7d46-4d11-9918-357c43253128", + "metadata": {}, + "outputs": [], + "source": [ + "agency_primary" + ] } ], "metadata": { diff --git a/high_quality_transit_areas/update_vars.py b/high_quality_transit_areas/update_vars.py index 5de2605d6..dffb41557 100644 --- a/high_quality_transit_areas/update_vars.py +++ b/high_quality_transit_areas/update_vars.py @@ -7,4 +7,5 @@ TEMP_GCS = f"{GCS_FILE_PATH}temp/" PROJECT_CRS = "EPSG:3310" HQTA_SEGMENT_LENGTH = 1_250 # meters -BUFFER_METERS = 50 \ No newline at end of file +BUFFER_METERS = 50 +EXPORT_PATH = f"{GCS_FILE_PATH}export/{analysis_date}/" \ No newline at end of file From 606000e2bb15a67a2c9966d3796b3e95a406a3ea Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Thu, 19 Sep 2024 19:48:35 +0000 Subject: [PATCH 04/11] run segment speeds for sep2024 --- rt_segment_speeds/logs/avg_speeds.log | 9 +++++++++ rt_segment_speeds/logs/cut_stop_segments.log | 2 ++ rt_segment_speeds/logs/interpolate_stop_arrival.log | 2 ++ rt_segment_speeds/logs/nearest_vp.log | 6 ++++++ rt_segment_speeds/logs/speeds_by_segment_trip.log | 3 +++ rt_segment_speeds/segment_speed_utils/project_vars.py | 2 +- 6 files changed, 23 insertions(+), 1 deletion(-) diff --git a/rt_segment_speeds/logs/avg_speeds.log b/rt_segment_speeds/logs/avg_speeds.log index dd0382eaf..816292e0c 100644 --- a/rt_segment_speeds/logs/avg_speeds.log +++ b/rt_segment_speeds/logs/avg_speeds.log @@ -446,3 +446,12 @@ 2024-08-15 12:49:58.589 | INFO | average_segment_speeds:single_day_segment_averages:173 - shape seg avg 0:05:18.197146 2024-08-15 12:53:37.977 | INFO | average_segment_speeds:single_day_segment_averages:189 - route dir seg avg 0:03:39.388630 2024-08-15 12:53:37.978 | INFO | average_segment_speeds:single_day_segment_averages:190 - single day segment 2024-08-14 execution time: 0:08:57.585776 +2024-09-19 11:41:23.528 | INFO | __main__:single_day_segment_averages:173 - shape seg avg 0:04:35.831329 +2024-09-19 11:44:21.408 | INFO | __main__:single_day_segment_averages:189 - route dir seg avg 0:02:57.880787 +2024-09-19 11:44:21.409 | INFO | __main__:single_day_segment_averages:190 - single day segment 2024-09-18 execution time: 0:07:33.712116 +2024-09-19 12:25:35.942 | INFO | __main__:single_day_summary_averages:90 - trip avg 0:00:15.246328 +2024-09-19 12:25:48.332 | INFO | __main__:single_day_summary_averages:132 - route dir avg: 0:00:12.390061 +2024-09-19 12:25:48.333 | INFO | __main__:single_day_summary_averages:133 - single day summary speed 2024-09-18 execution time: 0:00:27.636389 +2024-09-19 12:42:38.061 | INFO | average_segment_speeds:single_day_segment_averages:173 - shape seg avg 0:05:30.126380 +2024-09-19 12:46:50.506 | INFO | average_segment_speeds:single_day_segment_averages:189 - route dir seg avg 0:04:12.445389 +2024-09-19 12:46:50.507 | INFO | average_segment_speeds:single_day_segment_averages:190 - single day segment 2024-09-18 execution time: 0:09:42.571769 diff --git a/rt_segment_speeds/logs/cut_stop_segments.log b/rt_segment_speeds/logs/cut_stop_segments.log index 655030839..39bf1068a 100644 --- a/rt_segment_speeds/logs/cut_stop_segments.log +++ b/rt_segment_speeds/logs/cut_stop_segments.log @@ -45,3 
+45,5 @@ 2024-07-30 21:16:51.780 | INFO | __main__::244 - speedmap segments and proxy_stop_times 2023-04-16: 0:02:19.401564 2024-08-15 10:55:47.957 | INFO | __main__::155 - cut segments 2024-08-14: 0:21:24.282441 2024-08-15 11:01:37.861 | INFO | __main__::244 - speedmap segments and proxy_stop_times 2024-08-14: 0:04:20.718384 +2024-09-19 10:45:10.417 | INFO | __main__::155 - cut segments 2024-09-18: 0:22:12.922031 +2024-09-19 10:51:18.211 | INFO | __main__::244 - speedmap segments and proxy_stop_times 2024-09-18: 0:04:36.568172 diff --git a/rt_segment_speeds/logs/interpolate_stop_arrival.log b/rt_segment_speeds/logs/interpolate_stop_arrival.log index 1b3b751e4..29cc7fc4a 100644 --- a/rt_segment_speeds/logs/interpolate_stop_arrival.log +++ b/rt_segment_speeds/logs/interpolate_stop_arrival.log @@ -99,3 +99,5 @@ 2024-08-15 11:46:45.773 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for stop_segments 2024-08-14: 2024-08-14: 0:13:26.403842 2024-08-15 12:31:36.711 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for rt_stop_times 2024-08-14: 2024-08-14: 0:14:13.913489 2024-08-15 12:42:32.459 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for speedmap_segments 2024-08-14: 2024-08-14: 0:02:27.666741 +2024-09-19 11:34:46.012 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for stop_segments 2024-09-18: 2024-09-18: 0:15:34.067479 +2024-09-19 12:22:50.153 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for rt_stop_times 2024-09-18: 2024-09-18: 0:15:01.401473 diff --git a/rt_segment_speeds/logs/nearest_vp.log b/rt_segment_speeds/logs/nearest_vp.log index ff4f162c2..bcb9357da 100644 --- a/rt_segment_speeds/logs/nearest_vp.log +++ b/rt_segment_speeds/logs/nearest_vp.log @@ -201,3 +201,9 @@ 2024-08-15 12:17:22.737 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for rt_stop_times 2024-08-14: 0:09:34.224293 2024-08-15 12:36:39.952 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for speedmap_segments 2024-08-14: 0:02:25.873838 2024-08-15 12:40:04.733 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for speedmap_segments 2024-08-14: 0:03:24.363193 +2024-09-19 11:06:22.823 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for stop_segments 2024-09-18: 0:13:00.998435 +2024-09-19 11:19:11.854 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for stop_segments 2024-09-18: 0:09:50.183557 +2024-09-19 11:57:34.800 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for rt_stop_times 2024-09-18: 0:12:19.977712 +2024-09-19 12:07:48.692 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for rt_stop_times 2024-09-18: 0:10:11.973530 +2024-09-19 12:28:39.454 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for speedmap_segments 2024-09-18: 0:02:33.742427 +2024-09-19 12:32:09.310 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for speedmap_segments 2024-09-18: 0:03:29.417591 diff --git a/rt_segment_speeds/logs/speeds_by_segment_trip.log b/rt_segment_speeds/logs/speeds_by_segment_trip.log index a7e368136..18527e3c9 100644 --- a/rt_segment_speeds/logs/speeds_by_segment_trip.log +++ b/rt_segment_speeds/logs/speeds_by_segment_trip.log @@ -110,3 +110,6 @@ 2024-08-15 11:48:10.483 | INFO | 
stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for stop_segments 2024-08-14: 0:01:24.614156 2024-08-15 12:33:12.660 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for rt_stop_times 2024-08-14: 0:01:35.909290 2024-08-15 12:44:22.889 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for speedmap_segments 2024-08-14: 0:01:41.398512 +2024-09-19 11:36:29.235 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for stop_segments 2024-09-18: 0:01:43.166792 +2024-09-19 12:25:01.693 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for rt_stop_times 2024-09-18: 0:02:11.499303 +2024-09-19 12:36:50.740 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for speedmap_segments 2024-09-18: 0:01:46.975907 diff --git a/rt_segment_speeds/segment_speed_utils/project_vars.py b/rt_segment_speeds/segment_speed_utils/project_vars.py index 7d375c817..3c9f7a259 100644 --- a/rt_segment_speeds/segment_speed_utils/project_vars.py +++ b/rt_segment_speeds/segment_speed_utils/project_vars.py @@ -11,7 +11,7 @@ SHARED_GCS = GTFS_DATA_DICT.gcs_paths.SHARED_GCS PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS -analysis_date = rt_dates.DATES["aug2024"] +analysis_date = rt_dates.DATES["sep2024"] oct2023_week = rt_dates.get_week("oct2023", exclude_wed=True) apr2023_week = rt_dates.get_week("apr2023", exclude_wed=True) From 4145dd4357535730b210615d79efc6df06427e25 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Thu, 19 Sep 2024 20:22:39 +0000 Subject: [PATCH 05/11] rt_vs_sched for sep2024 --- rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log | 1 + rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log | 3 +++ rt_scheduled_v_ran/scripts/update_vars.py | 3 +-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log b/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log index 34acf4234..637e03c7d 100644 --- a/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log +++ b/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log @@ -69,3 +69,4 @@ 2024-08-05 10:46:49.044 | INFO | __main__:route_metrics:84 - route aggregation 2024-07-17: 0:00:03.060244 2024-08-05 10:49:43.399 | INFO | __main__:route_metrics:84 - route aggregation 2024-07-17: 0:00:02.982204 2024-08-15 13:24:21.737 | INFO | __main__:route_metrics:84 - route aggregation 2024-08-14: 0:00:02.641057 +2024-09-19 13:19:02.357 | INFO | __main__:route_metrics:84 - route aggregation 2024-09-18: 0:00:02.698805 diff --git a/rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log b/rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log index 1032c2ed7..4cc78f5a0 100644 --- a/rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log +++ b/rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log @@ -450,3 +450,6 @@ 2024-08-15 12:57:49.299 | INFO | __main__:rt_schedule_trip_metrics:280 - tabular trip metrics 2024-08-14: 0:02:33.455269 2024-08-15 13:22:11.674 | INFO | __main__:rt_schedule_trip_metrics:285 - spatial trip metrics 2024-08-14: 0:24:22.374322 2024-08-15 13:23:57.590 | INFO | __main__:rt_schedule_trip_metrics:333 - Total run time for metrics on 2024-08-14: 0:28:41.746058 +2024-09-19 12:52:30.501 | INFO | __main__:rt_schedule_trip_metrics:280 - tabular trip metrics 2024-09-18: 0:02:49.593356 +2024-09-19 13:16:44.431 | INFO | __main__:rt_schedule_trip_metrics:285 - 
+2024-09-19 13:18:42.287 | INFO | __main__:rt_schedule_trip_metrics:333 - Total run time for metrics on 2024-09-18: 0:29:01.379486
diff --git a/rt_scheduled_v_ran/scripts/update_vars.py b/rt_scheduled_v_ran/scripts/update_vars.py
index a63703256..c29e2d583 100644
--- a/rt_scheduled_v_ran/scripts/update_vars.py
+++ b/rt_scheduled_v_ran/scripts/update_vars.py
@@ -5,8 +5,7 @@
 apr2023_week = rt_dates.get_week("apr2023", exclude_wed=True)
 apr2024_week = rt_dates.get_week("apr2024", exclude_wed=True)
 
-analysis_date_list = [
-    rt_dates.DATES["aug2024"]]
+analysis_date_list = [rt_dates.DATES["sep2024"]]
 
 GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

From a9f622babc7b226bca4a1b6063efecda46d53d97 Mon Sep 17 00:00:00 2001
From: tiffanychu90
Date: Thu, 19 Sep 2024 20:47:48 +0000
Subject: [PATCH 06/11] (refactor): hqta major transit stop compilation

---
 .../A1_rail_ferry_brt_stops.py | 328 +++++------
 .../check1_downloads.ipynb | 174 +---------
 2 files changed, 129 insertions(+), 373 deletions(-)

diff --git a/high_quality_transit_areas/A1_rail_ferry_brt_stops.py b/high_quality_transit_areas/A1_rail_ferry_brt_stops.py
index 9705610f9..a35aa0524 100644
--- a/high_quality_transit_areas/A1_rail_ferry_brt_stops.py
+++ b/high_quality_transit_areas/A1_rail_ferry_brt_stops.py
@@ -1,9 +1,6 @@
 """
-Download rail, ferry, BRT stops.
-Export combined rail/ferry/BRT data into GCS.
-
-Clean up the combined rail/ferry/BRT points
-and get it ready to be combined with other bus-related points.
+Assemble major transit stops for rail, BRT, and ferry
+and export to GCS.
 
 Turn rail_ferry_brt.ipynb and combine_and_visualize.ipynb
 into scripts.
@@ -19,7 +16,7 @@
 from calitp_data_analysis import utils
 from segment_speed_utils import helpers
 from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS
-from update_vars import GCS_FILE_PATH, analysis_date, TEMP_GCS
+from update_vars import GCS_FILE_PATH, analysis_date
 
 catalog = intake.open_catalog("*.yml")
@@ -81,224 +78,159 @@
     '13805', '141012',
 ]
 
-def filter_trips_to_route_type(
-    analysis_date: str,
-    route_types: list
-) -> pd.DataFrame:
+def assemble_stops(analysis_date: str) -> gpd.GeoDataFrame:
     """
-    Can use route_type_* from stops table, but since BRT needs to start
-    from trips, might as well just get it from trips.
+    Start with stop_times, attach stop geometry,
+    and also route info (route_type) from trips table.
     """
-    
-    trips = helpers.import_scheduled_trips(
+    stop_times = helpers.import_scheduled_stop_times(
         analysis_date,
-        columns = ["feed_key", "name", "trip_id",
-                   "route_id", "route_type", "route_desc"],
+        columns = ["feed_key", "schedule_gtfs_dataset_key",
+                   "stop_id", "trip_instance_key"],
+        with_direction = True,
+        get_pandas = True
     )
 
-    if isinstance(route_types, list):
-        trips_subset = trips[trips.route_type.isin(route_types)]
-
-    elif route_types == "brt":
-        trips_subset = filter_to_brt_trips(trips)
-
-    trips_subset = (trips_subset
-                    .drop(columns = "route_desc")
-                    .drop_duplicates()
-                    .reset_index(drop=True)
-                   )
-
-    return trips_subset
-
-
-def filter_to_brt_trips(trips: pd.DataFrame) -> pd.DataFrame:
-    """
-    Start with trips table and filter to specific routes that
-    are BRT
-    """
-    BRT_ROUTE_FILTERING = {
-        "Bay Area 511 AC Transit Schedule": {"route_id": ac_transit_route_id},
-        "LA Metro Bus Schedule": {"route_desc": metro_route_desc},
-        "Bay Area 511 Muni Schedule": {"route_id": muni_route_id},
-        # Omni BRT -- too infrequent!
-        #"OmniTrans Schedule": {"route_short_name": ["sbX"]}
-    }
-
-    all_brt_trips = pd.DataFrame()
-
-    for name, filtering_cond in BRT_ROUTE_FILTERING.items():
-        for col, filtering_list in filtering_cond.items():
-            trips_subset = trips[
-                (trips.name == name) &
-                (trips[col].isin(filtering_list))]
-
-            all_brt_trips = pd.concat([all_brt_trips, trips_subset], axis=0)
-
-    return all_brt_trips
-
-
-def filter_unique_stops_for_trips(
-    analysis_date: str,
-    trip_df: pd.DataFrame
-) -> gpd.GeoDataFrame:
-    """
-    Start with all operators' stop_times, and narrow down to the trip_ids
-    present for the route_type and keep the unique stops.
-
-    Then attach the stop's point geometry.
-    """
-    stop_times = helpers.import_scheduled_stop_times(
+    trips = helpers.import_scheduled_trips(
         analysis_date,
-        with_direction = False,
+        columns = [
+            "name",
+            "trip_instance_key",
+            "route_id", "route_type", "route_desc"
+        ],
         get_pandas = True
     )
-
-    keep_stop_cols = [
-        "feed_key", "name",
-        "stop_id",
-        "route_id", "route_type",
-        # let's keep route_id, since we double check in a notebook
-    ]
-
-    stops_for_trips = pd.merge(
+
+    stops_with_route = pd.merge(
         stop_times,
-        trip_df,
-        on = ["feed_key", "trip_id"],
+        trips,
+        on = "trip_instance_key",
        how = "inner"
-    )[keep_stop_cols].drop_duplicates().reset_index(drop=True)
+    ).drop(
+        columns = "trip_instance_key"
+    ).drop_duplicates().reset_index(drop=True)
 
     # Attach stop geometry
     stops = helpers.import_scheduled_stops(
         analysis_date,
+        columns = ["feed_key", "stop_id", "stop_name", "geometry"],
+        get_pandas = True
     )
 
     stops_with_geom = pd.merge(
         stops,
-        stops_for_trips,
+        stops_with_route,
         on = ["feed_key", "stop_id"],
        how = "inner"
-    )[keep_stop_cols + ["stop_name", "geometry"]]
+    )
 
     return stops_with_geom
-
-def grab_rail_data(analysis_date: str):
+
+def grab_rail_stops(
+    gdf: gpd.GeoDataFrame,
+    route_types: list = ['0', '1', '2']
+) -> gpd.GeoDataFrame:
     """
     Grab all the rail stops.
+    """
+    return gdf[
+        gdf.route_type.isin(route_types)
+    ].reset_index(drop=True).assign(hqta_type = "major_stop_rail")
+
+
+def grab_ferry_stops(
+    gdf: gpd.GeoDataFrame,
+    route_types: list = ['4']
+) -> gpd.GeoDataFrame:
     """
-    rail_route_types = ['0', '1', '2']
-
-    rail_trips = filter_trips_to_route_type(analysis_date, rail_route_types)
-    rail_stops = filter_unique_stops_for_trips(analysis_date, rail_trips)
-
-    utils.geoparquet_gcs_export(
-        rail_stops,
-        TEMP_GCS,
-        "rail_stops"
-    )
-
-
-def grab_brt_data(analysis_date: str):
-    """
-    Grab BRT routes, stops data for certain operators in CA by analysis date.
-    """
-
-    brt_trips = filter_trips_to_route_type(analysis_date, "brt")
-    brt_stops = filter_unique_stops_for_trips(analysis_date, brt_trips)
-
-    utils.geoparquet_gcs_export(
-        brt_stops,
-        TEMP_GCS,
-        "brt_stops"
-    )
+    Grab all the ferry stops.
+    """
+    # only stops without bus service
+    angel_and_alcatraz = ['2483552', '2483550', '43002']
+    return gdf[
+        (gdf.route_type.isin(route_types)) &
+        ~(gdf.stop_id.isin(angel_and_alcatraz))
+    ].reset_index(drop=True).assign(hqta_type = "major_stop_ferry")
+
 
-def additional_brt_filtering_out_stops(
-    df: gpd.GeoDataFrame,
+def grab_brt_stops(
+    gdf: gpd.GeoDataFrame,
+    route_types: list = ["3"]
 ) -> gpd.GeoDataFrame:
     """
-    df: geopandas.GeoDataFrame
-        Input BRT stops data (combined across operators)
+    Start with the stops that has route information
+    and start filtering based on operator name, route_id / route_desc,
+    and stop_ids to include or exclude.
+
+    The stop id lists were manually provided (by Muni) and/or verified by us.
     """
     metro_name = "LA Metro Bus Schedule"
     muni_name = "Bay Area 511 Muni Schedule"
+    ac_transit_name = "Bay Area 511 AC Transit Schedule"
+    # Omni BRT -- too infrequent! "route_short_name": ["sbX"]
 
-    muni = df[df.name == muni_name].query(
-        'stop_id in @muni_brt_include'
-    )
-
-    # For Metro, unable to filter out non-station stops using GTFS, manual list
-    metro = df[df.name == metro_name].query(
-        'stop_id not in @metro_j_exclude')
-
-    muni_metro = pd.concat([muni, metro], axis=0)
+    BRT_ROUTE_FILTERING = {
+        "Bay Area 511 AC Transit Schedule": {"route_id": ac_transit_route_id},
+        "LA Metro Bus Schedule": {"route_desc": metro_route_desc},
+    }
 
-    other_operators = df[~df.name.isin([metro_name, muni_name])]
-
-    brt_df_stops = pd.concat(
-        [muni_metro, other_operators], axis=0
-    ).sort_values(["feed_key", "name"]).reset_index(drop=True)
+    brt_operator_stops = gdf[
+        (gdf.route_type.isin(route_types)) &
+        (gdf.name.isin([metro_name, muni_name, ac_transit_name]))
+    ]
 
-    return brt_df_stops
-
-
-def grab_ferry_data(analysis_date: str):
-    """
-    Grab all the ferry stops.
-    """
-    ferry_route_types = ['4']
-
-    ferry_trips = filter_trips_to_route_type(analysis_date, ferry_route_types)
-    ferry_stops = filter_unique_stops_for_trips(analysis_date, ferry_trips)
-
-    # only stops without bus service
-    angel_and_alcatraz = ['2483552', '2483550', '43002']
+    muni_brt = brt_operator_stops[
+        (brt_operator_stops.name == muni_name) &
+        (brt_operator_stops.route_id.isin(muni_route_id)) &
+        (brt_operator_stops.stop_id.isin(muni_brt_include))
+    ]
 
-    ferry_stops = ferry_stops[
-        ~ferry_stops.stop_id.isin(angel_and_alcatraz)
-    ].reset_index(drop=True)
+    # For Metro, unable to filter out non-station stops using GTFS, manual list
+    metro_brt = brt_operator_stops[
+        (brt_operator_stops.name == metro_name) &
+        (brt_operator_stops.route_desc.isin(metro_route_desc)) &
+        ~(brt_operator_stops.stop_id.isin(metro_j_exclude))
+    ]
 
-    utils.geoparquet_gcs_export(
-        ferry_stops,
-        TEMP_GCS,
-        "ferry_stops"
-    )
-
+    ac_transit_brt = brt_operator_stops[
+        (brt_operator_stops.name == ac_transit_name) &
+        (brt_operator_stops.route_id.isin(ac_transit_route_id))
+    ]
+
+    brt_stops = pd.concat(
+        [muni_brt, metro_brt, ac_transit_brt], axis=0
+    ).reset_index(drop=True).assign(hqta_type = "major_stop_brt")
 
-def clip_to_ca(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
-    """
-    Clip to CA boundaries.
-    """
-    ca = catalog.ca_boundary.read().to_crs(gdf.crs)
+    return brt_stops
 
-    gdf2 = gdf.clip(ca, keep_geom_type = False).reset_index(drop=True)
 
-    return gdf2
-
-
-def get_rail_ferry_brt_extract() -> gpd.GeoDataFrame:
+def compile_rail_ferry_brt_stops(
+    list_of_files: list
+    ) -> gpd.GeoDataFrame:
     """
     Prepare the rail / ferry / BRT stops to be assembled with
     the bus_hqta types and saved into the hqta_points file.
     """
-    df = catalog.rail_brt_ferry_initial.read()
-
-    keep_cols = ["feed_key", "name", "stop_id",
-                 "route_type", "geometry"]
+    df = pd.concat(
+        list_of_files,
+        axis=0, ignore_index=True
    )
 
-    rail_types = ["0", "1", "2"]
-    bus_types = ["3"]
-    ferry_types = ["4"]
+    keep_cols = [
+        "schedule_gtfs_dataset_key", "feed_key",
+        "stop_id", "stop_name",
+        "route_id", "route_type",
+        "hqta_type", "geometry"
+    ]
 
-    df2 = (df[keep_cols].assign(
-        hqta_type = df.route_type.map(
-            lambda x: "major_stop_rail" if x in rail_types
-            else "major_stop_brt" if x in bus_types
-            else "major_stop_ferry" if x in ferry_types
-            else "missing" # add flag to make it easier to check results
-        )
-    ).rename(columns = {"feed_key": "feed_key_primary"})
-     .drop(columns = ["route_type", "name"])
+    df2 = (df[keep_cols]
+           .sort_values(["feed_key", "stop_id"]).reset_index(drop=True)
+           .rename(columns = {
+               "feed_key": "feed_key_primary",
+               "schedule_gtfs_dataset_key": "schedule_gtfs_dataset_key_primary"
+           })
    )
 
     return df2
@@ -317,39 +249,25 @@
 
     start = datetime.datetime.now()
 
-    # Rail
-    grab_rail_data(analysis_date)
-    rail_stops = gpd.read_parquet(f"{TEMP_GCS}rail_stops.parquet")
-
-    # BRT
-    grab_brt_data(analysis_date)
-    brt_stops = gpd.read_parquet(f"{TEMP_GCS}brt_stops.parquet")
-    brt_stops = additional_brt_filtering_out_stops(
-        brt_stops)
+    stops_route_gdf = assemble_stops(analysis_date)
 
-    # Ferry
-    grab_ferry_data(analysis_date)
-    ferry_stops = gpd.read_parquet(f"{TEMP_GCS}ferry_stops.parquet")
-
-    # Concatenate datasets that need to be clipped to CA
-    rail_brt = pd.concat([
-        rail_stops,
-        brt_stops
-    ], axis=0, ignore_index= True).pipe(clip_to_ca)
-
-    # Concatenate all together
-    rail_brt_ferry = pd.concat([
-        rail_brt,
-        ferry_stops
-    ], axis=0, ignore_index=True)
+    rail_stops = grab_rail_stops(stops_route_gdf)
+    ferry_stops = grab_ferry_stops(stops_route_gdf)
+    brt_stops = grab_brt_stops(stops_route_gdf)
+
+    major_transit_stops = compile_rail_ferry_brt_stops(
+        [rail_stops, ferry_stops, brt_stops]
+    )
 
-    # Export to GCS
     utils.geoparquet_gcs_export(
-        rail_brt_ferry,
+        major_transit_stops,
         GCS_FILE_PATH,
         "rail_brt_ferry"
     )
 
     end = datetime.datetime.now()
-    logger.info(f"A1_rail_ferry_brt_stops {analysis_date} "
-                f"execution time: {end - start}")
\ No newline at end of file
+
+    logger.info(
+        f"A1_rail_ferry_brt_stops {analysis_date} "
+        f"execution time: {end - start}"
+    )
\ No newline at end of file
diff --git a/high_quality_transit_areas/check1_downloads.ipynb b/high_quality_transit_areas/check1_downloads.ipynb
index 0abcf7d82..b12fc7687 100644
--- a/high_quality_transit_areas/check1_downloads.ipynb
+++ b/high_quality_transit_areas/check1_downloads.ipynb
@@ -23,7 +23,7 @@
    "from IPython.display import Markdown\n",
    "\n",
    "from segment_speed_utils import helpers\n",
-   "from update_vars import analysis_date, TEMP_GCS\n",
+   "from update_vars import analysis_date, GCS_FILE_PATH\n",
    "\n",
    "# Map arguments\n",
    "TILES = \"Carto DB Positron\""
@@ -47,167 +47,6 @@
    "    display(m)"
   ]
  },
- {
-  "cell_type": "markdown",
-  "id": "c27ae680-8516-46f9-98be-92ce69a20007",
-  "metadata": {},
-  "source": [
-   "## After `A1_download_rail_ferry_brt`\n",
-   "\n",
-   "* There are some stops to remove. \n",
-   "* Once finalized, can run `A2_combine_stops`"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "22f1a4b9-6846-4d48-a4ba-70fa4006f155",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "import A1_rail_ferry_brt_stops as rail_ferry_brt"
-  ]
- },
- {
-  "cell_type": "markdown",
-  "id": "99d703a2-80fd-4ade-bc45-b6d36113892c",
-  "metadata": {},
-  "source": [
-   "### LA Metro (182)"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "a5cf0189-b403-48b0-a0ab-12d3ae2ea91f",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "brt_stops = gpd.read_parquet(f\"{TEMP_GCS}brt_stops.parquet\")"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "9bd567f3-f31d-4455-b4d3-b18369b40b0c",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "brt_stops_filtered = rail_ferry_brt.additional_brt_filtering_out_stops(\n",
-   "    brt_stops\n",
-   ")"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "eafb7cf4-f20e-4409-a55d-eac9c9fb9b21",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "name = \"LA Metro Bus Schedule\"\n",
-   "make_map(brt_stops[brt_stops.name==name], \"route_id\")"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "7948a5cc-d0fa-4979-9747-473912fc4e55",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "make_map(brt_stops_filtered[brt_stops_filtered.name==name], \n",
-   "         \"route_id\")"
-  ]
- },
- {
-  "cell_type": "markdown",
-  "id": "1c53ad6b-baf6-4b92-9065-e3d91ac91d93",
-  "metadata": {},
-  "source": [
-   "### SF Muni (282)"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "1a3829f0-62cf-4383-894d-f01188638157",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "name = \"Bay Area 511 Muni Schedule\"\n",
-   "make_map(brt_stops[brt_stops.name==name], \"route_id\")"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "d4b93ddd-1b85-4cbe-a2a1-c894db5d4017",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "make_map(brt_stops_filtered[brt_stops_filtered.name==name], \n",
-   "         \"route_id\")"
-  ]
- },
- {
-  "cell_type": "markdown",
-  "id": "635410b7-55e6-4025-a30c-539dee55a7d6",
-  "metadata": {},
-  "source": [
-   "### AC Transit (4)"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "7a88fb7d-bda2-4b79-9fbf-d39e188f8352",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "name = \"Bay Area 511 AC Transit Schedule\"\n",
-   "make_map(brt_stops[brt_stops.name==name], \"route_id\")"
-  ]
- },
- {
-  "cell_type": "markdown",
-  "id": "c0ca6814-991a-4689-9de8-c0a1911bc9e2",
-  "metadata": {},
-  "source": [
-   "## After `A3_rail_ferry_brt_extract`\n",
-   "\n",
-   "#### BRT Service likely meeting [PRC 21060.2](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PRC&sectionNum=21060.2.&highlight=true&keyword=bus%20rapid%20transit) definition:\n",
-   "\n",
-   "* LA Metro Orange, Silver excluding street running (stop flags only)\n",
-   "* ~~Omnitrans sbX, all stops (curbside stations are well-defined, with fare prepayment)~~\n",
-   "    * insufficient frequency 5/16\n",
-   "* AC Transit Tempo, all stops (curbside stations are well-defined, with fare prepayment)"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "4ddca815-b484-45eb-bdaf-5f30236038d3",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "stops = rail_ferry_brt.get_rail_ferry_brt_extract()"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "0f828224-b88b-4505-95af-a99634610ed2",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "operators = helpers.import_scheduled_trips(\n",
-   "    analysis_date,\n",
-   "    columns = [\"feed_key\", \"name\"],\n",
-   "    get_pandas = True\n",
-   ").rename(columns = {\"feed_key\": \"feed_key_primary\"})"
-  ]
- },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
+   "stops = gpd.read_parquet(\n",
+   "    f\"{GCS_FILE_PATH}rail_brt_ferry.parquet\"\n",
+   ")\n",
+   "\n",
    "hqta_types = list(stops.hqta_type.unique())"
   ]
  },
   "outputs": [],
   "source": [
    "for i in hqta_types:\n",
-   "    subset = stops[stops.hqta_type==i].merge(\n",
-   "        operators,\n",
-   "        on = \"feed_key_primary\"\n",
-   "    )\n",
-   "    \n",
    "    display(Markdown(f\"### HQTA Type: {i}\"))\n",
    "    \n",
-   "    make_map(subset, \"name\")"
+   "    make_map(stops[stops.hqta_type==i], \"route_id\")"
   ]
  },
 {

From c55a6ba55832ab939b4d801f3d9ec5d98f7cfc1d Mon Sep 17 00:00:00 2001
From: tiffanychu90
Date: Wed, 25 Sep 2024 17:33:00 +0000
Subject: [PATCH 07/11] (refactor): use schedule_gtfs_dataset_key instead of
 feed_key, simplify functions, separate out where assumptions might change

---
 .../A1_rail_ferry_brt_stops.py | 19 +-
 .../B1_create_hqta_segments.py | 5 +-
 .../B2_sjoin_stops_to_segments.py | 46 ++-
 .../C1_prep_pairwise_intersections.py | 10 +-
 .../C2_get_intersections.py | 15 +-
 .../C3_create_bus_hqta_types.py | 84 ++---
 .../D1_assemble_hqta_points.py | 293 +++++++-----------
 .../D2_assemble_hqta_polygons.py | 124 ++++----
 high_quality_transit_areas/catalog.yml | 10 +-
 .../logs/hqta_processing.log | 15 +
 high_quality_transit_areas/update_vars.py | 5 +-
 11 files changed, 299 insertions(+), 327 deletions(-)

diff --git a/high_quality_transit_areas/A1_rail_ferry_brt_stops.py b/high_quality_transit_areas/A1_rail_ferry_brt_stops.py
index a35aa0524..7dcb87eef 100644
--- a/high_quality_transit_areas/A1_rail_ferry_brt_stops.py
+++ b/high_quality_transit_areas/A1_rail_ferry_brt_stops.py
@@ -13,6 +13,7 @@
 
 from loguru import logger
 
+import _utils
 from calitp_data_analysis import utils
 from segment_speed_utils import helpers
 from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS
 from update_vars import GCS_FILE_PATH, analysis_date
@@ -219,19 +220,16 @@
     )
 
     keep_cols = [
-        "schedule_gtfs_dataset_key", "feed_key",
+        "schedule_gtfs_dataset_key",
         "stop_id", "stop_name",
         "route_id", "route_type",
         "hqta_type", "geometry"
     ]
 
     df2 = (df[keep_cols]
-           .sort_values(["feed_key", "stop_id"]).reset_index(drop=True)
-           .rename(columns = {
-               "feed_key": "feed_key_primary",
-               "schedule_gtfs_dataset_key": "schedule_gtfs_dataset_key_primary"
-           })
-          )
+           .sort_values(["schedule_gtfs_dataset_key", "stop_id"]).reset_index(drop=True)
+           .pipe(_utils.primary_rename)
+          )
 
     return df2
@@ -249,6 +249,13 @@
 
     stops_route_gdf = assemble_stops(analysis_date)
 
+    # let's save this to use as a crosswalk to fill in info
+    utils.geoparquet_gcs_export(
+        stops_route_gdf,
+        GCS_FILE_PATH,
+        "stops_to_route"
+    )
+
     rail_stops = grab_rail_stops(stops_route_gdf)
     ferry_stops = grab_ferry_stops(stops_route_gdf)
     brt_stops = grab_brt_stops(stops_route_gdf)
diff --git a/high_quality_transit_areas/B1_create_hqta_segments.py b/high_quality_transit_areas/B1_create_hqta_segments.py
index b446f4133..552573499 100644
--- a/high_quality_transit_areas/B1_create_hqta_segments.py
+++ b/high_quality_transit_areas/B1_create_hqta_segments.py
@@ -114,8 +114,7 @@ def select_shapes_and_segment(
     ).query(
         'shape_array_key not in @outside_amtrak_shapes'
     ).drop(
-        columns = ["schedule_gtfs_dataset_key",
-                   "shape_array_key", "route_length"]
+        columns = ["feed_key", "shape_array_key", "route_length"]
["feed_key", "shape_array_key", "route_length"] ).fillna({"direction_id": 0}).astype({"direction_id": "int"}) routes_both_dir = (gdf.route_key @@ -147,7 +146,7 @@ def select_shapes_and_segment( segment_col = "segment_geometry" ) - route_cols = ["feed_key", "route_id", "route_key"] + route_cols = ["schedule_gtfs_dataset_key", "route_id", "route_key"] # Attach other route info hqta_segments = pd.merge( diff --git a/high_quality_transit_areas/B2_sjoin_stops_to_segments.py b/high_quality_transit_areas/B2_sjoin_stops_to_segments.py index 8a61cb7b8..085e170c4 100644 --- a/high_quality_transit_areas/B2_sjoin_stops_to_segments.py +++ b/high_quality_transit_areas/B2_sjoin_stops_to_segments.py @@ -7,7 +7,6 @@ """ import datetime import geopandas as gpd -import numpy as np import pandas as pd import sys @@ -15,36 +14,49 @@ from calitp_data_analysis import utils from segment_speed_utils import helpers, gtfs_schedule_wrangling -from update_vars import GCS_FILE_PATH, analysis_date, PROJECT_CRS +from update_vars import GCS_FILE_PATH, analysis_date, PROJECT_CRS, SEGMENT_BUFFER_METERS -def max_trips_by_group(df: pd.DataFrame, - group_cols: list, - max_col: str = "n_trips" - ) -> pd.DataFrame: +def max_trips_by_group( + df: pd.DataFrame, + group_cols: list, + max_col: str = "n_trips" +) -> pd.DataFrame: """ Find the max trips, by stop_id or by hqta_segment_id. Put in a list of group_cols to find the max. Can also subset for AM or PM by df[df.departure_hour < 12] """ df2 = (df.groupby(group_cols) - .agg({max_col: np.max}) + .agg({max_col: "max"}) .reset_index() ) return df2 -def stop_times_aggregation_max_by_stop(stop_times: pd.DataFrame) -> pd.DataFrame: +def stop_times_aggregation_max_by_stop( + stop_times: pd.DataFrame, + analysis_date: str +) -> pd.DataFrame: """ Take the stop_times table and group by stop_id-departure hour and count how many trips occur. """ - stop_cols = ["feed_key", "stop_id"] + stop_cols = ["schedule_gtfs_dataset_key", "stop_id"] + gtfs_key = helpers.import_scheduled_trips( + analysis_date, + columns = ["feed_key", "gtfs_dataset_key"], + get_pandas = True + ) + stop_times = stop_times.assign( departure_hour = pd.to_datetime( stop_times.departure_sec, unit="s").dt.hour + ).merge( + gtfs_key, + on = "feed_key" ) # Aggregate how many trips are made at that stop by departure hour @@ -124,7 +136,7 @@ def hqta_segment_keep_one_stop( Returns gdf where each segment only appears once. 
""" - stop_cols = ["feed_key", "stop_id"] + stop_cols = ["schedule_gtfs_dataset_key", "stop_id"] segment_to_stop_times = pd.merge( hqta_segments, @@ -168,7 +180,7 @@ def sjoin_stops_and_stop_times_to_hqta_segments( hqta_segments: gpd.GeoDataFrame, stops: gpd.GeoDataFrame, stop_times: pd.DataFrame, - buffer_size: int = 50, + buffer_size: int, hq_transit_threshold: int = 4, ) -> gpd.GeoDataFrame: """ @@ -200,7 +212,6 @@ def sjoin_stops_and_stop_times_to_hqta_segments( (x.pm_max_trips >= hq_transit_threshold)) else False, axis=1) ).drop(columns = drop_cols) - return segment_hq_corr @@ -221,8 +232,8 @@ def sjoin_stops_and_stop_times_to_hqta_segments( # takes 1 min max_arrivals_by_stop = helpers.import_scheduled_stop_times( analysis_date, - get_pandas = True - ).pipe(stop_times_aggregation_max_by_stop) + get_pandas = True, + ).pipe(stop_times_aggregation_max_by_stop, analysis_date) max_arrivals_by_stop.to_parquet( f"{GCS_FILE_PATH}max_arrivals_by_stop.parquet") @@ -242,7 +253,7 @@ def sjoin_stops_and_stop_times_to_hqta_segments( hqta_segments, stops, max_arrivals_by_stop, - buffer_size = 50, #50meters + buffer_size = SEGMENT_BUFFER_METERS, #50meters hq_transit_threshold = 4 ) @@ -253,7 +264,8 @@ def sjoin_stops_and_stop_times_to_hqta_segments( ) end = datetime.datetime.now() - logger.info(f"B2_sjoin_stops_to_segments {analysis_date} " - f"execution time: {end - start}") + logger.info( + f"B2_sjoin_stops_to_segments {analysis_date} " + f"execution time: {end - start}") #client.close() diff --git a/high_quality_transit_areas/C1_prep_pairwise_intersections.py b/high_quality_transit_areas/C1_prep_pairwise_intersections.py index 9e708b596..12d9ccb4f 100644 --- a/high_quality_transit_areas/C1_prep_pairwise_intersections.py +++ b/high_quality_transit_areas/C1_prep_pairwise_intersections.py @@ -35,8 +35,6 @@ def prep_bus_corridors(is_hq_corr: bool) -> gpd.GeoDataFrame: return bus_hqtc - - def sjoin_against_other_operators( in_group_df: gpd.GeoDataFrame, out_group_df: gpd.GeoDataFrame @@ -105,7 +103,7 @@ def pairwise_intersections( (corridors_gdf.hqta_segment_id.isin(segments_p2))] .drop_duplicates() .sort_values( - ["feed_key", "route_id", "hqta_segment_id"], + ["schedule_gtfs_dataset_key", "route_id", "hqta_segment_id"], ascending = [True, True, True]) .reset_index(drop=True) ) @@ -139,7 +137,9 @@ def pairwise_intersections( pairwise_intersections(corridors) end = datetime.datetime.now() - logger.info(f"C1_prep_pairwise_intersections {analysis_date} " - f"execution time: {end - start}") + logger.info( + f"C1_prep_pairwise_intersections {analysis_date} " + f"execution time: {end - start}" + ) #client.close() \ No newline at end of file diff --git a/high_quality_transit_areas/C2_get_intersections.py b/high_quality_transit_areas/C2_get_intersections.py index 30dbcb834..85ed56c20 100644 --- a/high_quality_transit_areas/C2_get_intersections.py +++ b/high_quality_transit_areas/C2_get_intersections.py @@ -12,7 +12,6 @@ import datetime import geopandas as gpd import intake -import os import pandas as pd import sys @@ -39,10 +38,10 @@ def attach_geometry_to_pairs( } - col_order = ["feed_key"] + segment_cols + list(rename_cols.values()) + col_order = ["schedule_gtfs_dataset_key"] + segment_cols + list(rename_cols.values()) pairs_with_geom1 = pd.merge( - corridors[["feed_key"] + segment_cols], + corridors[["schedule_gtfs_dataset_key"] + segment_cols], intersecting_pairs, on = "hqta_segment_id", how = "inner" @@ -57,7 +56,7 @@ def attach_geometry_to_pairs( ) gdf = (pairs_with_geom2.reindex(columns = col_order) - 
.sort_values(["feed_key", "hqta_segment_id", + .sort_values(["schedule_gtfs_dataset_key", "hqta_segment_id", "intersect_hqta_segment_id"]) .reset_index(drop=True) ) @@ -90,7 +89,7 @@ def find_intersections(pairs_table: gpd.GeoDataFrame) -> gpd.GeoDataFrame: # Concatenate and add this column to pairs_table, join by index gdf = pd.concat([ results_df, - pairs_table[["feed_key", "hqta_segment_id"]], + pairs_table[["schedule_gtfs_dataset_key", "hqta_segment_id"]], ], axis=1) return gdf @@ -122,7 +121,9 @@ def find_intersections(pairs_table: gpd.GeoDataFrame) -> gpd.GeoDataFrame: ) end = datetime.datetime.now() - logger.info(f"C2_find_intersections {analysis_date} " - f"execution time: {end - start}") + logger.info( + f"C2_find_intersections {analysis_date} " + f"execution time: {end - start}" + ) #client.close() \ No newline at end of file diff --git a/high_quality_transit_areas/C3_create_bus_hqta_types.py b/high_quality_transit_areas/C3_create_bus_hqta_types.py index bed753ffb..1f0574e41 100644 --- a/high_quality_transit_areas/C3_create_bus_hqta_types.py +++ b/high_quality_transit_areas/C3_create_bus_hqta_types.py @@ -16,15 +16,15 @@ from loguru import logger +import _utils import C1_prep_pairwise_intersections as prep_clip from calitp_data_analysis import utils from segment_speed_utils import helpers from update_vars import (GCS_FILE_PATH, analysis_date, - PROJECT_CRS, BUFFER_METERS + PROJECT_CRS, SEGMENT_BUFFER_METERS ) - def buffer_around_intersections(buffer_size: int) -> gpd.GeoDataFrame: """ Draw 50 m buffers around intersections to better catch stops @@ -55,28 +55,29 @@ def create_major_stop_bus( """ # Narrow down all stops to only include stops from operators # that also have some bus corridor intersection result - included_operators = bus_intersections.feed_key.unique() - major_stops = all_stops[all_stops.feed_key.isin(included_operators)] + included_operators = bus_intersections.schedule_gtfs_dataset_key.unique() + major_stops = all_stops[ + all_stops.schedule_gtfs_dataset_key.isin(included_operators) + ] major_bus_stops_in_intersections = ( gpd.sjoin( major_stops, - bus_intersections[["feed_key", "geometry"]], + bus_intersections[["schedule_gtfs_dataset_key", "geometry"]], how = "inner", - predicate = "within" - ).drop(columns = "index_right") - .drop_duplicates( - subset=["feed_key_left", "stop_id", "feed_key_right"]) + predicate = "within", + lsuffix="primary", rsuffix="secondary" + ).drop_duplicates( + subset=[ + "schedule_gtfs_dataset_key_primary", "stop_id", + "schedule_gtfs_dataset_key_secondary"]) ).reset_index(drop=True) stops_in_intersection = ( major_bus_stops_in_intersections.assign( hqta_type = "major_stop_bus", - ).rename(columns = - {"feed_key_left": "feed_key_primary", - "feed_key_right": "feed_key_secondary", - }) - [["feed_key_primary", "feed_key_secondary", + )[["schedule_gtfs_dataset_key_primary", + "schedule_gtfs_dataset_key_secondary", "stop_id", "geometry", "hqta_type"]] ) @@ -96,23 +97,24 @@ def create_stops_along_corridors(all_stops: gpd.GeoDataFrame) -> gpd.GeoDataFram [["hqta_segment_id", "geometry"]] ) - stop_cols = ["feed_key", "stop_id"] - - stops_in_hq_corr = (gpd.sjoin( - all_stops, - bus_corridors[["geometry"]], - how = "inner", - predicate = "intersects" - ).drop(columns = "index_right") - .drop_duplicates(subset=stop_cols) - .reset_index(drop=True) - ) - - stops_in_hq_corr2 = (stops_in_hq_corr.assign( - hqta_type = "hq_corridor_bus", - )[stop_cols + ["hqta_type", "geometry"]] - .rename(columns = {"feed_key": "feed_key_primary"}) - ) + stop_cols = 
["schedule_gtfs_dataset_key", "stop_id"] + + stops_in_hq_corr = ( + gpd.sjoin( + all_stops, + bus_corridors[["geometry"]], + how = "inner", + predicate = "intersects" + ).drop_duplicates(subset=stop_cols) + .reset_index(drop=True) + ) + + stops_in_hq_corr2 = ( + stops_in_hq_corr.assign( + hqta_type = "hq_corridor_bus", + )[stop_cols + ["hqta_type", "geometry"]] + .pipe(_utils.primary_rename) + ) return stops_in_hq_corr2 @@ -131,14 +133,24 @@ def create_stops_along_corridors(all_stops: gpd.GeoDataFrame) -> gpd.GeoDataFram # Start with the gdf of all the hqta_segments # that have a sjoin with an orthogonal route - bus_intersections = buffer_around_intersections(BUFFER_METERS) + bus_intersections = buffer_around_intersections(SEGMENT_BUFFER_METERS) # Grab point geom with all stops + gtfs_keys = helpers.import_scheduled_trips( + analysis_date, + columns = ["feed_key", "gtfs_dataset_key"], + get_pandas=True + ) + all_stops = helpers.import_scheduled_stops( analysis_date, get_pandas = True, + columns = ["feed_key", "stop_id", "geometry"], crs = PROJECT_CRS - ) + ).merge( + gtfs_keys, + on = "feed_key", + ).drop(columns = "feed_key") # Create hqta_type == major_stop_bus major_stop_bus = create_major_stop_bus(all_stops, bus_intersections) @@ -160,7 +172,9 @@ def create_stops_along_corridors(all_stops: gpd.GeoDataFrame) -> gpd.GeoDataFram ) end = datetime.datetime.now() - logger.info(f"C3_create_bus_hqta_types {analysis_date} " - f"execution time: {end - start}") + logger.info( + f"C3_create_bus_hqta_types {analysis_date} " + f"execution time: {end - start}" + ) #client.close() \ No newline at end of file diff --git a/high_quality_transit_areas/D1_assemble_hqta_points.py b/high_quality_transit_areas/D1_assemble_hqta_points.py index 4226bedb5..4441b38c7 100644 --- a/high_quality_transit_areas/D1_assemble_hqta_points.py +++ b/high_quality_transit_areas/D1_assemble_hqta_points.py @@ -17,198 +17,123 @@ from loguru import logger -from A1_rail_ferry_brt_stops import clip_to_ca, get_rail_ferry_brt_extract +import _utils from calitp_data_analysis import geography_utils, utils from segment_speed_utils import helpers from shared_utils import gtfs_utils_v2 -from update_vars import analysis_date, GCS_FILE_PATH, PROJECT_CRS +from update_vars import analysis_date, GCS_FILE_PATH, PROJECT_CRS, EXPORT_PATH catalog = intake.open_catalog("*.yml") -EXPORT_PATH = f"{GCS_FILE_PATH}export/{analysis_date}/" -def hqta_details(row) -> str: +def combine_stops_by_hq_types(crs: str) -> gpd.GeoDataFrame: """ - Add HQTA details of why nulls are present - based on feedback from open data users. - """ - if row.hqta_type == "major_stop_bus": - if row.feed_key_primary != row.feed_key_secondary: - return "intersection_2_bus_routes_different_operators" - else: - return "intersection_2_bus_routes_same_operator" - elif row.hqta_type == "hq_corridor_bus": - return "stop_along_hq_bus_corridor_single_operator" - elif row.hqta_type in ["major_stop_ferry", - "major_stop_brt", "major_stop_rail"]: - # (not sure if ferry, brt, rail, primary/secondary ids are filled in.) - return row.hqta_type + "_single_operator" - - -def merge_in_max_arrivals_by_stop( - hqta_points: gpd.GeoDataFrame, - max_arrivals: pd.DataFrame -) -> gpd.GeoDataFrame: - """ - Merge combined hqta points across all categories with + Concatenate combined hqta points across all categories then merge in the maximum arrivals for each stop (keep if it shows up in hqta_points) with left merge. 
- """ - with_stops = pd.merge( - hqta_points, - max_arrivals.rename(columns = {"feed_key": "feed_key_primary"}), - on = ["feed_key_primary", "stop_id"], - how = "left" - ) + """ + rail_ferry_brt = catalog.rail_brt_ferry_stops.read().to_crs( + crs) + major_stop_bus = catalog.major_stop_bus.read().to_crs(crs) + stops_in_corridor = catalog.stops_in_hq_corr.read().to_crs(crs) - # Combine AM max and PM max into 1 column trip_count_cols = ["am_max_trips", "pm_max_trips"] - - with_stops2 = with_stops.assign( - peak_trips = (with_stops[trip_count_cols] - .min(axis=1) - .fillna(0).astype(int)) + + max_arrivals = pd.read_parquet( + f"{GCS_FILE_PATH}max_arrivals_by_stop.parquet", + columns = ["schedule_gtfs_dataset_key", + "stop_id"] + trip_count_cols + ).pipe(_utils.primary_rename) + + # Combine AM max and PM max into 1 column + # if am_max_trips = 4 and pm_max_trips = 5, we'll choose 4. + max_arrivals = max_arrivals.assign( + peak_trips = max_arrivals[trip_count_cols].min(axis=1) ).drop(columns = trip_count_cols) - return with_stops2 - + hqta_points_combined = pd.concat([ + major_stop_bus, + stops_in_corridor, + rail_ferry_brt, + ], axis=0) -def add_route_info(hqta_points: gpd.GeoDataFrame) -> gpd.GeoDataFrame: - """ - Use feed_key-stop_id to add route_id back in, - using the trips and stop_times table. - """ - stop_times = helpers.import_scheduled_stop_times( - analysis_date, - columns = ["feed_key", "stop_id", "trip_id"], - get_pandas = True, - with_direction = False - ) - - trips = helpers.import_scheduled_trips( - analysis_date, - columns = ["feed_key", "gtfs_dataset_key", "trip_id", "route_id"], - get_pandas = True - ) + # Merge in max arrivals + with_stops = pd.merge( + hqta_points_combined, + max_arrivals, + on = ["schedule_gtfs_dataset_key_primary", "stop_id"], + how = "left" + ).fillna({"peak_trips": 0}).astype({"peak_trips": "int"}) - stop_cols = ["feed_key", "stop_id"] - trip_cols = ["feed_key", "trip_id"] + keep_stop_cols = [ + "schedule_gtfs_dataset_key_primary", "schedule_gtfs_dataset_key_secondary", + "stop_id", "geometry", + "hqta_type", "peak_trips", "hqta_details" + ] - one_trip = (stop_times[stop_cols + ["trip_id"]] - .drop_duplicates(subset=stop_cols) - .reset_index(drop=True) - ) + with_stops = with_stops.assign( + hqta_details = with_stops.apply(_utils.add_hqta_details, axis=1) + )[keep_stop_cols] - with_route_info = pd.merge( - one_trip, - trips[trip_cols + [ - "schedule_gtfs_dataset_key", "route_id" - ]].drop_duplicates(), - on = trip_cols, - how = "inner", - validate = "m:1" # one_trip has many stops for that trip - ).rename(columns = {"feed_key": "feed_key_primary"}) + return with_stops - hqta_points_with_route = pd.merge( - hqta_points, - with_route_info, - on = ["feed_key_primary", "stop_id"], - how = "inner", - validate = "m:1" - ).drop(columns = "trip_id") - - # Clip to CA -- remove ferry or else we're losing it in the clip - not_ferry = hqta_points_with_route[ - hqta_points_with_route.hqta_type != "major_stop_ferry" - ].pipe(clip_to_ca) - - is_ferry = hqta_points_with_route[ - hqta_points_with_route.hqta_type == "major_stop_ferry"] - - ca_hqta_points = pd.concat( - [not_ferry, is_ferry], axis=0 - ).reset_index(drop=True) - - return ca_hqta_points - -def get_agency_info(df: pd.DataFrame, date: str) -> pd.DataFrame: +def get_agency_crosswalk(analysis_date: str) -> pd.DataFrame: """ - HQTA analysis uses feed_key to link across schedule tables. - But, from trips table, we have schedule_gtfs_dataset_key, - and we can use that to join to our saved crosswalk. 
+ Import crosswalk for changing schedule_gtfs_dataset_key to + organization_name/source_record_id """ - crosswalk = helpers.import_schedule_gtfs_key_organization_crosswalk( - date, + agency_info = helpers.import_schedule_gtfs_key_organization_crosswalk( + analysis_date, columns = [ "schedule_gtfs_dataset_key", "organization_name", "organization_source_record_id", "base64_url"] - ).rename(columns = { + ).rename(columns = { "organization_name": "agency", "organization_source_record_id": "org_id" - })[["schedule_gtfs_dataset_key", - "agency", "org_id", "base64_url"]] - - return crosswalk + }) + return agency_info -def add_agency_names_hqta_details( + +def add_route_agency_info( gdf: gpd.GeoDataFrame, analysis_date: str -) -> gpd.GeoDataFrame: +) -> gpd.GeoDataFrame : """ - Add agency names by merging it in with our crosswalk - to get the primary feed_key and primary agency name. + Make sure route info is filled in for all stops. - Then use a function to add secondary feed_key and secondary agency name - and hqta_details column. - hqta_details makes it clearer for open data portal users why - some ID / agency name columns show the same info or are missing. + Add agency names by merging it in with our crosswalk + and populate primary and secondary operator information. """ - feeds_df = gdf.rename( - columns = {"feed_key_primary": "feed_key"})[ - ["feed_key", "schedule_gtfs_dataset_key"] - ].drop_duplicates() + stop_with_route_crosswalk = catalog.stops_info_crosswalk.read() - crosswalk = get_agency_info(feeds_df, analysis_date) + agency_info = get_agency_crosswalk(analysis_date) - agency_info = pd.merge( - feeds_df, - crosswalk, - on = "schedule_gtfs_dataset_key", - how = "inner" - ).drop(columns = "schedule_gtfs_dataset_key") - - # Merge in organization ids for feed_key_primary - # and feed_key_secondary + # Make sure all the stops have route_id gdf2 = pd.merge( gdf, + stop_with_route_crosswalk[ + ["schedule_gtfs_dataset_key", + "stop_id", "route_id"]].drop_duplicates().pipe(_utils.primary_rename), + on = ["schedule_gtfs_dataset_key_primary", "stop_id"], + how = "inner" + ) + + # Make sure gtfs_dataset_name and organization columns are added + gdf3 = pd.merge( + gdf2, agency_info.add_suffix("_primary"), - on = "feed_key_primary", + on = "schedule_gtfs_dataset_key_primary", how = "inner" ).merge( agency_info.add_suffix("_secondary"), - on = "feed_key_secondary", + on = "schedule_gtfs_dataset_key_secondary", how = "left" # left bc we don't want to drop rows that have secondary operator ) - - gdf2 = gdf2.assign( - hqta_details = gdf2.apply(hqta_details, axis=1), - ) - - # Additional clarification of hq_corridor_bus, - # only for hqta_stops, not hqta_polygons - gdf2["hqta_details"] = gdf2.apply( - lambda x: "corridor_frequent_stop" if ( - (x.hqta_type == "hq_corridor_bus") and - (x.peak_trips >= 4) - ) else "corridor_other_stop" if ( - (x.hqta_type == "hq_corridor_bus") and - (x.peak_trips < 4) - ) else x.hqta_details, axis = 1) - return gdf2 + return gdf3 def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: @@ -217,6 +142,19 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: Subset to columns, drop duplicates, sort for readability, always project into WGS84. 
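+    Also clip to CA, keeping ferry stops out of the clip so they
+    are not dropped.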
""" + # Clip to CA -- remove ferry or else we're losing it in the clip + not_ferry = gdf[ + gdf.hqta_type != "major_stop_ferry" + ].pipe(_utils.clip_to_ca) + + is_ferry = gdf[ + gdf.hqta_type == "major_stop_ferry" + ] + + gdf2 = pd.concat( + [not_ferry, is_ferry], axis=0 + ).reset_index(drop=True) + public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys() keep_cols = [ @@ -230,9 +168,10 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: "geometry" ] - gdf2 = ( - gdf[gdf.schedule_gtfs_dataset_key.isin(public_feeds)] - .reindex(columns = keep_cols) + gdf3 = ( + gdf2[ + (gdf2.schedule_gtfs_dataset_key_primary.isin(public_feeds)) + ].reindex(columns = keep_cols) .drop_duplicates( subset=["agency_primary", "hqta_type", "stop_id", "route_id"]) .sort_values(["agency_primary", "hqta_type", "stop_id"]) @@ -240,8 +179,8 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: .to_crs(geography_utils.WGS84) ) - return gdf2 - + return gdf3 + if __name__=="__main__": @@ -251,42 +190,15 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: level="INFO") start = datetime.datetime.now() - - rail_ferry_brt = get_rail_ferry_brt_extract().to_crs( - PROJECT_CRS) - major_stop_bus = catalog.major_stop_bus.read().to_crs(PROJECT_CRS) - stops_in_corridor = catalog.stops_in_hq_corr.read().to_crs(PROJECT_CRS) - max_arrivals_by_stop = pd.read_parquet( - f"{GCS_FILE_PATH}max_arrivals_by_stop.parquet", - columns = ["feed_key", "stop_id", "am_max_trips", "pm_max_trips"] - ).rename(columns = {"feed_key": "feed_key_primary"}) - - # Combine all the points data - hqta_points_combined = pd.concat([ - major_stop_bus, - stops_in_corridor, - # add name at once, rail/ferry/brt is only one with it... - # but we used it to double check downloads were correct - rail_ferry_brt, - ], axis=0) - - hqta_points_combined2 = merge_in_max_arrivals_by_stop( - hqta_points_combined, max_arrivals_by_stop) - - # Add in route_id - hqta_points_with_route_info = add_route_info(hqta_points_combined2) - - # Add agency names, hqta_details, project back to WGS84 - gdf = add_agency_names_hqta_details( - hqta_points_with_route_info, analysis_date - ) - - cols = [i for i in gdf.columns if "_primary" in i or "_secondary" in i] - gdf[cols].drop_duplicates().reset_index(drop=True).to_parquet( - f"{GCS_FILE_PATH}feed_key_org_crosswalk.parquet" - ) - - gdf = final_processing(gdf) + + # Combine all the points data and merge in max_arrivals + hqta_points_combined = combine_stops_by_hq_types(crs=PROJECT_CRS) + + # Add in route_id and agency info + hqta_points_with_info = add_route_agency_info( + hqta_points_combined, analysis_date) + + gdf = final_processing(hqta_points_with_info) # Export to GCS # Stash this date's into its own folder @@ -304,5 +216,8 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: ) end = datetime.datetime.now() - logger.info(f"D1_assemble_hqta_points {analysis_date} " - f"execution time: {end - start}") \ No newline at end of file + logger.info( + f"D1_assemble_hqta_points {analysis_date} " + f"execution time: {end - start}" + ) + \ No newline at end of file diff --git a/high_quality_transit_areas/D2_assemble_hqta_polygons.py b/high_quality_transit_areas/D2_assemble_hqta_polygons.py index 1270c630a..7f0328b7e 100644 --- a/high_quality_transit_areas/D2_assemble_hqta_polygons.py +++ b/high_quality_transit_areas/D2_assemble_hqta_polygons.py @@ -11,72 +11,85 @@ from loguru import logger -import C1_prep_pairwise_intersections as prep_clip -import D1_assemble_hqta_points 
as assemble_hqta_points
+import _utils
+from C1_prep_pairwise_intersections import prep_bus_corridors
+from D1_assemble_hqta_points import get_agency_crosswalk
 from calitp_data_analysis import utils, geography_utils
-from D1_assemble_hqta_points import EXPORT_PATH, add_route_info
-from update_vars import GCS_FILE_PATH, analysis_date, PROJECT_CRS
+from update_vars import (GCS_FILE_PATH, analysis_date, PROJECT_CRS, EXPORT_PATH,
+                         HALF_MILE_BUFFER_METERS, CORRIDOR_BUFFER_METERS
+                        )
 
 catalog = intake.open_catalog("*.yml")
 
-def get_dissolved_hq_corridor_bus(
-    gdf: gpd.GeoDataFrame, 
-    analysis_date: str
+def buffer_hq_corridor_bus(
+    analysis_date: str,
+    buffer_meters: int,
 ) -> gpd.GeoDataFrame:
     """
-    Take each segment, then dissolve by operator,
-    and use this dissolved polygon in hqta_polygons.
+    Buffer hq bus corridors.
     
-    Draw a buffer around this. 
+    Start with bus corridors, filter to those that are high quality,
+    and do a dissolve.
+    After the dissolve, buffer by an additional amount to
+    get the full 0.5 mile buffer.
     """
-    # Can keep route_id in dissolve, but route_id is not kept in final 
-    # export, so there would be multiple rows for multiple route_ids,
-    # and no way to distinguish between them    
-    keep_cols = ['feed_key', 'hq_transit_corr', 'route_id']
+    gdf = prep_bus_corridors(
+        is_hq_corr=True
+    ).to_crs(PROJECT_CRS)
+
+    keep_cols = ['schedule_gtfs_dataset_key', 'route_id']
     
     dissolved = (gdf[keep_cols + ["geometry"]]
                  .dissolve(by=keep_cols)
                  .reset_index()
                 )
     
-    # For hq_corridor_bus, we have feed_key again, and need to 
-    # add agency_name, or else this category will have missing name values
-    corridor_cols = [
-        "feed_key", "hqta_type", "route_id", "geometry"
-    ]
+    # Bus corridors are already buffered 50 meters,
+    # so will buffer 755 meters more to get the 0.5 mile radius
     corridors = dissolved.assign(
-        geometry = dissolved.geometry.buffer(755),
+        geometry = dissolved.geometry.buffer(buffer_meters),
         # overwrite hqta_type for this polygon
         hqta_type = "hq_corridor_bus",
-    )[corridor_cols].rename(
-        columns = {"feed_key": "feed_key_primary"}
-    )
-    
-    crosswalk = pd.read_parquet(
-        f"{GCS_FILE_PATH}feed_key_org_crosswalk.parquet"
-    )
-    primary_agency_cols = [i for i in crosswalk.columns if "_primary" in i]
+    ).pipe(_utils.primary_rename)
     
-    crosswalk = crosswalk[primary_agency_cols].drop_duplicates()
+    agency_info = get_agency_crosswalk(analysis_date)
 
+    # Make sure gtfs_dataset_name and organization columns are added
     corridors2 = pd.merge(
         corridors,
-        crosswalk,
-        on = "feed_key_primary",
+        agency_info.add_suffix("_primary"),
+        on = "schedule_gtfs_dataset_key_primary",
         how = "inner"
     )
     
     corridors2 = corridors2.assign(
-        hqta_details = corridors2.apply(
-            assemble_hqta_points.hqta_details, axis=1),
+        hqta_details = "stop_along_hq_bus_corridor_single_operator"
     )
-    
+
     return corridors2
 
 
-def filter_and_buffer(
-    hqta_points: gpd.GeoDataFrame,
-    hqta_segments: gpd.GeoDataFrame,
+def buffer_major_transit_stops(
+    buffer_meters: int
+) -> gpd.GeoDataFrame:
+    """
+    Buffer major transit stops.
+    Start with hqta points and filter out the hq_corridor_bus types.
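+
+    buffer_meters is in PROJECT_CRS units (meters for EPSG:3310);
+    passing HALF_MILE_BUFFER_METERS (805) gives the half mile radius.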
+    """
+    hqta_points = catalog.hqta_points.read().to_crs(PROJECT_CRS)
+
+    stops = hqta_points[hqta_points.hqta_type != "hq_corridor_bus"]
+
+    # General buffer distance: 1/2mi ~= 805 meters
+    # major stop points are unbuffered, so buffer the full amount here
+    stops = stops.assign(
+        geometry = stops.geometry.buffer(buffer_meters)
+    )
+
+    return stops
+
+
+def combine_corridors_and_stops(
     analysis_date: str
 ) -> gpd.GeoDataFrame:
     """
@@ -85,19 +98,18 @@
     Buffers are already drawn for corridors and stops, so 
     draw new buffers, and address each hqta_type separately.
     """
-    stops = hqta_points[hqta_points.hqta_type != "hq_corridor_bus"]
-    
-    corridors = get_dissolved_hq_corridor_bus(hqta_segments, analysis_date)
+    corridors = buffer_hq_corridor_bus(
+        analysis_date,
+        buffer_meters = CORRIDOR_BUFFER_METERS,
+    )
     
-    # General buffer distance: 1/2mi ~= 805 meters
-    # Bus corridors are already buffered 100 meters, so will buffer 705 meters
-    stops = stops.assign(
-        geometry = stops.geometry.buffer(705)
+    major_transit_stops = buffer_major_transit_stops(
+        buffer_meters = HALF_MILE_BUFFER_METERS
     )
     
     hqta_polygons = pd.concat([
         corridors,
-        stops
+        major_transit_stops
     ], axis=0).to_crs(geography_utils.WGS84)
     
     return hqta_polygons
@@ -106,7 +118,6 @@
 def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
     """
     Drop extra columns, get sorting done.
-    Used to drop bad stops, but these all look ok.
     """
     keep_cols = [
         "agency_primary", "agency_secondary",
@@ -116,7 +127,6 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
         "geometry"
     ]
     
-    # Drop bad stops, subset columns
     gdf2 = (
         gdf[keep_cols]
         .drop_duplicates()
@@ -138,17 +148,8 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
     
     start = datetime.datetime.now()
     
-    hqta_points = catalog.hqta_points.read().to_crs(PROJECT_CRS)
-    bus_hq_corr = prep_clip.prep_bus_corridors(
-        is_hq_corr=True
-    ).to_crs(PROJECT_CRS)
-    
-    # Filter and buffer for stops (805 m) and corridors (755 m)
-    # and add agency_names
-    gdf = filter_and_buffer(
-        hqta_points, bus_hq_corr, analysis_date
-    ).pipe(final_processing)
-    
+    gdf = combine_corridors_and_stops(analysis_date).pipe(final_processing)
+
     # Export to GCS
     utils.geoparquet_gcs_export(
         gdf,
@@ -164,5 +165,8 @@ def final_processing(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
     )
     
     end = datetime.datetime.now()
-    logger.info(f"D2_assemble_hqta_polygons {analysis_date} "
-                f"execution time: {end - start}")
\ No newline at end of file
+    logger.info(
+        f"D2_assemble_hqta_polygons {analysis_date} "
+        f"execution time: {end - start}"
+    )
+    
\ No newline at end of file
diff --git a/high_quality_transit_areas/catalog.yml b/high_quality_transit_areas/catalog.yml
index eb478de3d..ad92b3417 100644
--- a/high_quality_transit_areas/catalog.yml
+++ b/high_quality_transit_areas/catalog.yml
@@ -15,9 +15,15 @@ sources:
       urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/ca_boundary.parquet
   ## INTERMEDIATE DATA
   # Source: A1_rail_ferry_brt_stops.py
+  stops_info_crosswalk:
+    driver: geoparquet
+    description: Assembled stop gdf with route info from trips table.
+    args:
+      urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/stops_to_route.parquet
+  # Source: A1_rail_ferry_brt_stops.py
   rail_brt_ferry_initial:
     driver: geoparquet
-    description: Rail / BRT / Ferry points created in A2_combine_stops.py
+    description: All the major transit stops (Rail / BRT / Ferry).
args: urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/rail_brt_ferry.parquet # Source: B1_create_hqta_segments.py @@ -35,7 +41,7 @@ sources: # Source: C1_prep_pairwise_intersections.py pairwise_intersections: driver: parquet - description: Use dask_geopandas.sjoin to find which hqta segments do intersect at some point. Created in C1_prep_pairwise_intersections.py. + description: Use spatial join to find which hqta segments do intersect at some point. Created in C1_prep_pairwise_intersections.py. args: urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/pairwise.parquet # Source: C1_prep_pairwise_intersections.py diff --git a/high_quality_transit_areas/logs/hqta_processing.log b/high_quality_transit_areas/logs/hqta_processing.log index 66aa62c21..3811550e5 100644 --- a/high_quality_transit_areas/logs/hqta_processing.log +++ b/high_quality_transit_areas/logs/hqta_processing.log @@ -142,3 +142,18 @@ 2024-09-19 09:32:36.082 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-09-18 execution time: 0:00:37.486499 2024-09-19 09:33:22.863 | INFO | __main__::307 - D1_assemble_hqta_points 2024-09-18 execution time: 0:00:27.668799 2024-09-19 09:36:35.489 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-09-18 execution time: 0:00:26.678607 +2024-09-19 14:07:11.844 | INFO | __main__::269 - A1_rail_ferry_brt_stops 2024-09-18 execution time: 0:00:16.390254 +2024-09-19 14:10:48.407 | INFO | __main__::276 - A1_rail_ferry_brt_stops 2024-09-18 execution time: 0:00:17.570946 +2024-09-19 14:51:15.926 | INFO | __main__::248 - B1_create_hqta_segments execution time: 0:04:11.305371 +2024-09-19 14:55:50.079 | INFO | __main__::268 - B2_sjoin_stops_to_segments 2024-09-18 execution time: 0:00:50.195775 +2024-09-19 14:56:18.048 | INFO | __main__::140 - C1_prep_pairwise_intersections 2024-09-18 execution time: 0:00:09.222281 +2024-09-19 14:57:06.861 | INFO | __main__::124 - C2_find_intersections 2024-09-18 execution time: 0:00:32.170864 +2024-09-19 15:04:46.269 | INFO | __main__::176 - C3_create_bus_hqta_types 2024-09-18 execution time: 0:00:28.423966 +2024-09-25 10:10:35.837 | INFO | __main__::275 - A1_rail_ferry_brt_stops 2024-09-18 execution time: 0:00:16.979217 +2024-09-25 10:18:42.737 | INFO | __main__::248 - B1_create_hqta_segments execution time: 0:07:48.832296 +2024-09-25 10:20:17.796 | INFO | __main__::267 - B2_sjoin_stops_to_segments 2024-09-18 execution time: 0:01:12.987290 +2024-09-25 10:20:52.130 | INFO | __main__::140 - C1_prep_pairwise_intersections 2024-09-18 execution time: 0:00:10.962771 +2024-09-25 10:22:04.117 | INFO | __main__::124 - C2_find_intersections 2024-09-18 execution time: 0:00:49.634882 +2024-09-25 10:25:48.480 | INFO | __main__::175 - C3_create_bus_hqta_types 2024-09-18 execution time: 0:00:26.100650 +2024-09-25 10:27:51.226 | INFO | __main__::219 - D1_assemble_hqta_points 2024-09-18 execution time: 0:00:18.606053 +2024-09-25 10:30:59.443 | INFO | __main__::168 - D2_assemble_hqta_polygons 2024-09-18 execution time: 0:00:30.215506 diff --git a/high_quality_transit_areas/update_vars.py b/high_quality_transit_areas/update_vars.py index dffb41557..d35684264 100644 --- a/high_quality_transit_areas/update_vars.py +++ b/high_quality_transit_areas/update_vars.py @@ -4,8 +4,9 @@ GCS_FILE_PATH = ("gs://calitp-analytics-data/data-analyses/" "high_quality_transit_areas/") -TEMP_GCS = f"{GCS_FILE_PATH}temp/" PROJECT_CRS = "EPSG:3310" HQTA_SEGMENT_LENGTH = 1_250 # meters -BUFFER_METERS = 50 +SEGMENT_BUFFER_METERS = 50 # buffer around 
segment to sjoin to stops +HALF_MILE_BUFFER_METERS = 805 # half mile ~ 805 meters +CORRIDOR_BUFFER_METERS = HALF_MILE_BUFFER_METERS - SEGMENT_BUFFER_METERS # 755 meters EXPORT_PATH = f"{GCS_FILE_PATH}export/{analysis_date}/" \ No newline at end of file From 1ff8a34ef7ceb49342485b911b2cd1a06e0dbb78 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Wed, 25 Sep 2024 17:36:27 +0000 Subject: [PATCH 08/11] rename files, use makefile to set order --- high_quality_transit_areas/Makefile | 16 ++++---- high_quality_transit_areas/README.md | 20 +++++----- ...hqta_points.py => assemble_hqta_points.py} | 0 ..._polygons.py => assemble_hqta_polygons.py} | 4 +- high_quality_transit_areas/catalog.yml | 38 +++++++++---------- ...hqta_types.py => create_bus_hqta_types.py} | 5 +-- ...ta_segments.py => create_hqta_segments.py} | 0 ..._intersections.py => get_intersections.py} | 0 ...ions.py => prep_pairwise_intersections.py} | 0 ...y_brt_stops.py => rail_ferry_brt_stops.py} | 0 ...segments.py => sjoin_stops_to_segments.py} | 0 11 files changed, 40 insertions(+), 43 deletions(-) rename high_quality_transit_areas/{D1_assemble_hqta_points.py => assemble_hqta_points.py} (100%) rename high_quality_transit_areas/{D2_assemble_hqta_polygons.py => assemble_hqta_polygons.py} (97%) rename high_quality_transit_areas/{C3_create_bus_hqta_types.py => create_bus_hqta_types.py} (97%) rename high_quality_transit_areas/{B1_create_hqta_segments.py => create_hqta_segments.py} (100%) rename high_quality_transit_areas/{C2_get_intersections.py => get_intersections.py} (100%) rename high_quality_transit_areas/{C1_prep_pairwise_intersections.py => prep_pairwise_intersections.py} (100%) rename high_quality_transit_areas/{A1_rail_ferry_brt_stops.py => rail_ferry_brt_stops.py} (100%) rename high_quality_transit_areas/{B2_sjoin_stops_to_segments.py => sjoin_stops_to_segments.py} (100%) diff --git a/high_quality_transit_areas/Makefile b/high_quality_transit_areas/Makefile index 6130489cc..3f9860c33 100644 --- a/high_quality_transit_areas/Makefile +++ b/high_quality_transit_areas/Makefile @@ -1,12 +1,12 @@ hqta_data: - python A1_rail_ferry_brt_stops.py - python B1_create_hqta_segments.py - python B2_sjoin_stops_to_segments.py - python C1_prep_pairwise_intersections.py - python C2_get_intersections.py - python C3_create_bus_hqta_types.py - python D1_assemble_hqta_points.py - python D2_assemble_hqta_polygons.py + python rail_ferry_brt_stops.py + python create_hqta_segments.py + python sjoin_stops_to_segments.py + python prep_pairwise_intersections.py + python get_intersections.py + python create_bus_hqta_types.py + python assemble_hqta_points.py + python assemble_hqta_polygons.py # Only need this is operator input changes # For now, Muni sent over a date-versioned list of stops diff --git a/high_quality_transit_areas/README.md b/high_quality_transit_areas/README.md index 44066ee52..45fb2f624 100644 --- a/high_quality_transit_areas/README.md +++ b/high_quality_transit_areas/README.md @@ -80,22 +80,20 @@ If not, within the `gtfs_funnel` directory, run `make download_gtfs_data` in the In terminal: `make hqta_data` to run through entire workflow. -1. [Compile rail, ferry, brt data](./A1_rail_ferry_brt_stops.py) - * Sanity check: [check 1: downloads](./check1_downloads.ipynb) -1. [Draw bus corridors, from routes to HQTA segments](./B1_create_hqta_segments.py) +1. [Compile rail, ferry, brt data](./rail_ferry_brt_stops.py) +1. 
[Draw bus corridors, from routes to HQTA segments](./create_hqta_segments.py)
    * Across all operators, find the longest shapes in each direction. Use a symmetric difference to grab the components that make up the route network.
    * Cut route into HQTA segments. Every segment is 1,250 m.
    * Add in route direction.
1. [Combine operator HQTA areas across operators](./sjoin_stops_to_segments.py)
    * Attach number of stop arrivals that occur in the AM and PM and find the max
    * Do spatial join of stops to HQTA segments. Where multiple stops are present, keep the stop with the highest number of trips.
1. [Use pairwise table to store which segments intersect](./prep_pairwise_intersections.py)
    * Find which routes actually do intersect, and store that in a pairwise table.
1. [Find where corridors intersect](./get_intersections.py)
1. [Create datasets for each of the hqta types](./create_bus_hqta_types.py)
    * `major_stop_bus`: the bus stop within the above intersection does not necessarily have the highest trip count
    * `hq_corridor_bus`: stops along the HQ transit corridor (may not be highest trip count)
-   * Sanity check: [check 2: hq corridors](./check2_hq_corridors.ipynb)
-1. [Compile and export HQTA areas as points](./D1_assemble_hqta_points.py)
-   * Sanity check: [check 3: hqta points](./check3_hqta_points.ipynb)
-1. [Compile and export HQTA areas as polygons](./D2_assemble_hqta_polygons.py)
+1. [Compile and export HQTA areas as points](./assemble_hqta_points.py)
+   * Sanity check: [check 3: hqta points](./check_exports.ipynb)
+1. 
[Compile and export HQTA areas as polygons](./assemble_hqta_polygons.py) diff --git a/high_quality_transit_areas/D1_assemble_hqta_points.py b/high_quality_transit_areas/assemble_hqta_points.py similarity index 100% rename from high_quality_transit_areas/D1_assemble_hqta_points.py rename to high_quality_transit_areas/assemble_hqta_points.py diff --git a/high_quality_transit_areas/D2_assemble_hqta_polygons.py b/high_quality_transit_areas/assemble_hqta_polygons.py similarity index 97% rename from high_quality_transit_areas/D2_assemble_hqta_polygons.py rename to high_quality_transit_areas/assemble_hqta_polygons.py index 7f0328b7e..7558c68ef 100644 --- a/high_quality_transit_areas/D2_assemble_hqta_polygons.py +++ b/high_quality_transit_areas/assemble_hqta_polygons.py @@ -12,8 +12,8 @@ from loguru import logger import _utils -from C1_prep_pairwise_intersections import prep_bus_corridors -from D1_assemble_hqta_points import get_agency_crosswalk +from prep_pairwise_intersections import prep_bus_corridors +from assemble_hqta_points import get_agency_crosswalk from calitp_data_analysis import utils, geography_utils from update_vars import (GCS_FILE_PATH, analysis_date, PROJECT_CRS, EXPORT_PATH, HALF_MILE_BUFFER_METERS, CORRIDOR_BUFFER_METERS diff --git a/high_quality_transit_areas/catalog.yml b/high_quality_transit_areas/catalog.yml index ad92b3417..de57d1fff 100644 --- a/high_quality_transit_areas/catalog.yml +++ b/high_quality_transit_areas/catalog.yml @@ -14,69 +14,69 @@ sources: args: urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/ca_boundary.parquet ## INTERMEDIATE DATA - # Source: A1_rail_ferry_brt_stops.py + # Source: rail_ferry_brt_stops.py stops_info_crosswalk: driver: geoparquet description: Assembled stop gdf with route info from trips table. args: urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/stops_to_route.parquet - # Source: A1_rail_ferry_brt_stops.py - rail_brt_ferry_initial: + # Source: rail_ferry_brt_stops.py + rail_brt_ferry_stops: driver: geoparquet description: All the major transit stops (Rail / BRT / Ferry). args: urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/rail_brt_ferry.parquet - # Source: B1_create_hqta_segments.py + # Source: create_hqta_segments.py hqta_segments: driver: geoparquet - description: Cut HQTA segments across all operators. Created in B1_create_hqta_segments.py. + description: Cut HQTA segments across all operators. args: urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/hqta_segments.parquet - # Source: B2_sjoin_stops_to_segments.py + # Source: sjoin_stops_to_segments.py all_bus: driver: geoparquet - description: Combined hqta corridors across all operators. Created in B2_sjoin_stops_to_segments.py. + description: Combined hqta corridors across all operators. args: urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/all_bus.parquet - # Source: C1_prep_pairwise_intersections.py + # Source: prep_pairwise_intersections.py pairwise_intersections: driver: parquet - description: Use spatial join to find which hqta segments do intersect at some point. Created in C1_prep_pairwise_intersections.py. + description: Use spatial join to find which hqta segments do intersect at some point. 
args:
      urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/pairwise.parquet
-  # Source: C1_prep_pairwise_intersections.py
+  # Source: prep_pairwise_intersections.py
   subset_corridors:
     driver: geoparquet
-    description: Narrow down `all_bus` to hqta segments that are found in `pairwise_intersections`. Created in C1_prep_pairwise_intersections.py.
+    description: Narrow down `all_bus` to hqta segments that are found in `pairwise_intersections`.
     args:
       urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/subset_corridors.parquet
-  # Source: C2_get_intersections.py
+  # Source: get_intersections.py
   all_intersections:
     driver: geoparquet
     description: Find where 2 bus corridors intersect by doing an intersection between the hqta segments.
     args:
       urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/all_intersections.parquet
-  # Source: C3_create_bus_hqta_types.py
+  # Source: create_bus_hqta_types.py
   major_stop_bus:
     driver: geoparquet
-    description: Bus stops that are within the bus intersections. Created in C3_create_bus_hqta_types.py.
+    description: Bus stops that are within the bus intersections.
     args:
       urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/major_stop_bus.parquet
   stops_in_hq_corr:
     driver: geoparquet
-    description: Bus stops that are within the HQ corridors, even if they stops themselves do not have a lot of trips pass through. Created in C3_create_bus_hqta_types.py.
+    description: Bus stops that are within the HQ corridors, even if the stops themselves do not have many trips passing through.
     args:
       urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/stops_in_hq_corr.parquet
   ## FINAL DATA
-  # Source: D1_assemble_hqta_points
+  # Source: assemble_hqta_points
   hqta_points:
     driver: geoparquet
-    description: Combined point data for all HQTA types. Created in D1_assemble_hqta_points.py.
+    description: Combined point data for all HQTA types.
     args:
       urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/hqta_points.parquet
-  # Source: D2_assemble_hqta_polygons
+  # Source: assemble_hqta_polygons
   hqta_areas:
     driver: geoparquet
-    description: Combined polygon data for all HQTA types. Created in D2_assemble_hqta_polygons.py.
+    description: Combined polygon data for all HQTA types.
     args:
       urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/hqta_areas.parquet
\ No newline at end of file
diff --git a/high_quality_transit_areas/C3_create_bus_hqta_types.py b/high_quality_transit_areas/create_bus_hqta_types.py
similarity index 97%
rename from high_quality_transit_areas/C3_create_bus_hqta_types.py
rename to high_quality_transit_areas/create_bus_hqta_types.py
index 1f0574e41..1264148b7 100644
--- a/high_quality_transit_areas/C3_create_bus_hqta_types.py
+++ b/high_quality_transit_areas/create_bus_hqta_types.py
@@ -17,8 +17,7 @@
 from loguru import logger
 
 import _utils
-import C1_prep_pairwise_intersections as prep_clip
-
+from prep_pairwise_intersections import prep_bus_corridors
 from calitp_data_analysis import utils
 from segment_speed_utils import helpers
 from update_vars import (GCS_FILE_PATH, analysis_date,
@@ -93,7 +92,7 @@ def create_stops_along_corridors(all_stops: gpd.GeoDataFram
     They may also be stops that don't meet the HQ corridor threshold,
     but are stops that physically reside in the corridor.
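+    These corridor stops are tagged hqta_type == hq_corridor_bus.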
""" - bus_corridors = (prep_clip.prep_bus_corridors(is_hq_corr = True) + bus_corridors = (prep_bus_corridors(is_hq_corr = True) [["hqta_segment_id", "geometry"]] ) diff --git a/high_quality_transit_areas/B1_create_hqta_segments.py b/high_quality_transit_areas/create_hqta_segments.py similarity index 100% rename from high_quality_transit_areas/B1_create_hqta_segments.py rename to high_quality_transit_areas/create_hqta_segments.py diff --git a/high_quality_transit_areas/C2_get_intersections.py b/high_quality_transit_areas/get_intersections.py similarity index 100% rename from high_quality_transit_areas/C2_get_intersections.py rename to high_quality_transit_areas/get_intersections.py diff --git a/high_quality_transit_areas/C1_prep_pairwise_intersections.py b/high_quality_transit_areas/prep_pairwise_intersections.py similarity index 100% rename from high_quality_transit_areas/C1_prep_pairwise_intersections.py rename to high_quality_transit_areas/prep_pairwise_intersections.py diff --git a/high_quality_transit_areas/A1_rail_ferry_brt_stops.py b/high_quality_transit_areas/rail_ferry_brt_stops.py similarity index 100% rename from high_quality_transit_areas/A1_rail_ferry_brt_stops.py rename to high_quality_transit_areas/rail_ferry_brt_stops.py diff --git a/high_quality_transit_areas/B2_sjoin_stops_to_segments.py b/high_quality_transit_areas/sjoin_stops_to_segments.py similarity index 100% rename from high_quality_transit_areas/B2_sjoin_stops_to_segments.py rename to high_quality_transit_areas/sjoin_stops_to_segments.py From dae14d09c2c66d9e084aa986f79a8972848a8484 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Wed, 25 Sep 2024 17:37:10 +0000 Subject: [PATCH 09/11] clean up logs --- .../logs/hqta_processing.log | 120 ------------------ 1 file changed, 120 deletions(-) diff --git a/high_quality_transit_areas/logs/hqta_processing.log b/high_quality_transit_areas/logs/hqta_processing.log index 3811550e5..9db268abf 100644 --- a/high_quality_transit_areas/logs/hqta_processing.log +++ b/high_quality_transit_areas/logs/hqta_processing.log @@ -1,123 +1,3 @@ -2024-02-15 10:39:37.404 | INFO | __main__::62 - Analysis date: 2024-02-14 -2024-02-15 10:39:53.615 | INFO | __main__::70 - grabbed rail: 0:00:16.209130 -2024-02-15 10:40:08.900 | INFO | __main__::79 - grabbed brt: 0:00:15.285392 -2024-02-15 10:40:23.528 | INFO | __main__::86 - grabbed ferry: 0:00:14.627469 -2024-02-15 10:40:27.034 | INFO | __main__::102 - concatenated datasets -2024-02-15 10:40:27.290 | INFO | __main__::113 - execution time: 0:00:49.884562 -2024-02-15 10:41:59.689 | INFO | __main__::281 - Analysis date: 2024-02-14 -2024-02-15 10:42:06.502 | INFO | __main__::298 - merge routes to trips: 0:00:06.811228 -2024-02-15 10:48:36.857 | INFO | __main__::319 - cut segments: 0:06:30.354741 -2024-02-15 10:48:36.858 | INFO | __main__::322 - total execution time: 0:06:37.166630 -2024-02-15 10:48:55.584 | INFO | __main__::218 - Analysis date: 2024-02-14 -2024-02-15 10:49:39.341 | INFO | __main__::258 - Execution time: 0:00:43.7553602024-02-15 11:12:10.529 | INFO | __main__::123 - operators_for_hqta Analysis date: 2024-02-14 -2024-02-15 11:12:11.884 | INFO | __main__::132 - get list of cached ITP IDs: 0:00:01.322261 -2024-02-15 11:12:13.731 | INFO | __main__::148 - check files for completeness, save as json: 0:00:01.846775 -2024-02-15 11:12:13.731 | INFO | __main__::151 - operators_for_hqta execution time: 0:00:03.169393 -2024-02-15 11:12:29.212 | INFO | __main__::62 - A2_combine_stops Analysis Date: 2024-02-14 -2024-02-15 11:12:42.921 | INFO | 
__main__::70 - grabbed rail: 0:00:13.707923 -2024-02-15 11:12:56.621 | INFO | __main__::79 - grabbed brt: 0:00:13.700660 -2024-02-15 11:13:08.412 | INFO | __main__::86 - grabbed ferry: 0:00:11.790357 -2024-02-15 11:13:11.728 | INFO | __main__::110 - A2_combine_stops execution time: 0:00:42.515868 -2024-02-15 11:13:25.690 | INFO | __main__::281 - B1_create_hqta_segments Analysis date: 2024-02-14 -2024-02-15 11:13:31.067 | INFO | __main__::298 - merge routes to trips: 0:00:05.376384 -2024-02-15 11:17:26.810 | INFO | __main__::319 - cut segments: 0:03:55.742468 -2024-02-15 11:17:26.810 | INFO | __main__::322 - B1_create_hqta_segments execution time: 0:04:01.119368 -2024-02-15 11:17:40.815 | INFO | __main__::218 - B2_sjoin_stops_to_segments Analysis date: 2024-02-14 -2024-02-15 11:18:13.209 | INFO | __main__::258 - B2_sjoin_stops_to_segments execution time: 0:00:32.393811 -2024-02-15 11:18:28.611 | INFO | __main__::90 - C1_prep_pairwise_intersections Analysis date: 2024-02-14 -2024-02-15 11:18:35.802 | INFO | __main__::113 - get pairwise table: 0:00:07.190876 -2024-02-15 11:18:36.183 | INFO | __main__::136 - compute for pairwise/subset_corridors: 0:00:00.381023 -2024-02-15 11:18:38.070 | INFO | __main__::148 - C1_prep_pairwise_intersections execution time: 0:00:09.459173 -2024-02-15 11:18:51.793 | INFO | __main__::109 - C2_find_intersections Analysis date: 2024-02-14 -2024-02-15 11:18:58.127 | INFO | __main__::119 - attach geometry to pairwise table: 0:00:06.332460 -2024-02-15 11:19:14.113 | INFO | __main__::124 - find intersections: 0:00:15.986460 -2024-02-15 11:19:17.140 | INFO | __main__::133 - C2_find_intersections execution time: 0:00:25.345916 -2024-02-15 11:19:30.961 | INFO | __main__::127 - C3_create_bus_hqta_types Analysis date: 2024-02-14 -2024-02-15 11:19:46.819 | INFO | __main__::141 - grab all stops -2024-02-15 11:19:49.936 | INFO | __main__::145 - create major stop bus -2024-02-15 11:19:53.686 | INFO | __main__::149 - create hq corridor bus -2024-02-15 11:19:54.723 | INFO | __main__::165 - C3_create_bus_hqta_types execution time: 0:00:23.759924 -2024-02-15 11:20:10.567 | INFO | __main__::209 - D1_assemble_hqta_points Analysis date: 2024-02-14 -2024-02-15 11:20:15.564 | INFO | __main__::234 - combined points: 0:00:04.997074 -2024-02-15 11:20:28.853 | INFO | __main__::240 - add route info: 0:00:13.288940 -2024-02-15 11:20:38.686 | INFO | __main__::248 - add agency names: 0:00:09.832929 -2024-02-15 11:20:39.357 | INFO | __main__::258 - export as geoparquet in date folder -2024-02-15 11:20:39.906 | INFO | __main__::267 - export as geoparquet -2024-02-15 11:20:39.907 | INFO | __main__::270 - D1_assemble_hqta_points execution time: 0:00:29.339672 -2024-02-15 11:20:53.679 | INFO | __main__::135 - D2_assemble_hqta_polygons Analysis date: 2024-02-14 -2024-02-15 11:21:20.489 | INFO | __main__::146 - filter and buffer: 0:00:26.810481 -2024-02-15 11:21:22.772 | INFO | __main__::159 - export as geoparquet in date folder -2024-02-15 11:21:24.051 | INFO | __main__::168 - export as geoparquet -2024-02-15 11:21:24.052 | INFO | __main__::171 - D2_assemble_hqta_polygons execution time: 0:00:30.373503 -2024-02-23 12:16:18.905 | INFO | __main__::334 - A1_rail_ferry_brt_stops 2024-02-14 execution time: 0:00:55.356475 -2024-02-23 13:02:56.942 | INFO | __main__::243 - B1_create_hqta_segments execution time: 0:02:45.137511 -2024-02-23 13:06:07.547 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-02-14 execution time: 0:00:43.474327 -2024-02-23 13:18:24.960 | INFO | __main__::143 - 
C1_prep_pairwise_intersections 2024-02-14 execution time: 0:00:09.433171 -2024-02-23 13:20:52.094 | INFO | __main__::125 - C2_find_intersections 2024-02-14 execution time: 0:00:26.055988 -2024-02-23 13:38:04.076 | INFO | __main__::341 - A1_rail_ferry_brt_stops 2024-02-14 execution time: 0:00:47.907172 -2024-02-23 13:41:03.390 | INFO | __main__::243 - B1_create_hqta_segments execution time: 0:02:43.653086 -2024-02-23 13:41:58.511 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-02-14 execution time: 0:00:38.637850 -2024-02-23 13:42:21.715 | INFO | __main__::143 - C1_prep_pairwise_intersections 2024-02-14 execution time: 0:00:06.457581 -2024-02-23 13:42:59.131 | INFO | __main__::125 - C2_find_intersections 2024-02-14 execution time: 0:00:21.831450 -2024-02-23 13:48:09.216 | INFO | __main__::160 - C3_create_bus_hqta_types 2024-02-14 execution time: 0:00:21.771559 -2024-02-23 15:04:25.099 | INFO | __main__::295 - D1_assemble_hqta_points 2024-02-14 execution time: 0:00:24.251981 -2024-02-23 15:06:13.474 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-02-14 execution time: 0:00:20.572008 -2024-03-14 10:35:35.916 | INFO | __main__::340 - A1_rail_ferry_brt_stops 2023-03-13 execution time: 0:00:56.468970 -2024-03-14 10:49:33.915 | INFO | __main__::243 - B1_create_hqta_segments execution time: 0:13:35.798908 -2024-03-14 10:50:41.957 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2023-03-13 execution time: 0:00:46.505567 -2024-03-14 10:51:12.944 | INFO | __main__::142 - C1_prep_pairwise_intersections 2023-03-13 execution time: 0:00:07.495188 -2024-03-14 10:52:10.238 | INFO | __main__::125 - C2_find_intersections 2023-03-13 execution time: 0:00:34.444930 -2024-03-14 10:53:01.682 | INFO | __main__::163 - C3_create_bus_hqta_types 2023-03-13 execution time: 0:00:29.090421 -2024-03-14 11:27:07.016 | INFO | __main__::259 - B1_create_hqta_segments execution time: 0:01:24.890920 -2024-03-14 11:29:20.496 | INFO | __main__::295 - D1_assemble_hqta_points 2023-03-13 execution time: 0:00:22.179824 -2024-03-14 11:30:06.328 | INFO | __main__::167 - D2_assemble_hqta_polygons 2023-03-13 execution time: 0:00:22.226070 -2024-03-14 11:48:11.160 | INFO | __main__::340 - A1_rail_ferry_brt_stops 2024-03-13 execution time: 0:00:57.213630 -2024-03-14 11:53:27.946 | INFO | __main__::259 - B1_create_hqta_segments execution time: 0:04:58.538786 -2024-03-14 11:54:43.754 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-03-13 execution time: 0:00:50.087412 -2024-03-14 11:55:10.649 | INFO | __main__::142 - C1_prep_pairwise_intersections 2024-03-13 execution time: 0:00:07.089710 -2024-03-14 11:56:06.191 | INFO | __main__::125 - C2_find_intersections 2024-03-13 execution time: 0:00:35.945019 -2024-03-14 11:56:55.334 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-03-13 execution time: 0:00:27.390021 -2024-03-14 12:12:21.763 | INFO | __main__::295 - D1_assemble_hqta_points 2024-03-13 execution time: 0:00:26.480160 -2024-03-14 12:13:12.687 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-03-13 execution time: 0:00:29.033860 -2024-03-21 11:54:40.930 | INFO | __main__::354 - A1_rail_ferry_brt_stops 2024-03-13 execution time: 0:00:51.987419 -2024-03-21 12:01:28.365 | INFO | __main__::249 - B1_create_hqta_segments execution time: 0:03:02.428114 -2024-03-21 12:02:23.099 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-03-13 execution time: 0:00:35.845848 -2024-03-21 12:02:46.911 | INFO | __main__::142 - C1_prep_pairwise_intersections 2024-03-13 execution time: 0:00:05.864652 -2024-03-21 
12:03:24.770 | INFO | __main__::125 - C2_find_intersections 2024-03-13 execution time: 0:00:21.158652 -2024-03-21 12:04:01.449 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-03-13 execution time: 0:00:19.553787 -2024-03-21 12:04:42.807 | INFO | __main__::295 - D1_assemble_hqta_points 2024-03-13 execution time: 0:00:22.988739 -2024-03-21 12:05:20.102 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-03-13 execution time: 0:00:19.166756 -2024-04-18 12:02:44.870 | INFO | __main__::354 - A1_rail_ferry_brt_stops 2024-04-17 execution time: 0:00:59.115933 -2024-04-18 12:09:06.425 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-04-17 execution time: 0:00:50.678918 -2024-04-18 12:09:36.340 | INFO | __main__::142 - C1_prep_pairwise_intersections 2024-04-17 execution time: 0:00:07.719892 -2024-04-18 12:10:31.226 | INFO | __main__::125 - C2_find_intersections 2024-04-17 execution time: 0:00:33.802270 -2024-04-18 12:11:31.609 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-04-17 execution time: 0:00:37.330690 -2024-04-18 12:12:28.853 | INFO | __main__::296 - D1_assemble_hqta_points 2024-04-17 execution time: 0:00:31.955298 -2024-04-18 12:13:36.294 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-04-17 execution time: 0:00:40.596021 -2024-06-07 15:55:59.608 | INFO | __main__::354 - A1_rail_ferry_brt_stops 2024-05-26 execution time: 0:00:43.845652 -2024-06-07 16:00:03.975 | INFO | __main__::249 - B1_create_hqta_segments execution time: 0:03:43.783595 -2024-06-07 16:01:02.185 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-05-26 execution time: 0:00:34.801918 -2024-06-07 16:01:29.932 | INFO | __main__::142 - C1_prep_pairwise_intersections 2024-05-26 execution time: 0:00:05.850319 -2024-06-07 16:02:05.231 | INFO | __main__::125 - C2_find_intersections 2024-05-26 execution time: 0:00:14.305249 -2024-06-07 16:02:42.337 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-05-26 execution time: 0:00:16.144903 -2024-06-07 16:03:25.052 | INFO | __main__::296 - D1_assemble_hqta_points 2024-05-26 execution time: 0:00:20.105690 -2024-06-07 16:04:04.859 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-05-26 execution time: 0:00:16.899794 -2024-06-13 10:52:06.307 | INFO | __main__::354 - A1_rail_ferry_brt_stops 2024-05-22 execution time: 0:02:22.756503 -2024-06-13 12:58:01.749 | INFO | __main__::249 - B1_create_hqta_segments execution time: 0:13:52.098231 -2024-06-13 13:00:11.575 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-05-22 execution time: 0:01:32.770484 -2024-06-13 13:00:50.204 | INFO | __main__::142 - C1_prep_pairwise_intersections 2024-05-22 execution time: 0:00:10.587615 -2024-06-13 13:01:58.938 | INFO | __main__::125 - C2_find_intersections 2024-05-22 execution time: 0:00:42.017435 -2024-06-13 13:03:04.167 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-05-22 execution time: 0:00:38.066749 -2024-06-13 13:04:13.581 | INFO | __main__::296 - D1_assemble_hqta_points 2024-05-22 execution time: 0:00:33.857546 -2024-06-13 13:05:21.917 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-05-22 execution time: 0:00:38.362120 -2024-06-13 13:08:03.561 | INFO | __main__::354 - A1_rail_ferry_brt_stops 2024-06-12 execution time: 0:01:04.260629 -2024-06-13 13:17:59.981 | INFO | __main__::249 - B1_create_hqta_segments execution time: 0:09:30.012600 -2024-06-13 13:19:38.445 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-06-12 execution time: 0:01:13.144507 -2024-06-13 13:20:16.756 | INFO | __main__::142 - C1_prep_pairwise_intersections 
2024-06-12 execution time: 0:00:09.378312 -2024-06-13 13:21:54.122 | INFO | __main__::125 - C2_find_intersections 2024-06-12 execution time: 0:01:04.703513 -2024-06-13 13:23:49.518 | INFO | __main__::163 - C3_create_bus_hqta_types 2024-06-12 execution time: 0:01:10.061193 -2024-06-13 13:25:40.133 | INFO | __main__::296 - D1_assemble_hqta_points 2024-06-12 execution time: 0:00:58.173453 -2024-06-13 13:27:47.666 | INFO | __main__::167 - D2_assemble_hqta_polygons 2024-06-12 execution time: 0:01:16.776741 2024-07-18 12:57:28.027 | INFO | __main__::354 - A1_rail_ferry_brt_stops 2024-07-17 execution time: 0:01:01.056099 2024-07-18 13:01:59.287 | INFO | __main__::249 - B1_create_hqta_segments execution time: 0:04:10.669481 2024-07-18 13:03:04.077 | INFO | __main__::256 - B2_sjoin_stops_to_segments 2024-07-17 execution time: 0:00:44.167777 From 8498292164885de6d4ccd695f9d47ce6f0f267cf Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Wed, 25 Sep 2024 17:42:04 +0000 Subject: [PATCH 10/11] (remove): combine 3 hqta notebook checks into 1 --- .../check1_downloads.ipynb | 107 ------------- .../check2_hq_corridors.ipynb | 150 ------------------ ..._hqta_points.ipynb => check_exports.ipynb} | 58 +++++-- 3 files changed, 41 insertions(+), 274 deletions(-) delete mode 100644 high_quality_transit_areas/check1_downloads.ipynb delete mode 100644 high_quality_transit_areas/check2_hq_corridors.ipynb rename high_quality_transit_areas/{check3_hqta_points.ipynb => check_exports.ipynb} (81%) diff --git a/high_quality_transit_areas/check1_downloads.ipynb b/high_quality_transit_areas/check1_downloads.ipynb deleted file mode 100644 index b12fc7687..000000000 --- a/high_quality_transit_areas/check1_downloads.ipynb +++ /dev/null @@ -1,107 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "08f82968-a8b5-42c3-919a-f8f2028b9c8a", - "metadata": {}, - "source": [ - "# Check: initial downloads\n", - "\n", - "Make maps to see that rail/ferry/brt all show up correctly." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ecdd335a-be94-4a11-aaca-24a43a3b9756", - "metadata": {}, - "outputs": [], - "source": [ - "import geopandas as gpd\n", - "import pandas as pd\n", - "\n", - "from IPython.display import Markdown\n", - "\n", - "from segment_speed_utils import helpers\n", - "from update_vars import analysis_date, GCS_FILE_PATH\n", - "\n", - "# Map arguments\n", - "TILES = \"Carto DB Positron\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "573004b2-659d-4930-97b9-3f333a7ab8d8", - "metadata": {}, - "outputs": [], - "source": [ - "def make_map(gdf, plot_col):\n", - " date_cols = [c for c in gdf.columns if \n", - " gdf[c].dtype == 'datetime64[ns]']\n", - "\n", - " gdf = gdf.drop(columns = date_cols)\n", - " \n", - " m = gdf.explore(plot_col, categorical = True, tiles = TILES)\n", - " \n", - " display(m)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5351b749-106b-4b4b-aa67-aa76bad06ca2", - "metadata": {}, - "outputs": [], - "source": [ - "stops = gpd.read_parquet(\n", - " f\"{GCS_FILE_PATH}rail_brt_ferry.parquet\"\n", - ")\n", - "\n", - "hqta_types = list(stops.hqta_type.unique())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72b13f4a-313b-45fe-b420-3bb656c2fd25", - "metadata": {}, - "outputs": [], - "source": [ - "for i in hqta_types:\n", - " display(Markdown(f\"### HQTA Type: {i}\"))\n", - " \n", - " make_map(stops[stops.hqta_type==i], \"route_id\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5dc0b6aa-4e52-4c56-9d44-a9818b2890b2", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/high_quality_transit_areas/check2_hq_corridors.ipynb b/high_quality_transit_areas/check2_hq_corridors.ipynb deleted file mode 100644 index eba631e67..000000000 --- a/high_quality_transit_areas/check2_hq_corridors.ipynb +++ /dev/null @@ -1,150 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "08f82968-a8b5-42c3-919a-f8f2028b9c8a", - "metadata": {}, - "source": [ - "# Check: HQ corridors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ecdd335a-be94-4a11-aaca-24a43a3b9756", - "metadata": {}, - "outputs": [], - "source": [ - "import geopandas as gpd\n", - "import pandas as pd\n", - "\n", - "from shared_utils import rt_dates\n", - "from update_vars import analysis_date" - ] - }, - { - "cell_type": "markdown", - "id": "dc9684e7-ebc1-48a8-8360-40682a1a1c14", - "metadata": {}, - "source": [ - "### After `C3_create_bus_hqta_types`\n", - "\n", - "Check part of the compiling and assembly of polygons." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4fa137db-08d5-4822-9bdd-46919ee0da7f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import C1_prep_pairwise_intersections as prep_clip\n",
-    "import D2_assemble_hqta_polygons as D2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1e383a39-3810-41f6-a366-985126b335db",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bus_hq_corr = prep_clip.prep_bus_corridors(is_hq_corr=True)\n",
-    "\n",
-    "corridors = D2.get_dissolved_hq_corridor_bus(bus_hq_corr, \n",
-    "                                             analysis_date)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "691394ca-b7cc-4f81-8a96-2c167748e240",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "COUNTY_URL = \"https://opendata.arcgis.com/datasets/8713ced9b78a4abb97dc130a691a8695_0.geojson\"\n",
-    "\n",
-    "EPSG_CODE = corridors.crs.to_epsg()\n",
-    "counties = gpd.read_file(COUNTY_URL).to_crs(f\"EPSG: {EPSG_CODE}\")\n",
-    "\n",
-    "bay_area_counties = [\n",
-    "    \"Alameda\", \"Contra Costa\", \n",
-    "    \"Marin\", \"Napa\", \n",
-    "    \"San Francisco\", \"San Mateo\", \"Santa Clara\", \n",
-    "    \"Solano\", \"Sonoma\"\n",
-    "]\n",
-    "\n",
-    "hqta_in_bay = gpd.sjoin(\n",
-    "    corridors,\n",
-    "    counties[counties.COUNTY_NAME.isin(bay_area_counties)][\n",
-    "        [\"COUNTY_NAME\", \"geometry\"]],\n",
-    "    how = \"inner\",\n",
-    "    predicate=\"intersects\"\n",
-    ").drop(columns=\"index_right\")\n",
-    "\n",
-    "hqta_in_la = gpd.sjoin(\n",
-    "    corridors,\n",
-    "    counties[counties.COUNTY_NAME == \"Los Angeles\"][\n",
-    "        [\"COUNTY_NAME\", \"geometry\"]],\n",
-    "    how = \"inner\",\n",
-    "    predicate=\"intersects\"\n",
-    ").drop(columns=\"index_right\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c6368928-8f1a-4c08-b826-96115e85eb4f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Bay Area\n",
-    "TILES = \"CartoDB Positron\"\n",
-    "hqta_in_bay.explore(\"feed_key_primary\", categorical=True, \n",
-    "                    tiles = TILES, legend=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "549e7974-fa20-4cce-b02a-7984192483b4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# LA\n",
-    "hqta_in_la.explore(\"feed_key_primary\", categorical=True, \n",
-    "                   tiles = TILES, legend=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fdd4f490-df75-4433-898a-4479f5cb62b2",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/high_quality_transit_areas/check3_hqta_points.ipynb b/high_quality_transit_areas/check_exports.ipynb
similarity index 81%
rename from high_quality_transit_areas/check3_hqta_points.ipynb
rename to high_quality_transit_areas/check_exports.ipynb
index 7869c49f4..efb28e05f 100644
--- a/high_quality_transit_areas/check3_hqta_points.ipynb
+++ b/high_quality_transit_areas/check_exports.ipynb
@@ -5,13 +5,7 @@
    "id": "cec8ccdd-5225-4814-b59a-a8d398062e35",
    "metadata": {},
    "source": [
-    "# Check HQTA points / polygons\n",
-    "\n",
-    "## Dropping bad stops\n",
-    "\n",
-    "* Be more stringent about what `stop_id` to drop, since the same `stop_id` can be shared across operators. Also add in which operator.\n",
-    "\n",
-    "### Done in `D2_assemble_hqta_polygons`, but should also be added to `D1_assemble_hqta_points`\n"
+    "# Check HQTA points / polygons"
    ]
   },
   {
@@ -32,29 +26,57 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1c880d7b-ebb3-46c7-98f1-d724bdd802cb",
+   "id": "ad15e88d-da20-435c-b3c0-34df96ff75bf",
    "metadata": {},
    "outputs": [],
    "source": [
-    "gdf = gpd.read_parquet(f\"{GCS_FILE_PATH}hqta_points.parquet\")"
+    "def make_map(gdf, plot_col):\n",
+    "    date_cols = [c for c in gdf.columns if \n",
+    "                 gdf[c].dtype == 'datetime64[ns]']\n",
+    "    \n",
+    "    m = gdf.drop(columns = date_cols).explore(\n",
+    "        plot_col, \n",
+    "        categorical=True, \n",
+    "        tiles = \"CartoDB Positron\", \n",
+    "        legend=True\n",
+    "    )\n",
+    "    \n",
+    "    display(m)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "927b1e97-e7d6-4f46-9550-5c3f83ac08a8",
+   "metadata": {},
+   "source": [
+    "## Rail / BRT / Ferry stops"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ee3dad2e-935a-42b8-92af-7e7fae3d9248",
+   "id": "dcd7f8df-93bd-4702-a29e-ddd9211de08f",
    "metadata": {},
    "outputs": [],
    "source": [
-    "TILES = \"CartoDB Positron\"\n",
+    "stops = gpd.read_parquet(\n",
+    "    f\"{GCS_FILE_PATH}rail_brt_ferry.parquet\"\n",
+    ")\n",
     "\n",
-    "def make_map(gdf, plot_col):\n",
-    "    if \"service_date\" in gdf.columns:\n",
-    "        gdf = gdf.drop(columns = \"service_date\")\n",
-    "    \n",
-    "    m = gdf.explore(plot_col, categorical=True, tiles = TILES, legend=True)\n",
+    "hqta_types = list(stops.hqta_type.unique())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01ad05f9-bcb3-4b06-8911-2ab3669b5561",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in hqta_types:\n",
+    "    display(Markdown(f\"### HQTA Type: {i}\"))\n",
     "    \n",
-    "    display(m)"
+    "    make_map(stops[stops.hqta_type==i], \"route_id\")"
    ]
   },
@@ -95,6 +117,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "gdf = gpd.read_parquet(f\"{GCS_FILE_PATH}hqta_points.parquet\")\n",
+    "\n",
     "check_for_missing(gdf) "
    ]
   },

From e382995dc78f54fc8f80b1c5f1f3b309955bf1bd Mon Sep 17 00:00:00 2001
From: tiffanychu90
Date: Wed, 25 Sep 2024 17:45:52 +0000
Subject: [PATCH 11/11] notebooks use numbered ordering, add _utils

---
 ...ines.ipynb => 01_corridors-as-lines.ipynb} |  0
 ..._SACOG.ipynb => 02_hqta_green_SACOG.ipynb} |  0
 .../{hqta-map.ipynb => 03_hqta_map.ipynb}     |  0
 ...ittier.ipynb => 04_explore_whittier.ipynb} |  0
 .../{metro_brt.ipynb => 05_metro_brt.ipynb}   |  0
 .../{muni_brt.ipynb => 06_muni_brt.ipynb}     |  0
 .../{amtrak.ipynb => 07_amtrak.ipynb}         |  0
 high_quality_transit_areas/_utils.py          | 43 +++++++++++++++++++
 8 files changed, 43 insertions(+)
 rename high_quality_transit_areas/{corridors-as-lines.ipynb => 01_corridors-as-lines.ipynb} (100%)
 rename high_quality_transit_areas/{hqta_green_SACOG.ipynb => 02_hqta_green_SACOG.ipynb} (100%)
 rename high_quality_transit_areas/{hqta-map.ipynb => 03_hqta_map.ipynb} (100%)
 rename high_quality_transit_areas/{explore-whittier.ipynb => 04_explore_whittier.ipynb} (100%)
 rename high_quality_transit_areas/{metro_brt.ipynb => 05_metro_brt.ipynb} (100%)
 rename high_quality_transit_areas/{muni_brt.ipynb => 06_muni_brt.ipynb} (100%)
 rename high_quality_transit_areas/{amtrak.ipynb => 07_amtrak.ipynb} (100%)
 create mode 100644 high_quality_transit_areas/_utils.py

diff --git a/high_quality_transit_areas/corridors-as-lines.ipynb b/high_quality_transit_areas/01_corridors-as-lines.ipynb
similarity index 100%
rename from high_quality_transit_areas/corridors-as-lines.ipynb
rename to high_quality_transit_areas/01_corridors-as-lines.ipynb
diff --git a/high_quality_transit_areas/hqta_green_SACOG.ipynb b/high_quality_transit_areas/02_hqta_green_SACOG.ipynb
similarity index 100%
rename from high_quality_transit_areas/hqta_green_SACOG.ipynb
rename to high_quality_transit_areas/02_hqta_green_SACOG.ipynb
diff --git a/high_quality_transit_areas/hqta-map.ipynb b/high_quality_transit_areas/03_hqta_map.ipynb
similarity index 100%
rename from high_quality_transit_areas/hqta-map.ipynb
rename to high_quality_transit_areas/03_hqta_map.ipynb
diff --git a/high_quality_transit_areas/explore-whittier.ipynb b/high_quality_transit_areas/04_explore_whittier.ipynb
similarity index 100%
rename from high_quality_transit_areas/explore-whittier.ipynb
rename to high_quality_transit_areas/04_explore_whittier.ipynb
diff --git a/high_quality_transit_areas/metro_brt.ipynb b/high_quality_transit_areas/05_metro_brt.ipynb
similarity index 100%
rename from high_quality_transit_areas/metro_brt.ipynb
rename to high_quality_transit_areas/05_metro_brt.ipynb
diff --git a/high_quality_transit_areas/muni_brt.ipynb b/high_quality_transit_areas/06_muni_brt.ipynb
similarity index 100%
rename from high_quality_transit_areas/muni_brt.ipynb
rename to high_quality_transit_areas/06_muni_brt.ipynb
diff --git a/high_quality_transit_areas/amtrak.ipynb b/high_quality_transit_areas/07_amtrak.ipynb
similarity index 100%
rename from high_quality_transit_areas/amtrak.ipynb
rename to high_quality_transit_areas/07_amtrak.ipynb
diff --git a/high_quality_transit_areas/_utils.py b/high_quality_transit_areas/_utils.py
new file mode 100644
index 000000000..fd42638bf
--- /dev/null
+++ b/high_quality_transit_areas/_utils.py
@@ -0,0 +1,43 @@
+"""
+Shared utility functions for HQTA
+"""
+import geopandas as gpd
+import intake
+import pandas as pd
+
+catalog = intake.open_catalog("catalog.yml")
+
+def add_hqta_details(row) -> str:
+    """
+    Add HQTA details of why nulls are present
+    based on feedback from open data users.
+    """
+    if row.hqta_type == "major_stop_bus":
+        if row.schedule_gtfs_dataset_key_primary != row.schedule_gtfs_dataset_key_secondary:
+            return "intersection_2_bus_routes_different_operators"
+        else:
+            return "intersection_2_bus_routes_same_operator"
+
+    elif row.hqta_type == "hq_corridor_bus":
+        if row.peak_trips >= 4:
+            return "corridor_frequent_stop"
+        else:
+            return "corridor_other_stop"
+
+    elif row.hqta_type in ["major_stop_ferry",
+                           "major_stop_brt", "major_stop_rail"]:
+        return row.hqta_type + "_single_operator"
+
+def primary_rename(df: pd.DataFrame) -> pd.DataFrame:
+    return df.rename(
+        columns = {"schedule_gtfs_dataset_key": "schedule_gtfs_dataset_key_primary"})
+
+def clip_to_ca(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+    """
+    Clip to CA boundaries.
+    """
+    ca = catalog.ca_boundary.read().to_crs(gdf.crs)
+
+    gdf2 = gdf.clip(ca, keep_geom_type = False).reset_index(drop=True)
+
+    return gdf2
\ No newline at end of file
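
Reviewer note on the new `_utils.py`: the three helpers are meant to be chained on an HQTA layer before export. Below is a minimal usage sketch; the parquet path is hypothetical, `clip_to_ca` assumes a readable catalog.yml with a ca_boundary entry in the working directory, and `add_hqta_details` implicitly returns None for any hqta_type outside the five values it handles.

import geopandas as gpd

from _utils import add_hqta_details, clip_to_ca, primary_rename

# Hypothetical input: any HQTA layer carrying hqta_type, peak_trips,
# and the schedule_gtfs_dataset_key columns the helpers expect.
gdf = gpd.read_parquet("hqta_points_sample.parquet")

# schedule_gtfs_dataset_key -> schedule_gtfs_dataset_key_primary
gdf = primary_rename(gdf)

# Row-wise label explaining why some fields are null downstream;
# rows with an unhandled hqta_type come back as None.
gdf = gdf.assign(hqta_details=gdf.apply(add_hqta_details, axis=1))

# Clip to the CA boundary read from catalog.yml (ca_boundary entry).
gdf = clip_to_ca(gdf)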