Skip to content

Commit

Permalink
Merge pull request #1228 from cal-itp/sep-open-data
Browse files Browse the repository at this point in the history
Sep open data
  • Loading branch information
tiffanychu90 authored Sep 25, 2024
2 parents b1061b7 + e382995 commit 9e6b007
Show file tree
Hide file tree
Showing 45 changed files with 1,235 additions and 1,404 deletions.
1 change: 1 addition & 0 deletions _shared_utils/shared_utils/rt_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
"jun2024": "2024-06-12",
"jul2024": "2024-07-17",
"aug2024": "2024-08-14",
"sep2024": "2024-09-18",
}

y2023_dates = [
Expand Down
3 changes: 2 additions & 1 deletion gtfs_funnel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ route_typologies_data:
# Clean route names for displaying across time
timeseries_preprocessing:
python clean_route_naming.py

python track_publish_dates.py

# monthly scheduled service, download after the end of each month
monthly_scheduled_data:
python download_monthly_service.py
Expand Down
17 changes: 17 additions & 0 deletions gtfs_funnel/logs/download_data.log
Original file line number Diff line number Diff line change
Expand Up @@ -516,3 +516,20 @@
2024-08-15 09:09:27.480 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
2024-08-15 09:11:56.577 | INFO | __main__:download_one_day:56 - execution time: 0:02:30.991910
2024-08-15 10:30:38.864 | INFO | __main__:download_one_year:35 - execution time: 0:00:25.978363
2024-09-19 08:13:46.511 | INFO | __main__:download_one_day:45 - Analysis date: 2024-09-18
2024-09-19 08:13:49.222 | INFO | __main__:download_one_day:52 - # operators to run: 221
2024-09-19 08:13:49.223 | INFO | __main__:download_one_day:56 - *********** Download trips data ***********
2024-09-19 08:14:16.573 | INFO | __main__:download_one_day:86 - execution time: 0:00:30.061230
2024-09-19 08:14:35.388 | INFO | __main__:download_one_day:22 - Analysis date: 2024-09-18
2024-09-19 08:14:37.294 | INFO | __main__:download_one_day:29 - # operators to run: 221
2024-09-19 08:14:37.294 | INFO | __main__:download_one_day:33 - *********** Download stops data ***********
2024-09-19 08:14:47.392 | INFO | __main__:download_one_day:64 - execution time: 0:00:12.003376
2024-09-19 08:15:03.834 | INFO | __main__:download_one_day:22 - Analysis date: 2024-09-18
2024-09-19 08:15:05.784 | INFO | __main__:download_one_day:29 - # operators to run: 221
2024-09-19 08:15:05.785 | INFO | __main__:download_one_day:33 - *********** Download routelines data ***********
2024-09-19 08:16:57.558 | INFO | __main__:download_one_day:63 - execution time: 0:01:53.723521
2024-09-19 08:17:14.221 | INFO | __main__:download_one_day:21 - Analysis date: 2024-09-18
2024-09-19 08:17:15.854 | INFO | __main__:download_one_day:29 - # operators to run: 190
2024-09-19 08:17:15.855 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
2024-09-19 08:19:06.258 | INFO | __main__:download_one_day:56 - execution time: 0:01:52.036660
2024-09-19 09:28:35.882 | INFO | __main__:download_one_year:35 - execution time: 0:00:45.388883
11 changes: 11 additions & 0 deletions gtfs_funnel/logs/download_vp_v2.log
Original file line number Diff line number Diff line change
Expand Up @@ -339,3 +339,14 @@
2024-08-15 09:29:03.589 | INFO | __main__:<module>:112 - export concatenated vp: 0:04:16.418987
2024-08-15 09:34:04.743 | INFO | __main__:<module>:134 - remove batched parquets
2024-08-15 09:34:04.745 | INFO | __main__:<module>:137 - execution time: 0:09:26.469734
2024-09-19 08:19:35.573 | INFO | __main__:<module>:148 - Analysis date: 2024-09-18
2024-09-19 08:21:52.859 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:02:17.254015
2024-09-19 08:23:01.583 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:01:08.722700
2024-09-19 08:26:57.364 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:03:55.780573
2024-09-19 08:28:55.328 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:01:57.952237
2024-09-19 08:28:55.328 | INFO | __main__:<module>:155 - execution time: 0:09:19.722825
2024-09-19 08:29:19.967 | INFO | __main__:<module>:97 - Analysis date: 2024-09-18
2024-09-19 08:29:38.182 | INFO | __main__:<module>:105 - concat and filter batched data: 0:00:18.208902
2024-09-19 08:33:43.251 | INFO | __main__:<module>:112 - export concatenated vp: 0:04:05.069147
2024-09-19 08:37:30.865 | INFO | __main__:<module>:134 - remove batched parquets
2024-09-19 08:37:30.865 | INFO | __main__:<module>:137 - execution time: 0:08:10.892310
11 changes: 11 additions & 0 deletions gtfs_funnel/logs/vp_preprocessing.log
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,14 @@
2024-08-15 10:05:01.848 | INFO | __main__:<module>:235 - vp with dwell time 2024-08-14: 0:07:09.680694
2024-08-15 10:13:16.657 | INFO | __main__:<module>:120 - 2024-08-14: condense vp for trip 0:07:51.642337
2024-08-15 10:24:50.802 | INFO | __main__:<module>:128 - 2024-08-14: prepare vp to use in nearest neighbor: 0:11:34.144491
2024-09-19 08:46:17.298 | INFO | __main__:<module>:169 - 2024-09-18: pare down vp: 0:02:12.746302
2024-09-19 08:51:10.542 | INFO | __main__:attach_prior_vp_add_direction:90 - persist vp gddf: 0:04:35.313281
2024-09-19 08:55:04.346 | INFO | __main__:attach_prior_vp_add_direction:122 - np vectorize arrays for direction: 0:03:53.804190
2024-09-19 08:55:11.908 | INFO | __main__:<module>:194 - 2024-09-18: export vp direction: 0:08:36.678934
2024-09-19 08:56:33.980 | INFO | __main__:<module>:200 - 2024-09-18: export usable vp with direction: 0:01:22.071985
2024-09-19 08:56:33.981 | INFO | __main__:<module>:203 - 2024-09-18: vp_direction script execution time: 0:09:58.750919
2024-09-19 09:01:58.870 | INFO | __main__:<module>:212 - compute dwell df: 0:04:44.983561
2024-09-19 09:03:13.198 | INFO | __main__:<module>:234 - merge with original and export: 0:01:14.327719
2024-09-19 09:03:13.200 | INFO | __main__:<module>:235 - vp with dwell time 2024-09-18: 0:05:59.311280
2024-09-19 09:08:43.742 | INFO | __main__:<module>:120 - 2024-09-18: condense vp for trip 0:05:09.575132
2024-09-19 09:20:16.936 | INFO | __main__:<module>:128 - 2024-09-18: prepare vp to use in nearest neighbor: 0:11:33.194871
235 changes: 235 additions & 0 deletions gtfs_funnel/published_operators.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
2024-09-18:
- Alhambra Schedule
- Amador Schedule
- Anaheim Resort Schedule
- Anaheim Resort Schedule v2
- Antelope Valley Transit Authority Schedule
- Arcadia Schedule
- Arvin Schedule
- Auburn Schedule
- B-Line Schedule
- Baldwin Park Schedule
- Banning Pass Schedule
- Bay Area 511 AC Transit Schedule
- Bay Area 511 ACE Schedule
- Bay Area 511 Angel Island-Tiburon Ferry Schedule
- Bay Area 511 BART Schedule
- Bay Area 511 Caltrain Schedule
- Bay Area 511 Capitol Corridor Schedule
- Bay Area 511 Commute.org Schedule
- Bay Area 511 County Connection Schedule
- Bay Area 511 Dumbarton Express Schedule
- Bay Area 511 Emery Go-Round Schedule
- Bay Area 511 Fairfield and Suisun Transit Schedule
- Bay Area 511 Golden Gate Ferry Schedule
- Bay Area 511 Golden Gate Transit Schedule
- Bay Area 511 MVGO Schedule
- Bay Area 511 Marin Schedule
- Bay Area 511 Mission Bay Schedule
- Bay Area 511 Muni Schedule
- Bay Area 511 Petaluma Schedule
- Bay Area 511 Rio Vista Delta Breeze Schedule
- Bay Area 511 SFO AirTrain Schedule
- Bay Area 511 SamTrans Schedule
- Bay Area 511 San Francisco Bay Ferry Schedule
- Bay Area 511 Santa Clara Transit Schedule
- Bay Area 511 Santa Rosa CityBus Schedule
- Bay Area 511 SolTrans Schedule
- Bay Area 511 Sonoma County Transit Schedule
- Bay Area 511 Sonoma-Marin Area Rail Transit Schedule
- Bay Area 511 South San Francisco Shuttle Schedule
- Bay Area 511 Treasure Island Ferry Schedule
- Bay Area 511 Tri Delta Schedule
- Bay Area 511 Tri-Valley Wheels Schedule
- Bay Area 511 Union City Transit Schedule
- Bay Area 511 Vacaville City Coach Schedule
- Bay Area 511 Vine Transit Schedule
- Bay Area 511 WestCAT Schedule
- Beach Cities GMV Schedule
- Bear Schedule
- Beaumont Pass Schedule
- Bell Gardens Schedule
- Bellflower Bus Schedule
- Big Blue Bus Schedule
- Big Blue Bus Swiftly Schedule
- BruinBus Schedule
- Burbank Schedule
- Calabasas Schedule
- Calaveras Schedule
- Cerritos on Wheels Schedule
- Cerritos on Wheels Website Schedule
- Clean Air Express Schedule
- Clovis Schedule
- Commerce Schedule
- Corona Schedule
- County Express Schedule
- Cudahy Schedule
- Culver City Schedule
- Curry Public Transit Schedule
- Dana Point Trolley Schedule
- Delano Schedule
- Desert Roadrunner GMV Schedule
- Desert Roadrunner Schedule
- DowneyLINK GMV Schedule
- Eastern Sierra Schedule
- El Dorado Schedule
- El Monte Schedule
- Elk Grove Schedule
- Flixbus Schedule
- Foothill Schedule
- Fresno County Schedule
- Fresno Schedule
- G Trans Schedule
- GET Schedule
- Get Around Town Express Schedule
- Glendale Schedule
- Glendora Schedule
- Glenn Schedule
- Go West Schedule
- Grapeline Schedule
- Guadalupe Flyer Schedule
- Havasu Landing Ferry Schedule
- Humboldt Schedule
- Huntington Schedule
- Imperial Valley Transit Schedule
- Inglewood Schedule
- Irvine CONNECT Schedule
- Kern Schedule
- Kings Schedule
- LA DOT Schedule
- LA Metro Bus Schedule
- LA Metro Rail Schedule
- LADPW Schedule
- LAX FlyAway Schedule
- LAX Flyaway Bus Schedule
- LAX Shuttles Schedule
- La Campana Schedule
- La Puente Schedule
- Laguna Beach Schedule
- Lake Schedule
- Lassen Schedule
- Lawndale Beat GMV Schedule
- Lawndale Schedule
- Lompoc Schedule
- Long Beach Schedule
- Lynwood Schedule IPS
- MV Shuttle Schedule
- Madera County Connection Schedule
- Madera Metro Schedule
- Mariposa Grove Shuttle Schedule
- Maywood Schedule
- Mendocino Schedule
- Merced GMV Schedule
- Merced Schedule
- Metrolink Schedule
- Montebello Schedule
- Monterey Salinas Schedule
- Morongo Basin Schedule
- Morro Bay Cal-ITP Schedule
- Mountain Transit GMV Schedule
- Mountain Transit Schedule
- Needles Schedule
- Nevada County Schedule
- North County Schedule
- Norwalk Avail Schedule
- OCTA Schedule
- OmniTrans Schedule
- Oregon POINT
- Palos Verdes PTA Schedule
- Pasadena Schedule
- Placer Schedule
- Plumas Schedule
- PresidiGo Schedule
- Redding Schedule
- Redwood Coast Schedule
- Riverside Schedule
- Rosemead Passio Schedule
- Roseville Schedule
- Roseville Transit GMV Schedule
- SBMTD Schedule
- SLO Schedule
- SLORTA Schedule
- Sage Stage Schedule
- San Clemente Trolley Schedule
- San Diego Schedule
- San Fernando Schedule
- San Joaquin Schedule
- San Juan Capistrano Trolley Schedule
- Santa Clarita Schedule
- Santa Maria Schedule
- Santa Ynez Mecatran Schedule
- Sierra Madre Schedule
- Siskiyou Schedule
- South County Transit Link Schedule
- South San Francisco Schedule
- Spirit Bus Passio Schedule
- StanRTA Schedule
- Stanford Schedule
- SunLine Avail Schedule
- 'TART, North Lake Tahoe Schedule'
- TCRTA TripShot Schedule
- Tahoe Transportation District GMV Schedule
- Tahoe Transportation District Schedule
- Tehama Schedule
- Torrance Schedule
- Tracy Schedule
- Trinity Schedule
- Tuolumne Remix Schedule
- Turlock Schedule
- UCSC Schedule
- Unitrans Schedule
- VCTC GMV Schedule
- Victor Valley GMV Schedule
- Victor Valley Schedule
- Visalia Schedule
- WeHo Schedule
- YARTS Schedule
- Yolobus Schedule
- Yosemite Valley Shuttle Schedule
- Yuba-Sutter Schedule
- Yuma Schedule
- eTrans Schedule
2024-08-14:
- Santa Cruz Schedule
2024-06-12:
- Anteater Express Schedule
- Lassen Flex
- Lynwood Schedule
- Manteca Schedule
2024-05-22:
- El Segundo Schedule
- Redwood Coast Schedulel
2024-04-17:
- Sacramento Schedule
2024-03-13:
- Avalon Schedule
2024-02-14:
- Rosemead Schedule
2023-12-13:
- DowneyLINK Schedule
- Humboldt Flex
- Laguna Beach Flex
- Manteca Flex
- Placer Flex
- San Joaquin Flex
- Spirit Bus Schedule
- StanRTA Flex
- TART Flex
- Thousand Oaks Flex
- Tracy Flex
- Turlock Flex
- Union City Flex
- VCTC Flex
- WestCAT Flex
2023-11-15:
- Amtrak Schedule
- Mission Bay Schedule
2023-08-15:
- Blossom Express Schedule
- Eastern Sierra Flex
2023-06-14:
- Tuolumne Schedule
2023-04-12:
- Guadalupe Flex
2023-03-15:
- TIME GMV Schedule
85 changes: 85 additions & 0 deletions gtfs_funnel/track_publish_dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
Grab all the operators by service date from
saved scheduled_trips tables from GCS.
Create a yaml that tells us the most recent
date available for each operator (schedule_gtfs_dataset_name).
"""
import pandas as pd
import pyaml # use pyaml because it gets us prettier indents than yaml

from pathlib import Path
from typing import Union

from shared_utils import rt_dates
from segment_speed_utils import time_series_utils

def filter_to_recent_date(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce df to one row per operator `name`, holding the latest
    `service_date` found for that name.

    The result is ordered with the most recent service_date first
    (names alphabetical within a date), has a fresh 0..n-1 index,
    and stores service_date as strings.
    """
    # Latest service_date for every name, back as a 2-column dataframe.
    latest_by_name = df.groupby("name")["service_date"].max().reset_index()

    # Newest dates on top; ties broken alphabetically by name.
    ordered = latest_by_name.sort_values(
        by=["service_date", "name"],
        ascending=[False, True],
    )
    ordered = ordered.reset_index(drop=True)

    return ordered.astype({"service_date": "str"})

def export_results_yml(
    df: pd.DataFrame,
    export_yaml: Union[str, Path]
) -> None:
    """
    Save out our results from df.
    Convert df into a dictionary and save out dictionary results as yaml.

    The dictionary is keyed by service_date, in the order the dates
    first appear in df, and each value is the list of operator names
    for that date, in df row order.

    Args:
        df: dataframe with `name` and `service_date` columns.
        export_yaml: path of the yaml file to write (overwritten if present).
    """
    # TODO: check this list manually and there will be some
    # operator names that have more recent names that we are keeping,
    # so we can remove these from our yaml
    # NOTE(review): isin() is an exact match, and the published yaml still
    # listed "TIME GMV Schedule", so the bare "TIME GMV" entry was not
    # excluding anything. The full name is added here; the original entry
    # is kept in case that spelling also occurs -- confirm and prune.
    exclude_me = [
        "TIME GMV",
        "TIME GMV Schedule",
    ]

    df2 = df[~df["name"].isin(exclude_me)]

    # Dates come from the filtered frame, so a date whose operators are
    # all excluded does not show up as an empty key.
    my_dict = {
        date_key: df2.loc[df2["service_date"] == date_key, "name"].tolist()
        for date_key in df2["service_date"].unique()
    }

    # sort_keys=False to prevent alphabetical sort (earliest date first)
    # because we want to maintain our results and yaml with most recent
    # date first
    output = pyaml.dump(my_dict, sort_keys=False)

    # Explicit encoding so the file does not depend on the platform default.
    with open(export_yaml, "w", encoding="utf-8") as f:
        f.write(output)

    print(f"{export_yaml} exported")

    return


if __name__ == "__main__":

    from update_vars import (
        COMPILED_CACHED_VIEWS,
        GTFS_DATA_DICT,
        PUBLISHED_OPERATORS_YAML,
    )

    TABLE = GTFS_DATA_DICT.schedule_downloads.trips

    # Every analysis date we track, 2024 dates listed before 2023 dates.
    all_dates = rt_dates.y2024_dates + rt_dates.y2023_dates

    # Only `name` is requested; filter_to_recent_date also needs a
    # `service_date` column -- presumably the helper adds it per date,
    # TODO confirm against time_series_utils.
    operator_dates = time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        TABLE,
        all_dates,
        data_type="df",
        get_pandas=True,
        columns=["name"],
    ).drop_duplicates()

    # Keep each operator's most recent date, then write the yaml.
    latest_per_operator = filter_to_recent_date(operator_dates)

    export_results_yml(latest_per_operator, PUBLISHED_OPERATORS_YAML)
Loading

0 comments on commit 9e6b007

Please sign in to comment.