diff --git a/conveyal_update/conveyal_vars.py b/conveyal_update/conveyal_vars.py index 97a0517a4..2ebfc8280 100644 --- a/conveyal_update/conveyal_vars.py +++ b/conveyal_update/conveyal_vars.py @@ -1,7 +1,7 @@ import datetime as dt GCS_PATH = 'gs://calitp-analytics-data/data-analyses/conveyal_update/' -TARGET_DATE = dt.date(2023, 10, 18) # '2023-10-18' is most recent in Conveyal for the main 4 regions +TARGET_DATE = dt.date(2022, 9, 21) # 2022 date for SCAG request OSM_FILE = 'us-west-latest.osm.pbf' # http://download.geofabrik.de/north-america/us-west-latest.osm.pbf # first download with wget... diff --git a/conveyal_update/evaluate_feeds.py b/conveyal_update/evaluate_feeds.py index 7c0e1f1f2..d11d9e8e4 100644 --- a/conveyal_update/evaluate_feeds.py +++ b/conveyal_update/evaluate_feeds.py @@ -54,16 +54,22 @@ def attach_transit_services(feeds_on_target: pd.DataFrame): def report_undefined(feeds_on_target: pd.DataFrame): fname = 'no_apparent_service.csv' undefined = feeds_on_target.apply(check_defined_elsewhere, axis=1, args=[feeds_on_target]) >> filter(-_.service_any_feed) - print('these feeds have no service defined on target date, nor are their services captured in other feeds:') - print(undefined >> select(_.gtfs_dataset_name, _.service_any_feed)) - print(f'saving detailed csv to {fname}') - undefined.to_csv(fname) + if undefined.empty: + print('no undefined service feeds') + else: + print(undefined.columns) + print('these feeds have no service defined on target date, nor are their services captured in other feeds:') + # gtfs_dataset_name no longer present, this whole script should probably be updated/replaced + # print(undefined >> select(_.gtfs_dataset_name, _.service_any_feed)) + print(f'saving detailed csv to {fname}') + undefined.to_csv(fname) return if __name__ == '__main__': feeds_on_target = get_feeds_check_service() feeds_on_target = attach_transit_services(feeds_on_target) + print(f'feeds on target date shape: {feeds_on_target.shape}') report_undefined(feeds_on_target) feeds_on_target.to_parquet(f'{conveyal_vars.GCS_PATH}feeds_{TARGET_DATE.isoformat()}.parquet') \ No newline at end of file diff --git a/conveyal_update/scag_2022_export.ipynb b/conveyal_update/scag_2022_export.ipynb new file mode 100644 index 000000000..0f4194e40 --- /dev/null +++ b/conveyal_update/scag_2022_export.ipynb @@ -0,0 +1,744 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "2f0079e6-ac0e-4528-9970-2f3f4c453caa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from siuba import *\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import datetime as dt\n", + "\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "b52cd909-556a-4e69-8abd-b54ae1f2181d", + "metadata": {}, + "source": [ + "# 2022 SCAG GTFS Export\n", + "\n", + "SCAG is interested in 2022 GTFS data so they can recalculate their RTP/SCS hq corridor/major stop baseline per AB2553.\n", + "\n", + "Download original feed for any feed with a stop in SCAG region.\n", + "Use `gtfs_funnel`, could likely redo conveyal update scripts this way too." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "993ef52d-a1a8-49f0-a7a0-00e18748d569", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"CALITP_BQ_MAX_BYTES\"] = str(800_000_000_000)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e9e1ec68-7ee0-441b-93a2-670e324d5b00", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b9be13fb-c18e-40d0-b228-de1ae4afc1d2", + "metadata": {}, + "outputs": [], + "source": [ + "from shared_utils import gtfs_utils_v2, catalog_utils, rt_dates" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "bd68a19f-5a37-4b83-94c2-a5c0211d80ee", + "metadata": {}, + "outputs": [], + "source": [ + "from calitp_data_analysis.geography_utils import CA_NAD83Albers, WGS84" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1badd168-ff8f-4c5f-8697-a5ba9ce59f70", + "metadata": {}, + "outputs": [], + "source": [ + "scag = gpd.read_file('scag_region.geojson')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "21a05873-55a8-4571-aa50-cc8bb9ea3ba3", + "metadata": {}, + "outputs": [], + "source": [ + "cat = catalog_utils.get_catalog('gtfs_analytics_data')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4c2ff82f-bc8d-4812-a263-bda2439cee50", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'stops'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat.schedule_downloads.stops" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d4eb920c-7791-4e8a-93cb-91c359f82f29", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat.gcs_paths.COMPILED_CACHED_VIEWS" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3a74097b-8f2f-4b73-b125-5ceca5409705", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_date = rt_dates.DATES['sep2022a']" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9ae8fd74-ccc6-43dc-a4bf-6fe7f54c5952", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2022-09-21'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analysis_date" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ce880745-df7b-444c-8dac-43d1cce88abc", + "metadata": {}, + "outputs": [], + "source": [ + "stops = gpd.read_parquet(f'{cat.gcs_paths.COMPILED_CACHED_VIEWS}{cat.schedule_downloads.stops}_{analysis_date}.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "0031fbbc-b0d4-4d79-adb2-d0639d72f072", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['feed_key', 'service_date', 'feed_timezone',\n", + " 'first_stop_arrival_datetime_pacific',\n", + " 'last_stop_departure_datetime_pacific', 'stop_id', 'stop_key',\n", + " 'stop_name', 'stop_event_count', 'route_type_0', 'route_type_1',\n", + " 'route_type_2', 'route_type_3', 'route_type_4', 'route_type_5',\n", + " 'route_type_6', 'route_type_7', 'route_type_11', 'route_type_12',\n", + " 'missing_route_type', 'geometry'],\n", + " dtype='object')" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stops.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "4576d627-20ae-4393-a06f-eb457241a41c", + "metadata": {}, + "outputs": [], + "source": [ + "stops = stops >> select(_.feed_key, _.geometry)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "ef6eaa7c-97cb-4da9-8df3-c9b94e006d4a", + "metadata": {}, + "outputs": [], + "source": [ + "feeds = (gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=analysis_date)\n", + " >> select(_.feed_key, _.gtfs_dataset_name == _.name, _.base64_url))" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "6e722819-6d73-45ee-9518-0f5c37716ed0", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = stops >> inner_join(_, feeds, on='feed_key')" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "19f9a4c5-5ba7-41bc-be8c-83d87cafe378", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = gdf.to_crs(CA_NAD83Albers)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "f8f5b976-f56c-4425-b658-8b48fbf465b3", + "metadata": {}, + "outputs": [], + "source": [ + "scag = scag.to_crs(CA_NAD83Albers)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "d821daef-1224-4886-8977-bb587188ef5c", + "metadata": {}, + "outputs": [], + "source": [ + "scag_stops = gdf.clip(scag.dissolve())" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "71348fde-8230-4870-af61-77c6126d0aa4", + "metadata": {}, + "outputs": [], + "source": [ + "scag_feeds = scag_stops >> distinct(_.feed_key, _.base64_url, _.gtfs_dataset_name)\n", + "scag_feeds['date'] = analysis_date" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "9dc410fd-cd5b-4f66-b35b-cf5f9a39182a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | feed_key | \n", + "base64_url | \n", + "gtfs_dataset_name | \n", + "date | \n", + "
---|---|---|---|---|
0 | \n", + "360899e1281d494ad773604cd324a8c4 | \n", + "aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz | \n", + "VCTC GMV Schedule | \n", + "2022-09-21 | \n", + "
1 | \n", + "4cc74cc4d637c03ba2d87df7080a57d1 | \n", + "aHR0cDovL3d3dy5nb2xkY29hc3R0cmFuc2l0Lm9yZy9pbW... | \n", + "Gold Coast Schedule | \n", + "2022-09-21 | \n", + "
2 | \n", + "cf5091853923f5eee684e4b8f1763b3b | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "VCTC Schedule | \n", + "2022-09-21 | \n", + "
3 | \n", + "756ac24f2446226cf33c5ce02f73c028 | \n", + "aHR0cHM6Ly93d3cueWNpcHRhLm9yZy9ndGZzL2dvb2dsZV... | \n", + "Yuma Schedule | \n", + "2022-09-21 | \n", + "
4 | \n", + "12fc0bdeffdf836940c5755c9611103e | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "Desert Roadrunner Schedule | \n", + "2022-09-21 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
61 | \n", + "c20c953e5cec597e7e75ef05c2f8746f | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "Moorpark Schedule | \n", + "2022-09-21 | \n", + "
62 | \n", + "66831926626d19ead711aef2a6be877f | \n", + "aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW... | \n", + "Sierra Madre Schedule | \n", + "2022-09-21 | \n", + "
63 | \n", + "3914ec5719b2697f81b4318b38e91694 | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "Kern Schedule | \n", + "2022-09-21 | \n", + "
64 | \n", + "0a165d0fe19fefcb424f577091cf52d0 | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "Eastern Sierra Schedule | \n", + "2022-09-21 | \n", + "
65 | \n", + "7a8e48c9cf05f58718635944100028d4 | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "Needles Schedule | \n", + "2022-09-21 | \n", + "
66 rows × 4 columns
\n", + "\n", + " | feed_key | \n", + "base64_url | \n", + "gtfs_dataset_name | \n", + "date | \n", + "
---|---|---|---|---|
0 | \n", + "360899e1281d494ad773604cd324a8c4 | \n", + "aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz | \n", + "vctc_gmv_schedule | \n", + "2022-09-21 | \n", + "
1 | \n", + "4cc74cc4d637c03ba2d87df7080a57d1 | \n", + "aHR0cDovL3d3dy5nb2xkY29hc3R0cmFuc2l0Lm9yZy9pbW... | \n", + "gold_coast_schedule | \n", + "2022-09-21 | \n", + "
2 | \n", + "cf5091853923f5eee684e4b8f1763b3b | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "vctc_schedule | \n", + "2022-09-21 | \n", + "
3 | \n", + "756ac24f2446226cf33c5ce02f73c028 | \n", + "aHR0cHM6Ly93d3cueWNpcHRhLm9yZy9ndGZzL2dvb2dsZV... | \n", + "yuma_schedule | \n", + "2022-09-21 | \n", + "
4 | \n", + "12fc0bdeffdf836940c5755c9611103e | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "desert_roadrunner_schedule | \n", + "2022-09-21 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
61 | \n", + "c20c953e5cec597e7e75ef05c2f8746f | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "moorpark_schedule | \n", + "2022-09-21 | \n", + "
62 | \n", + "66831926626d19ead711aef2a6be877f | \n", + "aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW... | \n", + "sierra_madre_schedule | \n", + "2022-09-21 | \n", + "
63 | \n", + "3914ec5719b2697f81b4318b38e91694 | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "kern_schedule | \n", + "2022-09-21 | \n", + "
64 | \n", + "0a165d0fe19fefcb424f577091cf52d0 | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "eastern_sierra_schedule | \n", + "2022-09-21 | \n", + "
65 | \n", + "7a8e48c9cf05f58718635944100028d4 | \n", + "aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... | \n", + "needles_schedule | \n", + "2022-09-21 | \n", + "
66 rows × 4 columns
\n", + "