From 3c441758702943293ed4ece2a4f5f8ff7d4fc443 Mon Sep 17 00:00:00 2001 From: Eric Dasmalchi Date: Tue, 29 Oct 2024 21:21:02 +0000 Subject: [PATCH 1/2] wip --- conveyal_update/conveyal_vars.py | 2 +- conveyal_update/evaluate_feeds.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/conveyal_update/conveyal_vars.py b/conveyal_update/conveyal_vars.py index 97a0517a4..2ebfc8280 100644 --- a/conveyal_update/conveyal_vars.py +++ b/conveyal_update/conveyal_vars.py @@ -1,7 +1,7 @@ import datetime as dt GCS_PATH = 'gs://calitp-analytics-data/data-analyses/conveyal_update/' -TARGET_DATE = dt.date(2023, 10, 18) # '2023-10-18' is most recent in Conveyal for the main 4 regions +TARGET_DATE = dt.date(2022, 9, 21) # 2022 date for SCAG request OSM_FILE = 'us-west-latest.osm.pbf' # http://download.geofabrik.de/north-america/us-west-latest.osm.pbf # first download with wget... diff --git a/conveyal_update/evaluate_feeds.py b/conveyal_update/evaluate_feeds.py index 7c0e1f1f2..36f388a67 100644 --- a/conveyal_update/evaluate_feeds.py +++ b/conveyal_update/evaluate_feeds.py @@ -54,16 +54,21 @@ def attach_transit_services(feeds_on_target: pd.DataFrame): def report_undefined(feeds_on_target: pd.DataFrame): fname = 'no_apparent_service.csv' undefined = feeds_on_target.apply(check_defined_elsewhere, axis=1, args=[feeds_on_target]) >> filter(-_.service_any_feed) - print('these feeds have no service defined on target date, nor are their services captured in other feeds:') - print(undefined >> select(_.gtfs_dataset_name, _.service_any_feed)) - print(f'saving detailed csv to {fname}') - undefined.to_csv(fname) + if undefined.empty: + print('no undefined service feeds') + else: + print(undefined.columns) + print('these feeds have no service defined on target date, nor are their services captured in other feeds:') + print(undefined >> select(_.gtfs_dataset_name, _.service_any_feed)) + print(f'saving detailed csv to {fname}') + undefined.to_csv(fname) return if __name__ == '__main__': feeds_on_target = get_feeds_check_service() feeds_on_target = attach_transit_services(feeds_on_target) + print(f'feeds on target date shape: {feeds_on_target.shape}') report_undefined(feeds_on_target) feeds_on_target.to_parquet(f'{conveyal_vars.GCS_PATH}feeds_{TARGET_DATE.isoformat()}.parquet') \ No newline at end of file From ca9471dfccbdf9c35da9a928cbd11674b7ef55cd Mon Sep 17 00:00:00 2001 From: Eric Dasmalchi Date: Tue, 29 Oct 2024 22:55:05 +0000 Subject: [PATCH 2/2] add comment, commit scag 2022 export nb --- conveyal_update/evaluate_feeds.py | 3 +- conveyal_update/scag_2022_export.ipynb | 744 +++++++++++++++++++++++++ 2 files changed, 746 insertions(+), 1 deletion(-) create mode 100644 conveyal_update/scag_2022_export.ipynb diff --git a/conveyal_update/evaluate_feeds.py b/conveyal_update/evaluate_feeds.py index 36f388a67..d11d9e8e4 100644 --- a/conveyal_update/evaluate_feeds.py +++ b/conveyal_update/evaluate_feeds.py @@ -59,7 +59,8 @@ def report_undefined(feeds_on_target: pd.DataFrame): else: print(undefined.columns) print('these feeds have no service defined on target date, nor are their services captured in other feeds:') - print(undefined >> select(_.gtfs_dataset_name, _.service_any_feed)) + # gtfs_dataset_name no longer present, this whole script should probably be updated/replaced + # print(undefined >> select(_.gtfs_dataset_name, _.service_any_feed)) print(f'saving detailed csv to {fname}') undefined.to_csv(fname) return diff --git a/conveyal_update/scag_2022_export.ipynb b/conveyal_update/scag_2022_export.ipynb new file mode 100644 index 000000000..0f4194e40 --- /dev/null +++ b/conveyal_update/scag_2022_export.ipynb @@ -0,0 +1,744 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "2f0079e6-ac0e-4528-9970-2f3f4c453caa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from siuba import *\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import datetime as dt\n", + "\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "b52cd909-556a-4e69-8abd-b54ae1f2181d", + "metadata": {}, + "source": [ + "# 2022 SCAG GTFS Export\n", + "\n", + "SCAG is interested in 2022 GTFS data so they can recalculate their RTP/SCS hq corridor/major stop baseline per AB2553.\n", + "\n", + "Download original feed for any feed with a stop in SCAG region.\n", + "Use `gtfs_funnel`, could likely redo conveyal update scripts this way too." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "993ef52d-a1a8-49f0-a7a0-00e18748d569", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"CALITP_BQ_MAX_BYTES\"] = str(800_000_000_000)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e9e1ec68-7ee0-441b-93a2-670e324d5b00", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b9be13fb-c18e-40d0-b228-de1ae4afc1d2", + "metadata": {}, + "outputs": [], + "source": [ + "from shared_utils import gtfs_utils_v2, catalog_utils, rt_dates" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "bd68a19f-5a37-4b83-94c2-a5c0211d80ee", + "metadata": {}, + "outputs": [], + "source": [ + "from calitp_data_analysis.geography_utils import CA_NAD83Albers, WGS84" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1badd168-ff8f-4c5f-8697-a5ba9ce59f70", + "metadata": {}, + "outputs": [], + "source": [ + "scag = gpd.read_file('scag_region.geojson')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "21a05873-55a8-4571-aa50-cc8bb9ea3ba3", + "metadata": {}, + "outputs": [], + "source": [ + "cat = catalog_utils.get_catalog('gtfs_analytics_data')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4c2ff82f-bc8d-4812-a263-bda2439cee50", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'stops'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat.schedule_downloads.stops" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d4eb920c-7791-4e8a-93cb-91c359f82f29", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat.gcs_paths.COMPILED_CACHED_VIEWS" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3a74097b-8f2f-4b73-b125-5ceca5409705", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_date = rt_dates.DATES['sep2022a']" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9ae8fd74-ccc6-43dc-a4bf-6fe7f54c5952", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2022-09-21'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analysis_date" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ce880745-df7b-444c-8dac-43d1cce88abc", + "metadata": {}, + "outputs": [], + "source": [ + "stops = gpd.read_parquet(f'{cat.gcs_paths.COMPILED_CACHED_VIEWS}{cat.schedule_downloads.stops}_{analysis_date}.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "0031fbbc-b0d4-4d79-adb2-d0639d72f072", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['feed_key', 'service_date', 'feed_timezone',\n", + " 'first_stop_arrival_datetime_pacific',\n", + " 'last_stop_departure_datetime_pacific', 'stop_id', 'stop_key',\n", + " 'stop_name', 'stop_event_count', 'route_type_0', 'route_type_1',\n", + " 'route_type_2', 'route_type_3', 'route_type_4', 'route_type_5',\n", + " 'route_type_6', 'route_type_7', 'route_type_11', 'route_type_12',\n", + " 'missing_route_type', 'geometry'],\n", + " dtype='object')" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stops.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "4576d627-20ae-4393-a06f-eb457241a41c", + "metadata": {}, + "outputs": [], + "source": [ + "stops = stops >> select(_.feed_key, _.geometry)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "ef6eaa7c-97cb-4da9-8df3-c9b94e006d4a", + "metadata": {}, + "outputs": [], + "source": [ + "feeds = (gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=analysis_date)\n", + " >> select(_.feed_key, _.gtfs_dataset_name == _.name, _.base64_url))" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "6e722819-6d73-45ee-9518-0f5c37716ed0", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = stops >> inner_join(_, feeds, on='feed_key')" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "19f9a4c5-5ba7-41bc-be8c-83d87cafe378", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = gdf.to_crs(CA_NAD83Albers)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "f8f5b976-f56c-4425-b658-8b48fbf465b3", + "metadata": {}, + "outputs": [], + "source": [ + "scag = scag.to_crs(CA_NAD83Albers)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "d821daef-1224-4886-8977-bb587188ef5c", + "metadata": {}, + "outputs": [], + "source": [ + "scag_stops = gdf.clip(scag.dissolve())" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "71348fde-8230-4870-af61-77c6126d0aa4", + "metadata": {}, + "outputs": [], + "source": [ + "scag_feeds = scag_stops >> distinct(_.feed_key, _.base64_url, _.gtfs_dataset_name)\n", + "scag_feeds['date'] = analysis_date" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "9dc410fd-cd5b-4f66-b35b-cf5f9a39182a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feed_keybase64_urlgtfs_dataset_namedate
0360899e1281d494ad773604cd324a8c4aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZzVCTC GMV Schedule2022-09-21
14cc74cc4d637c03ba2d87df7080a57d1aHR0cDovL3d3dy5nb2xkY29hc3R0cmFuc2l0Lm9yZy9pbW...Gold Coast Schedule2022-09-21
2cf5091853923f5eee684e4b8f1763b3baHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...VCTC Schedule2022-09-21
3756ac24f2446226cf33c5ce02f73c028aHR0cHM6Ly93d3cueWNpcHRhLm9yZy9ndGZzL2dvb2dsZV...Yuma Schedule2022-09-21
412fc0bdeffdf836940c5755c9611103eaHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...Desert Roadrunner Schedule2022-09-21
...............
61c20c953e5cec597e7e75ef05c2f8746faHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...Moorpark Schedule2022-09-21
6266831926626d19ead711aef2a6be877faHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...Sierra Madre Schedule2022-09-21
633914ec5719b2697f81b4318b38e91694aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...Kern Schedule2022-09-21
640a165d0fe19fefcb424f577091cf52d0aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...Eastern Sierra Schedule2022-09-21
657a8e48c9cf05f58718635944100028d4aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...Needles Schedule2022-09-21
\n", + "

66 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " feed_key \\\n", + "0 360899e1281d494ad773604cd324a8c4 \n", + "1 4cc74cc4d637c03ba2d87df7080a57d1 \n", + "2 cf5091853923f5eee684e4b8f1763b3b \n", + "3 756ac24f2446226cf33c5ce02f73c028 \n", + "4 12fc0bdeffdf836940c5755c9611103e \n", + ".. ... \n", + "61 c20c953e5cec597e7e75ef05c2f8746f \n", + "62 66831926626d19ead711aef2a6be877f \n", + "63 3914ec5719b2697f81b4318b38e91694 \n", + "64 0a165d0fe19fefcb424f577091cf52d0 \n", + "65 7a8e48c9cf05f58718635944100028d4 \n", + "\n", + " base64_url \\\n", + "0 aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz \n", + "1 aHR0cDovL3d3dy5nb2xkY29hc3R0cmFuc2l0Lm9yZy9pbW... \n", + "2 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "3 aHR0cHM6Ly93d3cueWNpcHRhLm9yZy9ndGZzL2dvb2dsZV... \n", + "4 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + ".. ... \n", + "61 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "62 aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW... \n", + "63 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "64 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "65 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "\n", + " gtfs_dataset_name date \n", + "0 VCTC GMV Schedule 2022-09-21 \n", + "1 Gold Coast Schedule 2022-09-21 \n", + "2 VCTC Schedule 2022-09-21 \n", + "3 Yuma Schedule 2022-09-21 \n", + "4 Desert Roadrunner Schedule 2022-09-21 \n", + ".. ... ... \n", + "61 Moorpark Schedule 2022-09-21 \n", + "62 Sierra Madre Schedule 2022-09-21 \n", + "63 Kern Schedule 2022-09-21 \n", + "64 Eastern Sierra Schedule 2022-09-21 \n", + "65 Needles Schedule 2022-09-21 \n", + "\n", + "[66 rows x 4 columns]" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scag_feeds" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "339fc998-590e-49a7-8b10-3b684790480f", + "metadata": {}, + "outputs": [], + "source": [ + "from shared_utils.rt_utils import show_full_df" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "67f25e2b-53c9-4879-8fd5-cbf0288bb871", + "metadata": {}, + "outputs": [], + "source": [ + "scag_feeds.gtfs_dataset_name = scag_feeds.gtfs_dataset_name.str.lower().str.replace(' ', '_')" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "0d9815ce-9692-4e0e-80fa-0280326f95b2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feed_keybase64_urlgtfs_dataset_namedate
0360899e1281d494ad773604cd324a8c4aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZzvctc_gmv_schedule2022-09-21
14cc74cc4d637c03ba2d87df7080a57d1aHR0cDovL3d3dy5nb2xkY29hc3R0cmFuc2l0Lm9yZy9pbW...gold_coast_schedule2022-09-21
2cf5091853923f5eee684e4b8f1763b3baHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...vctc_schedule2022-09-21
3756ac24f2446226cf33c5ce02f73c028aHR0cHM6Ly93d3cueWNpcHRhLm9yZy9ndGZzL2dvb2dsZV...yuma_schedule2022-09-21
412fc0bdeffdf836940c5755c9611103eaHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...desert_roadrunner_schedule2022-09-21
...............
61c20c953e5cec597e7e75ef05c2f8746faHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...moorpark_schedule2022-09-21
6266831926626d19ead711aef2a6be877faHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...sierra_madre_schedule2022-09-21
633914ec5719b2697f81b4318b38e91694aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...kern_schedule2022-09-21
640a165d0fe19fefcb424f577091cf52d0aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...eastern_sierra_schedule2022-09-21
657a8e48c9cf05f58718635944100028d4aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...needles_schedule2022-09-21
\n", + "

66 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " feed_key \\\n", + "0 360899e1281d494ad773604cd324a8c4 \n", + "1 4cc74cc4d637c03ba2d87df7080a57d1 \n", + "2 cf5091853923f5eee684e4b8f1763b3b \n", + "3 756ac24f2446226cf33c5ce02f73c028 \n", + "4 12fc0bdeffdf836940c5755c9611103e \n", + ".. ... \n", + "61 c20c953e5cec597e7e75ef05c2f8746f \n", + "62 66831926626d19ead711aef2a6be877f \n", + "63 3914ec5719b2697f81b4318b38e91694 \n", + "64 0a165d0fe19fefcb424f577091cf52d0 \n", + "65 7a8e48c9cf05f58718635944100028d4 \n", + "\n", + " base64_url \\\n", + "0 aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz \n", + "1 aHR0cDovL3d3dy5nb2xkY29hc3R0cmFuc2l0Lm9yZy9pbW... \n", + "2 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "3 aHR0cHM6Ly93d3cueWNpcHRhLm9yZy9ndGZzL2dvb2dsZV... \n", + "4 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + ".. ... \n", + "61 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "62 aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW... \n", + "63 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "64 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "65 aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG... \n", + "\n", + " gtfs_dataset_name date \n", + "0 vctc_gmv_schedule 2022-09-21 \n", + "1 gold_coast_schedule 2022-09-21 \n", + "2 vctc_schedule 2022-09-21 \n", + "3 yuma_schedule 2022-09-21 \n", + "4 desert_roadrunner_schedule 2022-09-21 \n", + ".. ... ... \n", + "61 moorpark_schedule 2022-09-21 \n", + "62 sierra_madre_schedule 2022-09-21 \n", + "63 kern_schedule 2022-09-21 \n", + "64 eastern_sierra_schedule 2022-09-21 \n", + "65 needles_schedule 2022-09-21 \n", + "\n", + "[66 rows x 4 columns]" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scag_feeds" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "bd12d7e1-e4fd-4930-9cbf-0182e6919b3c", + "metadata": {}, + "outputs": [], + "source": [ + "def download_feed(row):\n", + " # need wildcard for file too -- not all are gtfs.zip!\n", + " uri = f'gs://calitp-gtfs-schedule-raw-v2/schedule/dt={row.date}/*/base64_url={row.base64_url}/*.zip'\n", + " fs.get(uri, f'{row.path}/{row.gtfs_dataset_name}_{row.feed_key}_gtfs.zip')\n", + " # print(f'downloaded {row.path}/{row.feed_key}_gtfs.zip')\n", + " \n", + "def download_region(feeds_df):\n", + " \n", + " path = f'./feeds_{feeds_df.date.iloc[0]}/scag'\n", + " if not os.path.exists(path): os.makedirs(path)\n", + " feeds_df['path'] = path\n", + " feeds_df.progress_apply(download_feed, axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "9106b5a5-f2c2-441d-ac8a-ebac71998f27", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "tqdm.pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "673451f5-e1b4-4406-8842-475ea274a4fd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 66/66 [00:22<00:00, 2.94it/s]\n" + ] + } + ], + "source": [ + "download_region(scag_feeds)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "8641df71-265c-4a19-a204-60e2e5525c9e", + "metadata": {}, + "outputs": [], + "source": [ + "import shutil" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "1aab6e78-73cc-41c7-8742-1e4ca5358bfd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/home/jovyan/data-analyses/conveyal_update/feeds_2022-09-21.zip'" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shutil.make_archive(f'feeds_{analysis_date}', 'zip', f'./feeds_{analysis_date}/')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}