diff --git a/ahsc_grant/ACS_eda.ipynb b/ahsc_grant/ACS_eda.ipynb index 18541e7cc..6227f7405 100644 --- a/ahsc_grant/ACS_eda.ipynb +++ b/ahsc_grant/ACS_eda.ipynb @@ -6629,7 +6629,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.13" } }, "nbformat": 4, diff --git a/ahsc_grant/process_mst.ipynb b/ahsc_grant/process_mst.ipynb index 5f8856e95..9015c163d 100644 --- a/ahsc_grant/process_mst.ipynb +++ b/ahsc_grant/process_mst.ipynb @@ -1498,7 +1498,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.5" + "version": "3.9.13" } }, "nbformat": 4, diff --git a/ahsc_grant/process_sbmtd_refactor.ipynb b/ahsc_grant/process_sbmtd_refactor.ipynb index 92a4c446c..1c1e76143 100644 --- a/ahsc_grant/process_sbmtd_refactor.ipynb +++ b/ahsc_grant/process_sbmtd_refactor.ipynb @@ -102,6 +102,7 @@ "metadata": {}, "outputs": [], "source": [ + "#Selecting analysis date and agency\n", "analysis_date = \"2022-03-15\"\n", "agency_name = \"SBMTD\"" ] @@ -113,6 +114,7 @@ "metadata": {}, "outputs": [], "source": [ + "#Getting GTFS trips data \n", "get_trips = import_scheduled_trips(analysis_date=analysis_date, columns =[\n", " \"gtfs_dataset_key\", \"feed_key\", \"name\", \"trip_id\", \n", " \"shape_id\", \"shape_array_key\", \n", @@ -127,6 +129,7 @@ "metadata": {}, "outputs": [], "source": [ + "#Function to find feed key for the selected agency \n", "def compute_feed_key(agency_name):\n", " filtered = get_trips[get_trips['name'].str.contains(agency_name, na=False)]\n", " if not filtered.empty:\n", @@ -152,6 +155,7 @@ "metadata": {}, "outputs": [], "source": [ + "#Getting stops data for the selected feed key\n", "stops_data = import_scheduled_stops(analysis_date).drop_duplicates().reset_index(drop=True)\n", "if feed_key is not None:\n", " stops_data = stops_data[stops_data['feed_key'].isin([feed_key])]" @@ -164,6 +168,7 @@ "metadata": {}, "outputs": [], "source": [ + "# 
Ridership information for selected agencies \n", "GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'\n", "yr_sbmtd_raw = pd.read_excel(f'{GCS_FILE_PATH}SYSTEM WIDE STOP USAGE FY21-22.xlsx', sheet_name=None)" ] @@ -199,6 +204,7 @@ } ], "source": [ + "# Assigning the key of each item in yr_sbmtd_raw to the 'daytype' field of its corresponding value\n", "for key, value in yr_sbmtd_raw.items():\n", " value['daytype'] = f'{key}'\n", "\n", @@ -212,6 +218,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Concatenating all DataFrames in yr_sbmtd_raw into a single DataFrame, resetting the index.\n", "yr_sbmtd_all = pd.concat(yr_sbmtd_raw, ignore_index=True)" ] }, @@ -222,7 +229,7 @@ "metadata": {}, "outputs": [], "source": [ - "# standardize stop ids\n", + "# Standardizing stop ids\n", "day_cols = {'WKDY': 'weekday_ons', 'SAT': 'sat_ons', 'SUN': 'sun_ons'}\n", "\n", "yr_sbmtd_all = (yr_sbmtd_all\n", @@ -238,6 +245,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Creating a distinct, sorted dictionary of clean STOP_IDs and STOP_NAMES, removing NAs and renaming the STOP_NAME column\n", "name_id_dict = (yr_sbmtd_all \n", " >> distinct(_.STOP_ID_clean,_.STOP_NAME)\n", " >> arrange(_.STOP_ID_clean,_.STOP_NAME)\n", @@ -334,6 +342,7 @@ } ], "source": [ + "# Joining yr_sbmtd_all with name_id_dict, grouping by cleaned STOP_ID, STOP_NAME, and DAY_TYPE to summarize total boardings\n", "yr_sbmtd_grouped = (yr_sbmtd_all\n", " >> left_join(_,name_id_dict) \n", " >> group_by(_.STOP_ID_clean,_.STOP_NAME_clean, _.DAY_TYPE)\n", @@ -456,6 +465,7 @@ } ], "source": [ + "# Converting STOP_ID to string, renaming columns, spreading DAY_TYPE values into separate columns, and adding feed_key and name to the table\n", "yr_sbmtd_grouped = (yr_sbmtd_grouped\n", " >> mutate(STOP_ID_clean = _.STOP_ID_clean.astype(str))\n", " >> rename(stop_id = _.STOP_ID_clean)\n", @@ -500,6 +510,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Creating a dictionary mapping STOP_NAME to 
stop_id from the stops_to_join DataFrame.\n", "stop_name_to_id = stops_to_join.set_index('STOP_NAME')['stop_id'].to_dict()" ] }, @@ -510,6 +521,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Function to fuzzy match \n", "def get_best_match(name, choices, scorer=fuzz.ratio, threshold=90):\n", " best_match, score = process.extractOne(name, choices, scorer=scorer)\n", " if score >= threshold:\n", @@ -525,6 +537,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Matching STOP_NAME in stops_to_join with unique stop names from yr_sbmtd_grouped and mapping the corresponding stop IDs\n", "stops_to_join['matched_stop_name'] = stops_to_join['STOP_NAME'].apply(lambda x: get_best_match(x, yr_sbmtd_grouped['stop_name'].unique()))\n", "stops_to_join['matched_stop_id'] = stops_to_join['matched_stop_name'].map(stop_name_to_id)" ] @@ -546,6 +559,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Creating a dictionary mapping matched_stop_name to stop_id from fuzzy matches \n", "name_to_stop_id_mapping = fuzzy_matches.set_index('matched_stop_name')['stop_id'].to_dict()" ] }, @@ -556,6 +570,7 @@ "metadata": {}, "outputs": [], "source": [ + "#Updating the stop_id in yr_sbmtd_grouped by mapping stop names to IDs and filling missing values with the original stop_id\n", "yr_sbmtd_grouped_updated = yr_sbmtd_grouped.copy()\n", "yr_sbmtd_grouped_updated['stop_id'] = yr_sbmtd_grouped_updated['stop_name'].map(name_to_stop_id_mapping).fillna(yr_sbmtd_grouped_updated['stop_id'])" ] @@ -911,7 +926,18 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, + "id": "3e1e21fd-f28a-4e65-8c16-42b5eb1c93ed", + "metadata": {}, + "outputs": [], + "source": [ + "yr_sbmtd_grouped_updated.to_excel('sbmtd.xlsx', index =False)\n", + "stops_to_join.to_excel('stops_sbmtd.xlsx', index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, "id": "67ec195f-84a8-4c65-83c2-7a85bbda22d4", "metadata": {}, "outputs": [ @@ -921,7 +947,7 @@ "149" ] }, - "execution_count": 
32, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -937,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 37, "id": "1225423d-148b-496e-88d8-6fdf842ff037", "metadata": {}, "outputs": [ @@ -947,7 +973,7 @@ "153" ] }, - "execution_count": 33, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -960,6 +986,182 @@ "len(yr_sbmtd_remainders)" ] }, + { + "cell_type": "code", + "execution_count": 38, + "id": "e027872b-d625-46c3-a3c2-ffe2681707b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_idstop_namesat_onssun_onsweekday_onsfeed_keyname
35-49Pueblo/Castillo Out116.0109.02734.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
45100003AbreCDSONaNNaN0.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
46100004AlamBasONaNNaN0.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
47100005AlamPadN0.00.00.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
48100006AlamPadO0.00.00.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
49100008ArBuBePa0.00.00.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
50100011BranEverNaNNaN0.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
51100096CaOaAlphNaNNaN0.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
52100098CaOaFaiONaNNaN0.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
54100100CaOaTurnNaNNaN0.052201caab047b98ae19b7547c0d7c2adSBMTD Schedule
\n", + "
" + ], + "text/plain": [ + " stop_id stop_name sat_ons sun_ons weekday_ons \\\n", + "35 -49 Pueblo/Castillo Out 116.0 109.0 2734.0 \n", + "45 100003 AbreCDSO NaN NaN 0.0 \n", + "46 100004 AlamBasO NaN NaN 0.0 \n", + "47 100005 AlamPadN 0.0 0.0 0.0 \n", + "48 100006 AlamPadO 0.0 0.0 0.0 \n", + "49 100008 ArBuBePa 0.0 0.0 0.0 \n", + "50 100011 BranEver NaN NaN 0.0 \n", + "51 100096 CaOaAlph NaN NaN 0.0 \n", + "52 100098 CaOaFaiO NaN NaN 0.0 \n", + "54 100100 CaOaTurn NaN NaN 0.0 \n", + "\n", + " feed_key name \n", + "35 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "45 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "46 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "47 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "48 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "49 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "50 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "51 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "52 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n", + "54 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "yr_sbmtd_remainders.head(10)" + ] + }, { "cell_type": "code", "execution_count": 34,