Skip to content

Commit

Permalink
summer work dashboard refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
shweta487 committed Oct 6, 2024
1 parent 660b719 commit a9cc43e
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 7 deletions.
2 changes: 1 addition & 1 deletion ahsc_grant/ACS_eda.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6629,7 +6629,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.9.13"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion ahsc_grant/process_mst.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1498,7 +1498,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
"version": "3.9.13"
}
},
"nbformat": 4,
Expand Down
212 changes: 207 additions & 5 deletions ahsc_grant/process_sbmtd_refactor.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Selecting analysis date and agency\n",
"analysis_date = \"2022-03-15\"\n",
"agency_name = \"SBMTD\""
]
Expand All @@ -113,6 +114,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Getting GTFS scheduled trips data for the analysis date\n",
"get_trips = import_scheduled_trips(analysis_date=analysis_date, columns =[\n",
" \"gtfs_dataset_key\", \"feed_key\", \"name\", \"trip_id\", \n",
" \"shape_id\", \"shape_array_key\", \n",
Expand All @@ -127,6 +129,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Function to find the feed key for the selected agency by matching its name in the trips table\n",
"def compute_feed_key(agency_name):\n",
" filtered = get_trips[get_trips['name'].str.contains(agency_name, na=False)]\n",
" if not filtered.empty:\n",
Expand All @@ -152,6 +155,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Getting stops data for the selected feed key\n",
"stops_data = import_scheduled_stops(analysis_date).drop_duplicates().reset_index(drop=True)\n",
"if feed_key is not None:\n",
" stops_data = stops_data[stops_data['feed_key'].isin([feed_key])]"
Expand All @@ -164,6 +168,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Loading the SBMTD FY21-22 stop usage workbook from GCS; sheet_name=None reads every sheet into a dict of DataFrames\n",
"GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'\n",
"yr_sbmtd_raw = pd.read_excel(f'{GCS_FILE_PATH}SYSTEM WIDE STOP USAGE FY21-22.xlsx', sheet_name=None)"
]
Expand Down Expand Up @@ -199,6 +204,7 @@
}
],
"source": [
"# Tagging each sheet's DataFrame in yr_sbmtd_raw with its sheet name in a 'daytype' column\n",
"for key, value in yr_sbmtd_raw.items():\n",
" value['daytype'] = f'{key}'\n",
"\n",
Expand All @@ -212,6 +218,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Concatenating all DataFrames in yr_sbmtd_raw into a single DataFrame, resetting the index.\n",
"yr_sbmtd_all = pd.concat(yr_sbmtd_raw, ignore_index=True)"
]
},
Expand All @@ -222,7 +229,7 @@
"metadata": {},
"outputs": [],
"source": [
"# standardize stop ids\n",
"# Standardizing stop ids\n",
"day_cols = {'WKDY': 'weekday_ons', 'SAT': 'sat_ons', 'SUN': 'sun_ons'}\n",
"\n",
"yr_sbmtd_all = (yr_sbmtd_all\n",
Expand All @@ -238,6 +245,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Creating a distinct, sorted dictionary of clean STOP_IDs and STOP_NAMES, removing NAs and renaming the STOP_NAME column\n",
"name_id_dict = (yr_sbmtd_all \n",
" >> distinct(_.STOP_ID_clean,_.STOP_NAME)\n",
" >> arrange(_.STOP_ID_clean,_.STOP_NAME)\n",
Expand Down Expand Up @@ -334,6 +342,7 @@
}
],
"source": [
"# Joining yr_sbmtd_all with name_id_dict, grouping by cleaned STOP_ID, STOP_NAME, and DAY_TYPE to summarize total boardings\n",
"yr_sbmtd_grouped = (yr_sbmtd_all\n",
" >> left_join(_,name_id_dict) \n",
" >> group_by(_.STOP_ID_clean,_.STOP_NAME_clean, _.DAY_TYPE)\n",
Expand Down Expand Up @@ -456,6 +465,7 @@
}
],
"source": [
"# Converting STOP_ID to string, renaming columns, spreading DAY_TYPE values into separate columns, and adding feed_key and name to the table\n",
"yr_sbmtd_grouped = (yr_sbmtd_grouped\n",
" >> mutate(STOP_ID_clean = _.STOP_ID_clean.astype(str))\n",
" >> rename(stop_id = _.STOP_ID_clean)\n",
Expand Down Expand Up @@ -500,6 +510,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Creating a dictionary mapping STOP_NAME to stop_id from the stops_to_join DataFrame.\n",
"stop_name_to_id = stops_to_join.set_index('STOP_NAME')['stop_id'].to_dict()"
]
},
Expand All @@ -510,6 +521,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Function to find the highest-scoring fuzzy match for a name among the choices, subject to a minimum score threshold\n",
"def get_best_match(name, choices, scorer=fuzz.ratio, threshold=90):\n",
" best_match, score = process.extractOne(name, choices, scorer=scorer)\n",
" if score >= threshold:\n",
Expand All @@ -525,6 +537,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Matching STOP_NAME in stops_to_join with unique stop names from yr_sbmtd_grouped and mapping the corresponding stop IDs\n",
"stops_to_join['matched_stop_name'] = stops_to_join['STOP_NAME'].apply(lambda x: get_best_match(x, yr_sbmtd_grouped['stop_name'].unique()))\n",
"stops_to_join['matched_stop_id'] = stops_to_join['matched_stop_name'].map(stop_name_to_id)"
]
Expand All @@ -546,6 +559,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Creating a dictionary mapping matched_stop_name to stop_id from fuzzy matches \n",
"name_to_stop_id_mapping = fuzzy_matches.set_index('matched_stop_name')['stop_id'].to_dict()"
]
},
Expand All @@ -556,6 +570,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Updating the stop_id in yr_sbmtd_grouped by mapping stop names to IDs and filling missing values with the original stop_id\n",
"yr_sbmtd_grouped_updated = yr_sbmtd_grouped.copy()\n",
"yr_sbmtd_grouped_updated['stop_id'] = yr_sbmtd_grouped_updated['stop_name'].map(name_to_stop_id_mapping).fillna(yr_sbmtd_grouped_updated['stop_id'])"
]
Expand Down Expand Up @@ -911,7 +926,18 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 34,
"id": "3e1e21fd-f28a-4e65-8c16-42b5eb1c93ed",
"metadata": {},
"outputs": [],
"source": [
"yr_sbmtd_grouped_updated.to_excel('sbmtd.xlsx', index =False)\n",
"stops_to_join.to_excel('stops_sbmtd.xlsx', index = False)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "67ec195f-84a8-4c65-83c2-7a85bbda22d4",
"metadata": {},
"outputs": [
Expand All @@ -921,7 +947,7 @@
"149"
]
},
"execution_count": 32,
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -937,7 +963,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 37,
"id": "1225423d-148b-496e-88d8-6fdf842ff037",
"metadata": {},
"outputs": [
Expand All @@ -947,7 +973,7 @@
"153"
]
},
"execution_count": 33,
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -960,6 +986,182 @@
"len(yr_sbmtd_remainders)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "e027872b-d625-46c3-a3c2-ffe2681707b6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stop_id</th>\n",
" <th>stop_name</th>\n",
" <th>sat_ons</th>\n",
" <th>sun_ons</th>\n",
" <th>weekday_ons</th>\n",
" <th>feed_key</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>-49</td>\n",
" <td>Pueblo/Castillo Out</td>\n",
" <td>116.0</td>\n",
" <td>109.0</td>\n",
" <td>2734.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>100003</td>\n",
" <td>AbreCDSO</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>100004</td>\n",
" <td>AlamBasO</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>100005</td>\n",
" <td>AlamPadN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>100006</td>\n",
" <td>AlamPadO</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>100008</td>\n",
" <td>ArBuBePa</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50</th>\n",
" <td>100011</td>\n",
" <td>BranEver</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>100096</td>\n",
" <td>CaOaAlph</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52</th>\n",
" <td>100098</td>\n",
" <td>CaOaFaiO</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>100100</td>\n",
" <td>CaOaTurn</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>52201caab047b98ae19b7547c0d7c2ad</td>\n",
" <td>SBMTD Schedule</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stop_id stop_name sat_ons sun_ons weekday_ons \\\n",
"35 -49 Pueblo/Castillo Out 116.0 109.0 2734.0 \n",
"45 100003 AbreCDSO NaN NaN 0.0 \n",
"46 100004 AlamBasO NaN NaN 0.0 \n",
"47 100005 AlamPadN 0.0 0.0 0.0 \n",
"48 100006 AlamPadO 0.0 0.0 0.0 \n",
"49 100008 ArBuBePa 0.0 0.0 0.0 \n",
"50 100011 BranEver NaN NaN 0.0 \n",
"51 100096 CaOaAlph NaN NaN 0.0 \n",
"52 100098 CaOaFaiO NaN NaN 0.0 \n",
"54 100100 CaOaTurn NaN NaN 0.0 \n",
"\n",
" feed_key name \n",
"35 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"45 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"46 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"47 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"48 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"49 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"50 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"51 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"52 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule \n",
"54 52201caab047b98ae19b7547c0d7c2ad SBMTD Schedule "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yr_sbmtd_remainders.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 34,
Expand Down

0 comments on commit a9cc43e

Please sign in to comment.