-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1275 from cal-itp/ah_starterkit
Tutorials for new analysts
- Loading branch information
Showing
28 changed files
with
8,598 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Starter Kit Portfolio | ||
I am revamping some of our exercises and one exercise will teach future analysts how to make a portfolio. Yay! | ||
|
||
## Who We Are | ||
We want our audience to understand who we are and why our expertise and research should be trusted. Here is a blurb you can lift. | ||
|
||
This website was created by the [California Department of Transportation](https://dot.ca.gov/)'s Division of Data and Digital Services. We are a group of data analysts and scientists who analyze transportation data, such as General Transit Feed Specification (GTFS) data, or data from funding programs such as the Active Transportation Program. Our goal is to transform messy and indecipherable original datasets into usable, customer-friendly products to better the transportation landscape. For more of our work, visit our [portfolio](https://analysis.calitp.org/). | ||
|
||
<img src="https://raw.githubusercontent.com/cal-itp/data-analyses/main/portfolio/Calitp_logo_MAIN.png" alt="Cal-ITP logo" width="200" height="100"> <img src="https://raw.githubusercontent.com/cal-itp/data-analyses/main/portfolio/CT_logo_Wht_outline.gif" alt="Caltrans logo" width="129" height="100"> | ||
|
||
<br>Caltrans®, the California Department of Transportation® and the Caltrans logo are registered service marks of the California Department of Transportation and may not be copied, distributed, displayed, reproduced or transmitted in any form without prior written permission from the California Department of Transportation. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
import pandas as pd | ||
import numpy as np | ||
import altair as alt | ||
from calitp_data_analysis import calitp_color_palette | ||
from IPython.display import HTML, Image, Markdown, display, display_html | ||
|
||
def reverse_snakecase(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn snake_case column names into human-readable titles.

    Underscores become spaces, surrounding whitespace is stripped and each
    word is title-cased. A fixed set of abbreviations is then rewritten
    (for example "Vmt" -> "VMT" and "Ct" -> "CalTrans").

    The columns of `df` are renamed in place and the same dataframe object
    is returned.
    """
    # Abbreviation as it looks after title-casing -> display form.
    abbreviations = {
        "Dac": "DAC",
        "Vmt": "VMT",
        "Zev": "ZEV",
        "Lu": "Landuse",
        "Ct": "CalTrans",
    }

    columns = (
        df.columns.str.replace("_", " ", regex=False).str.strip().str.title()
    )

    # Replace whole words only. A plain substring replacement would also
    # rewrite words that merely start with an abbreviation, e.g.
    # "Lump Sum" -> "Landusemp Sum".
    for short, full in abbreviations.items():
        columns = columns.str.replace(rf"\b{short}\b", full, regex=True)

    df.columns = columns
    return df
|
||
def load_dataset() -> pd.DataFrame:
    """
    Read the categorized starter kit parquet file and tidy it for display.

    The scope of work text is capitalized and the column names are passed
    through `reverse_snakecase` before the dataframe is returned.
    """
    bucket_folder = "gs://calitp-analytics-data/data-analyses/starter_kit/"
    parquet_name = "starter_kit_example_categorized.parquet"

    projects = pd.read_parquet(bucket_folder + parquet_name)

    # The scope of work text is stored all lowercase, so capitalize it
    # before it is shown to readers.
    projects.scope_of_work = projects.scope_of_work.str.capitalize()

    # Hand back the dataframe with presentation-ready column names.
    return reverse_snakecase(projects)
|
||
def aggregate_by_category(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize projects per category.

    For every value of "Category" the result holds the median overall
    score, the median project cost (formatted as a dollar string) and the
    number of distinct project names.
    """
    # Output column -> (input column, aggregation function).
    summary_spec = {
        "Median Score": ("Overall Score", "median"),
        "Median Project Cost": ("Project Cost", "median"),
        "Total Projects": ("Project Name", "nunique"),
    }

    summary = df.groupby(["Category"]).agg(**summary_spec).reset_index()

    # Show the cost as whole dollars with thousands separators.
    summary["Median Project Cost"] = summary["Median Project Cost"].apply(
        lambda cost: f"${cost:,.0f}"
    )

    return summary
|
||
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape the score columns from wide to long format.

    The project name and Caltrans District identify each row; every score
    column becomes one row with its column name in "Metric" and its value
    in "Score".
    """
    identifier_columns = ["CalTrans District", "Project Name"]
    score_columns = [
        "Accessibility Score",
        "DAC Accessibility Score",
        "DAC Traffic Impacts Score",
        "Freight Efficiency Score",
        "Freight Sustainability Score",
        "Mode Shift Score",
        "Landuse Natural Resources Score",
        "Safety Score",
        "VMT Score",
        "ZEV Score",
        "Public Engagement Score",
        "Climate Resilience Score",
        "Program Fit Score",
    ]

    melted = df.melt(id_vars=identifier_columns, value_vars=score_columns)

    # Give melt's default output columns reader-friendly names.
    friendly_names = {"variable": "Metric", "value": "Score"}
    return melted.rename(columns=friendly_names)
|
||
def style_df(df: pd.DataFrame):
    """
    Apply the shared table styling to a dataframe and display it.

    The index is hidden, values are formatted with a precision of 0, and
    every cell is centered on a white background.
    """
    cell_css = {
        "background-color": "white",
        "text-align": "center",
    }

    styled = df.style.hide(axis="index")
    styled = styled.format(precision=0)  # precision=0: no decimal places
    styled = styled.set_properties(**cell_css)

    display(styled)
|
||
def create_metric_chart(df: pd.DataFrame) -> alt.Chart:
    """
    Build a circle chart of scores per project, filtered by one metric.

    A dropdown lists every distinct value of the "Metric" column; the
    chart only shows the rows for the metric currently selected, starting
    with the first metric found in `df`.
    """
    metric_options = df["Metric"].unique().tolist()

    # Dropdown widget and the selection parameter it drives.
    dropdown = alt.binding_select(options=metric_options, name="Metrics: ")
    metric_selector = alt.selection_point(
        fields=["Metric"],
        value=metric_options[0],
        bind=dropdown,
    )

    # Encodings are built separately so the chart definition stays short.
    score_axis = alt.X("Score", scale=alt.Scale(domain=[0, 10]))
    project_axis = alt.Y("Project Name")
    score_color = alt.Color(
        "Score",
        scale=alt.Scale(range=calitp_color_palette.CALITP_CATEGORY_BRIGHT_COLORS),
    )

    circles = alt.Chart(df, title="Metric by Categories").mark_circle(size=200)
    circles = circles.encode(
        x=score_axis,
        y=project_axis,
        color=score_color,
        tooltip=list(df.columns),
    )
    circles = circles.properties(width=400, height=250)

    # Register the selector on the chart and use it to filter the rows.
    return circles.add_params(metric_selector).transform_filter(metric_selector)
|
||
def create_district_summary(df: pd.DataFrame, caltrans_district: int):
    """
    Display a summary of CSIS metrics for one Caltrans District.

    The summary consists of a short narrative (median overall score,
    number of distinct projects, highest project cost), a table of metrics
    aggregated by category, a table listing each project, and an
    interactive chart of metric scores by project.

    If no rows match `caltrans_district`, a short notice is displayed
    instead and nothing else is rendered.

    Nothing is returned; all output goes through `display`.
    """
    filtered_df = df.loc[df["CalTrans District"] == caltrans_district].reset_index(
        drop=True
    )

    # With no matching rows the narrative would show "nan" values and the
    # metric chart would fail with an IndexError when it picks its default
    # dropdown entry, so stop early with a clear message.
    if filtered_df.empty:
        display(
            Markdown(f"No projects were found for District {caltrans_district}.")
        )
        return

    # Values referenced in the narrative.
    median_score = filtered_df["Overall Score"].median()
    total_projects = filtered_df["Project Name"].nunique()
    max_project = filtered_df["Project Cost"].max()
    max_project = f"${max_project:,.2f}"

    # Per-category summary table.
    aggregated_df = aggregate_by_category(filtered_df)

    # Long format (one row per project and metric) for the chart.
    df2 = wide_to_long(filtered_df)

    # Narrative
    display(
        Markdown(
            f"The median score for projects in District {caltrans_district} "
            f"is <b>{median_score}</b><br>\n"
            f"The total number of projects is <b>{total_projects}</b><br>\n"
            f"The most expensive project costs <b>{max_project}</b>\n"
        )
    )

    display(Markdown("<h4>Metrics aggregated by Categories</h4>\n"))
    style_df(aggregated_df)

    display(Markdown("<h4>Overview of Projects</h4>\n"))
    style_df(filtered_df[["Project Name", "Overall Score", "Scope Of Work"]])

    display(Markdown("<h4>Metric Scores by Project</h4>\n"))
    display(create_metric_chart(df2))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "260ba8f3-dd02-4fdc-945d-450db01d188e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%capture\n", | ||
"\n", | ||
"import warnings\n", | ||
"warnings.filterwarnings('ignore')\n", | ||
"\n", | ||
"import calitp_data_analysis.magics\n", | ||
"\n", | ||
"# All your other packages go here\n", | ||
"# Here I just want pandas and my own utils.\n", | ||
"import pandas as pd\n", | ||
"import _starterkit_utils " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2a2996fd-29d0-4a19-ac48-a6957d9f8140", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pd.options.display.max_columns = 100\n", | ||
"pd.options.display.float_format = \"{:.2f}\".format\n", | ||
"pd.set_option(\"display.max_rows\", None)\n", | ||
"pd.set_option(\"display.max_colwidth\", None)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "5d82c9a8-6f8f-485b-ace5-957f1b80c2f3", | ||
"metadata": { | ||
"tags": [ | ||
"parameters" | ||
] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# district = 1" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "43a07a8c-567d-471d-be10-a547cd0b3a13", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%capture_parameters\n", | ||
"district" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "cb5a0cc4-3e7e-4aea-81f2-c5e858fb315b", | ||
"metadata": {}, | ||
"source": [ | ||
"# District {district} Analysis " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c91049e1-107d-47d9-9cda-63aa4fbf554b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df = _starterkit_utils.load_dataset()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "bd1509c0-b435-456e-ad1c-b583a991f1e2", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"_starterkit_utils.create_district_summary(df, district)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Starter Kit Portfolio | ||
I am revamping some of our exercises and one exercise will teach future analysts how to make a portfolio. Yay! | ||
|
||
## Who We Are | ||
We want our audience to understand who we are and why our expertise and research should be trusted. Here is a blurb you can lift. | ||
|
||
This website was created by the [California Department of Transportation](https://dot.ca.gov/)'s Division of Data and Digital Services. We are a group of data analysts and scientists who analyze transportation data, such as General Transit Feed Specification (GTFS) data, or data from funding programs such as the Active Transportation Program. Our goal is to transform messy and indecipherable original datasets into usable, customer-friendly products to better the transportation landscape. For more of our work, visit our [portfolio](https://analysis.calitp.org/). | ||
|
||
<img src="https://raw.githubusercontent.com/cal-itp/data-analyses/main/portfolio/Calitp_logo_MAIN.png" alt="Cal-ITP logo" width="200" height="100"> <img src="https://raw.githubusercontent.com/cal-itp/data-analyses/main/portfolio/CT_logo_Wht_outline.gif" alt="Caltrans logo" width="129" height="100"> | ||
|
||
<br>Caltrans®, the California Department of Transportation® and the Caltrans logo are registered service marks of the California Department of Transportation and may not be copied, distributed, displayed, reproduced or transmitted in any form without prior written permission from the California Department of Transportation. |
Oops, something went wrong.