Merge pull request #1275 from cal-itp/ah_starterkit
Tutorials for new analysts
amandaha8 authored Oct 31, 2024
2 parents d3a4868 + b0d1841 commit 0b31cbc
Showing 28 changed files with 8,598 additions and 1 deletion.
15 changes: 14 additions & 1 deletion Makefile
@@ -68,12 +68,25 @@ build_legislative_district_digest:
	make git_check_no_sections


build_starterkit_ha:
	$(eval export site = ha_starterkit_district)
	pip install -r portfolio/requirements.txt
	make build_portfolio_site
	git add portfolio/$(site)/district_*/ portfolio/$(site)/*.yml portfolio/$(site)/*.md
	python portfolio/portfolio.py index --deploy --prod

build_starterkit_LASTNAME:
	$(eval export site = YOUR_SITE_NAME)
	pip install -r portfolio/requirements.txt
	make build_portfolio_site
	git add portfolio/$(site)/district_*/ portfolio/$(site)/*.yml portfolio/$(site)/*.md
	python portfolio/portfolio.py index --deploy --prod

add_precommit:
	pip install pre-commit
	pre-commit install
	#pre-commit run --all-files


# Add to .bash_profile outside of data-analyses
#alias go='cd ~/data-analyses/portfolio && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ..'

11 changes: 11 additions & 0 deletions ha_portfolio/README.md
@@ -0,0 +1,11 @@
# Starter Kit Portfolio
I am revamping some of our exercises, and one of them will teach future analysts how to make a portfolio. Yay!

## Who We Are
We want our audience to understand who we are and why our expertise and research should be trusted. Here is a blurb you can lift.

This website was created by the [California Department of Transportation](https://dot.ca.gov/)'s Division of Data and Digital Services. We are a group of data analysts and scientists who analyze transportation data, such as General Transit Feed Specification (GTFS) data, or data from funding programs such as the Active Transportation Program. Our goal is to transform messy and indecipherable original datasets into usable, customer-friendly products to better the transportation landscape. For more of our work, visit our [portfolio](https://analysis.calitp.org/).

<img src="https://raw.githubusercontent.com/cal-itp/data-analyses/main/portfolio/Calitp_logo_MAIN.png" alt="Cal-ITP logo" width="200" height="100"> <img src="https://raw.githubusercontent.com/cal-itp/data-analyses/main/portfolio/CT_logo_Wht_outline.gif" alt="Caltrans logo" width="129" height="100">

<br>Caltrans®, the California Department of Transportation® and the Caltrans logo are registered service marks of the California Department of Transportation and may not be copied, distributed, displayed, reproduced or transmitted in any form without prior written permission from the California Department of Transportation.
195 changes: 195 additions & 0 deletions ha_portfolio/_starterkit_utils.py
@@ -0,0 +1,195 @@
import altair as alt
import pandas as pd
from calitp_data_analysis import calitp_color_palette
from IPython.display import Markdown, display

def reverse_snakecase(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean up columns to remove underscores and spaces.
    """
    df.columns = df.columns.str.replace("_", " ").str.strip().str.title()

    df.columns = (
        df.columns.str.replace("Dac", "DAC")
        .str.replace("Vmt", "VMT")
        .str.replace("Zev", "ZEV")
        .str.replace("Lu", "Landuse")
        .str.replace("Ct", "CalTrans")
    )
    return df

def load_dataset() -> pd.DataFrame:
    """
    Load the final dataframe.
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/starter_kit/"
    FILE = "starter_kit_example_categorized.parquet"

    # Read dataframe in
    df = pd.read_parquet(f"{GCS_FILE_PATH}{FILE}")

    # Capitalize the Scope of Work column again since it is all lowercase
    df.scope_of_work = df.scope_of_work.str.capitalize()

    # Clean up the column names
    df = reverse_snakecase(df)
    return df

def aggregate_by_category(df: pd.DataFrame) -> pd.DataFrame:
    """
    Find the median overall score and project cost
    and total unique projects by category.
    """
    agg1 = (
        df.groupby(["Category"])
        .aggregate(
            {
                "Overall Score": "median",
                "Project Cost": "median",
                "Project Name": "nunique",
            }
        )
        .reset_index()
        .rename(
            columns={
                "Overall Score": "Median Score",
                "Project Cost": "Median Project Cost",
                "Project Name": "Total Projects",
            }
        )
    )

    # Format the Cost column properly
    agg1["Median Project Cost"] = agg1["Median Project Cost"].apply(
        lambda x: "${:,.0f}".format(x)
    )

    return agg1

def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """
    Change the dataframe from wide to long based on the project name and
    Caltrans District.
    """
    df2 = pd.melt(
        df,
        id_vars=["CalTrans District", "Project Name"],
        value_vars=[
            "Accessibility Score",
            "DAC Accessibility Score",
            "DAC Traffic Impacts Score",
            "Freight Efficiency Score",
            "Freight Sustainability Score",
            "Mode Shift Score",
            "Landuse Natural Resources Score",
            "Safety Score",
            "VMT Score",
            "ZEV Score",
            "Public Engagement Score",
            "Climate Resilience Score",
            "Program Fit Score",
        ],
    )

    df2 = df2.rename(columns={"variable": "Metric", "value": "Score"})
    return df2

def style_df(df: pd.DataFrame):
    """
    Styles a dataframe and displays it.
    """
    display(
        df.style.hide(axis="index")
        .format(precision=0)  # Display whole numbers, no decimal places
        .set_properties(**{
            "background-color": "white",
            "text-align": "center"
        })
    )

def create_metric_chart(df: pd.DataFrame) -> alt.Chart:
    """
    Create a chart that displays metric scores
    for each project.
    """
    # Create dropdown
    metrics_list = df["Metric"].unique().tolist()

    metrics_dropdown = alt.binding_select(
        options=metrics_list,
        name="Metrics: ",
    )
    # Selection parameter that controls which metric is displayed
    xcol_param = alt.selection_point(
        fields=["Metric"], value=metrics_list[0], bind=metrics_dropdown
    )

    chart = (
        alt.Chart(df, title="Metric by Categories")
        .mark_circle(size=200)
        .encode(
            x=alt.X("Score", scale=alt.Scale(domain=[0, 10])),
            y=alt.Y("Project Name"),
            color=alt.Color(
                "Score",
                scale=alt.Scale(
                    range=calitp_color_palette.CALITP_CATEGORY_BRIGHT_COLORS
                ),
            ),
            tooltip=list(df.columns),
        )
        .properties(width=400, height=250)
    )

    chart = chart.add_params(xcol_param).transform_filter(xcol_param)

    return chart

def create_district_summary(df: pd.DataFrame, caltrans_district: int):
    """
    Create a summary of CSIS metrics for one Caltrans District.
    """
    filtered_df = df.loc[df["CalTrans District"] == caltrans_district].reset_index(
        drop=True
    )
    # Finding the values referenced in the narrative
    median_score = filtered_df["Overall Score"].median()
    total_projects = filtered_df["Project Name"].nunique()
    max_project = filtered_df["Project Cost"].max()
    max_project = f"${max_project:,.2f}"

    # Aggregate the dataframe
    aggregated_df = aggregate_by_category(filtered_df)

    # Change the dataframe from wide to long
    df2 = wide_to_long(filtered_df)

    # Create narrative
    display(
        Markdown(
            f"""The median score for projects in District {caltrans_district} is <b>{median_score}</b><br>
The total number of projects is <b>{total_projects}</b><br>
The most expensive project costs <b>{max_project}</b>
"""
        )
    )
    display(Markdown("<h4>Metrics aggregated by Categories</h4>"))
    style_df(aggregated_df)

    display(Markdown("<h4>Overview of Projects</h4>"))
    style_df(filtered_df[["Project Name", "Overall Score", "Scope Of Work"]])

    display(Markdown("<h4>Metric Scores by Project</h4>"))
    display(create_metric_chart(df2))
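
Since this commit is aimed at analysts learning the workflow, a quick way to sanity-check the helpers above is to run one of them on a tiny synthetic dataframe. The sketch below is not part of the commit: the category labels, scores, and costs are invented, and it assumes you run it from the `ha_portfolio/` folder so `_starterkit_utils` imports directly.

```python
# Minimal sketch (not part of this commit): exercise aggregate_by_category()
# on a made-up dataframe whose columns mirror the real dataset after
# reverse_snakecase() has been applied.
import pandas as pd
import _starterkit_utils

toy = pd.DataFrame(
    {
        "Category": ["Transit", "Transit", "Highway"],
        "Overall Score": [7, 5, 8],
        "Project Cost": [1_000_000, 3_000_000, 2_500_000],
        "Project Name": ["Project A", "Project B", "Project C"],
    }
)

print(_starterkit_utils.aggregate_by_category(toy))
# Highway: Median Score 8.0, Median Project Cost $2,500,000, Total Projects 1
# Transit: Median Score 6.0, Median Project Cost $2,000,000, Total Projects 2
```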
111 changes: 111 additions & 0 deletions ha_portfolio/ha_portfolio.ipynb
@@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "260ba8f3-dd02-4fdc-945d-450db01d188e",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"import calitp_data_analysis.magics\n",
"\n",
"# All your other packages go here\n",
"# Here I just want pandas and my own utils.\n",
"import pandas as pd\n",
"import _starterkit_utils "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a2996fd-29d0-4a19-ac48-a6957d9f8140",
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_columns = 100\n",
"pd.options.display.float_format = \"{:.2f}\".format\n",
"pd.set_option(\"display.max_rows\", None)\n",
"pd.set_option(\"display.max_colwidth\", None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d82c9a8-6f8f-485b-ace5-957f1b80c2f3",
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# district = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43a07a8c-567d-471d-be10-a547cd0b3a13",
"metadata": {},
"outputs": [],
"source": [
"%%capture_parameters\n",
"district"
]
},
{
"cell_type": "markdown",
"id": "cb5a0cc4-3e7e-4aea-81f2-c5e858fb315b",
"metadata": {},
"source": [
"# District {district} Analysis "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c91049e1-107d-47d9-9cda-63aa4fbf554b",
"metadata": {},
"outputs": [],
"source": [
"df = _starterkit_utils.load_dataset()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd1509c0-b435-456e-ad1c-b583a991f1e2",
"metadata": {},
"outputs": [],
"source": [
"_starterkit_utils.create_district_summary(df, district)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
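
The cell tagged `parameters` (`# district = 1`) is what lets the portfolio build run this notebook once per Caltrans district, and `%%capture_parameters` exposes `district` so the `{district}` placeholder in the Markdown heading is filled in for each run. The real builds go through the `build_starterkit_*` Makefile targets and `portfolio/portfolio.py` shown above; the papermill sketch below only illustrates the parameterization pattern, with hypothetical output paths and district numbers.

```python
# Hedged sketch: execute the notebook once per district by overriding the
# "parameters"-tagged cell. This is not how the Makefile invokes the build;
# it just shows what parameterized execution of this notebook looks like.
import papermill as pm

for district in [1, 4, 7]:  # hypothetical district numbers
    pm.execute_notebook(
        "ha_portfolio/ha_portfolio.ipynb",
        f"ha_portfolio/district_{district}.ipynb",  # hypothetical output path
        parameters={"district": district},
    )
```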
11 changes: 11 additions & 0 deletions portfolio/ha_starterkit_district/README.md
@@ -0,0 +1,11 @@
# Starter Kit Portfolio
I am revamping some of our exercises, and one of them will teach future analysts how to make a portfolio. Yay!

## Who We Are
We want our audience to understand who we are and why our expertise and research should be trusted. Here is a blurb you can lift.

This website was created by the [California Department of Transportation](https://dot.ca.gov/)'s Division of Data and Digital Services. We are a group of data analysts and scientists who analyze transportation data, such as General Transit Feed Specification (GTFS) data, or data from funding programs such as the Active Transportation Program. Our goal is to transform messy and indecipherable original datasets into usable, customer-friendly products to better the transportation landscape. For more of our work, visit our [portfolio](https://analysis.calitp.org/).

<img src="https://raw.githubusercontent.com/cal-itp/data-analyses/main/portfolio/Calitp_logo_MAIN.png" alt="Cal-ITP logo" width="200" height="100"> <img src="https://raw.githubusercontent.com/cal-itp/data-analyses/main/portfolio/CT_logo_Wht_outline.gif" alt="Caltrans logo" width="129" height="100">

<br>Caltrans®, the California Department of Transportation® and the Caltrans logo are registered service marks of the California Department of Transportation and may not be copied, distributed, displayed, reproduced or transmitted in any form without prior written permission from the California Department of Transportation.