From 53514f4d77b1c3e319aa5bce953615a54b22fc11 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Wed, 6 Nov 2024 19:12:51 +0000 Subject: [PATCH 1/2] copyediting starter kit --- gtfs_digest/_operators_prep_og.py | 90 +++++++++++++++++++++++++++++++ starter_kit/2024_basics_01.ipynb | 2 +- 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 gtfs_digest/_operators_prep_og.py diff --git a/gtfs_digest/_operators_prep_og.py b/gtfs_digest/_operators_prep_og.py new file mode 100644 index 000000000..289adc996 --- /dev/null +++ b/gtfs_digest/_operators_prep_og.py @@ -0,0 +1,90 @@ +from shared_utils import catalog_utils +import pandas as pd +import yaml + +GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") +# Readable Dictionary +with open("readable.yml") as f: + readable_dict = yaml.safe_load(f) + +def operator_profiles()->pd.DataFrame: + # Load operator profiles + op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet" + op_cols = ["organization_name", "name", "service_date", "schedule_gtfs_dataset_key"] + op_profiles_df = pd.read_parquet(op_profiles_url)[op_cols] + + # Keep the name with the most recent service date + op_profiles2 = (op_profiles_df.sort_values( + by=["name", "service_date"], + ascending=[True, False]) + ) + # Drop duplicated names + op_profiles3 = op_profiles2.drop_duplicates(subset=["name"]) + + # Drop duplicated organization names + op_profiles4 = (op_profiles3 + .drop_duplicates(subset = ['organization_name']) + .reset_index(drop = True)) + return op_profiles4 + + +def operators_schd_vp_rt()->pd.DataFrame: + """ + Operators who have schedule only OR have + both schedule and realtime data. 
+ """ + schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet" + + schd_vp_df = (pd.read_parquet(schd_vp_url, + filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]], + columns = [ "schedule_gtfs_dataset_key", + "caltrans_district", + "organization_name", + "name", + "sched_rt_category", + "service_date",] + ) + ) + + + schd_vp_df2 = ( + schd_vp_df.dropna(subset="caltrans_district") + .sort_values( + by=[ + "caltrans_district", + "organization_name", + "service_date", + ], + ascending=[True, True, False], + ) + .drop_duplicates( + subset=[ + "organization_name", + "caltrans_district", + ] + ) + .reset_index(drop=True) + ) + + schd_vp_df3 = ( + schd_vp_df2.sort_values( + by=["caltrans_district", "name", "service_date"], ascending=[True, False, False] + ) + .drop_duplicates(subset=["caltrans_district", "name"]) + .reset_index(drop=True) + ) + + schd_vp_df3 = schd_vp_df3[["caltrans_district","organization_name"]] + + op_profile = operator_profiles() + + # Merge + final = pd.merge( + schd_vp_df3, op_profile, on=["organization_name"], + how="left") + + final = (final + .sort_values(by = ["caltrans_district","organization_name"]) + .reset_index(drop = True) + ) + return final diff --git a/starter_kit/2024_basics_01.ipynb b/starter_kit/2024_basics_01.ipynb index 3431b92ed..956547687 100644 --- a/starter_kit/2024_basics_01.ipynb +++ b/starter_kit/2024_basics_01.ipynb @@ -8,7 +8,7 @@ "# Exercise 1: Familiarize yourself with `pandas` and `python`\n", "If you are new to Python, there are many resources!\n", "* There are introductory Python courses available through [Caltrans's LinkedIn Learning Library](https://www.linkedin.com/learning/search?keywords=python&u=36029164).\n", - "* [Practical Python for Data Science](https://www.practicalpythonfordatascience.com/00_python_crash_course) is an incredibly helpful book and material from this resource are linked throughout.\n", + "* [Practical Python for 
Data Science](https://www.practicalpythonfordatascience.com/00_python_crash_course) is an incredibly helpful resource. Material from it is linked throughout.\n", "\n", "## Skills \n", "* `pandas` is one of the base Python packages for working with tabular data.\n", From bd6af9169e68c5572415d7bad15c6dc088e4773b Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Wed, 6 Nov 2024 23:39:56 +0000 Subject: [PATCH 2/2] corrected typos and made instructions less longwinded --- starter_kit/2024_basics_01.ipynb | 715 ++--------- starter_kit/2024_basics_02.ipynb | 1595 ++----------------------- starter_kit/2024_basics_03.ipynb | 1482 ++--------------------- starter_kit/2024_basics_04.ipynb | 1906 ++---------------------------- starter_kit/2024_basics_05.ipynb | 33 +- 5 files changed, 454 insertions(+), 5277 deletions(-) diff --git a/starter_kit/2024_basics_01.ipynb b/starter_kit/2024_basics_01.ipynb index 956547687..9a838649e 100644 --- a/starter_kit/2024_basics_01.ipynb +++ b/starter_kit/2024_basics_01.ipynb @@ -5,27 +5,22 @@ "id": "247e773f-0e29-4ed6-ab4d-5856325611b4", "metadata": {}, "source": [ - "# Exercise 1: Familiarize yourself with `pandas` and `python`\n", - "If you are new to Python, there are many resources!\n", + "# Exercise 1: `pandas`,`python`, `f-strings`, Importing and Exporting data.\n", + "If you are new to Python, there are many resources to help you! Below is just a small sample of what is available.\n", "* There are introductory Python courses available through [Caltrans's LinkedIn Learning Library](https://www.linkedin.com/learning/search?keywords=python&u=36029164).\n", "* [Practical Python for Data Science](https://www.practicalpythonfordatascience.com/00_python_crash_course) is an incredibly helpful resource. 
Material from it is linked throughout.\n", "\n", - "## Skills \n", - "* `pandas` is one of the base Python packages for working with tabular data.\n", - "* F-strings\n", - "* Export to Google Cloud Storage\n", - "* Practice committing on GitHub\n", - "\n", "## How to use these tutorials\n", "* The tutorials are divided by skills/concepts we are going to learn.\n", "* There are hints and instructions on the top.\n", - "* There are links to references. **It is highly recommended to read through them and practice them in this notebook, in addition to these exercises.**\n", + "* There are links to references. \n", + "**It is highly recommended to read through them and practice them in this notebook.**\n", "\n", "## What are we working with today? \n", - "* Today we will be working on Caltrans System Investment Strategy (CSIS) today. Per this [description](https://dot.ca.gov/programs/transportation-planning/division-of-transportation-planning/corridor-and-system-planning/csis)\n", + "* Today we will be working on Caltrans System Investment Strategy (CSIS) data. Per this [description](https://dot.ca.gov/programs/transportation-planning/division-of-transportation-planning/corridor-and-system-planning/csis)\n", "> The California Department of Transportation (Caltrans) is committed to leading climate action and advancing social equity in the transportation sector set forth by the California State Transportation Agency (CalSTA) Climate Action Plan for Transportation Infrastructure (CAPTI, 2021)...Caltrans is in a significant leadership role to carry out meaningful measures that advance state’s goals and priorities through the development and implementation of the Caltrans System Investment Strategy (CSIS). 
The CSIS, which implements one of CAPTI’s key actions, is envisioned to be an investment framework through a data and performance-driven approach that guides transportation investments and decisions.\n", - "* DDS is working on CSIS is by automating the scoring of projects using Python. We score each project based on how well they do in various categories, aka metrics such as Zero Emmission Vehicles, Vehicle Miles Traveled, and more. \n", - "* While the values in we are working with today are all fake, the exercise is based on actual datasets and assignments. " + "* The Data Science Branch is working on CSIS by automating the scoring of projects using Python. We score each project based on how well they do on various metrics such as Zero Emission Vehicles, Vehicle Miles Traveled Reduction, and more. \n", + "* While the values we are working with today are all fake, the exercise is based on the actual data and work we've done. " ] }, { @@ -34,20 +29,18 @@ "metadata": {}, "source": [ "## Import Packages\n", - "* Before doing some data cleaning and analyzing, we need to equip ourselves with the right tools to get started.\n", - "* Part of our \"toolbox\" are packages. \n", - "\n", + "* Before doing some data cleaning and analyzing, we need to equip ourselves with the right tools.\n", + "* Part of our \"toolbox\" are importing packages. \n", "* **Resource**: [Importing Dependencies via Practical Python for Data Science](https://www.practicalpythonfordatascience.com/05_data_exploration.html?highlight=dependencies#importing-our-dependencies)\n", "\n", "### `Pandas`\n", - "* You are importing the package `pandas` that is the backbone of the majority of our data analysis work. \n", - "* You can import countless packages. \n", - "* We commonly use `geopandas` for geospatial data work. We use `altair` for making charts." + "* Below, you are importing the package `pandas` that is the backbone of our data analysis work. 
\n", + "* Other packages DDS commonly uses are `geopandas` for geospatial data work and `altair` for making charts." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "50199af7-04a8-43c5-ba1b-4127940749bd", "metadata": {}, "outputs": [], @@ -60,19 +53,18 @@ "id": "19b42c5d-4f2b-4d66-a7a7-98ab74a6591e", "metadata": {}, "source": [ - "* This block of code below adjusts the notebook.\n", - "* I am setting the maximum number of columns to be displayed to be 100.\n", + "* This block of code below adjusts the notebook's settings.\n", + "* I am setting the maximum number of columns to be displayed to be 100 because the default number of columns shown is much smaller.\n", "* I want any `float` columns to be rounded to 2 decimal points.\n", "* I want all of the rows in the dataframe to display. \n", - "* I don't want my columns to be truncated.\n", - " * If you have a column with `strings` that is very long, it will automatically cut off.\n", - " * Example: The California Department of Transportation (Caltrans) is committed to leading climate action and advancing social equity... would be displayed something like this The California Department of Transportation (Caltrans) is... without this line of code.\n", - "* Adjust some of these settings if you wish " + "* I don't want my string columns to be truncated.\n", + " * A long string value will display like this The California Department of Transportation (Caltrans) is committed to leading climate action and advancing social equity... would be displayed something like this The California Department of Transportation (Caltrans) is... without this line of code.\n", + "* Adjust some of these settings if you wish to make this notebook the proper environment for you." 
] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "8e18d8d7-2cce-4854-b6c4-56a7e7bdf636", "metadata": {}, "outputs": [], @@ -90,14 +82,15 @@ "source": [ "### `calitp_data_analysis`\n", "* DDS also has our own [internal library of functions](https://docs.calitp.org/data-infra/analytics_tools/python_libraries.html#calitp-data-analysis).\n", - "* You can check out all the functions [here](https://github.com/cal-itp/data-infra/tree/main/packages/calitp-data-analysis/calitp_data_analysis).\n", - "* Below, we are importing only one function called `to_snakecase` from the python submodule `sql` in our package `calitp_data_analysis`. `to_snakecase` allows us to change the column names of our dataset from something like `Project Description` to `project_description`. \n", - "* By turning the column names to lower case and replacing the spaces with underscores, this makes referencing specific columns much easier." + " * You can check out all the functions [here](https://github.com/cal-itp/data-infra/tree/main/packages/calitp-data-analysis/calitp_data_analysis).\n", + "* Below, we are importing only one function called `to_snakecase` from the python submodule `sql` in our package `calitp_data_analysis`. \n", + "* `to_snakecase` allows us to change the column names of our dataset from something like `Project Description` to `project_description`. \n", + " * Turning the column names to lower case and replacing the spaces with underscores makes referencing specific columns much easier." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "bd388d88-d2d6-4dd6-9870-22c14db7a44a", "metadata": {}, "outputs": [], @@ -112,8 +105,13 @@ "source": [ "## Jupyter Notebook\n", "* You're using a Jupyter Notebook right now.\n", - "* There are many benefits listed here in our [DDS Docs](https://docs.calitp.org/data-infra/analytics_new_analysts/04-notebooks.html).\n", - "* Take some time to get used to this interface. 
There are many tutorials available on Youtube that shows tips and tricks, just skip the installation portion. \n", + "* There are many benefits of using a notebook for our analysis, which you can read about here in our [DDS Docs](https://docs.calitp.org/data-infra/analytics_new_analysts/04-notebooks.html).\n", + "* Take some time to get used to this interface. \n", + " * Press ctrl+enter to run a cell\n", + " * Go up to the Kernel and rerun all the cells.\n", + " * Use the scissors at the top to cut out the cell.\n", + " * Adjust your settings to be dark instead of light.\n", + "* There are many tutorials available on Youtube, just skip the installation portion. \n", " * [This one looks promising](https://youtu.be/LW2Rye_l8L0?si=B8kojobCe3OIF3xg)." ] }, @@ -124,16 +122,17 @@ "source": [ "## Check out the data \n", "* Download the Excel workbook containing all the CSIS data from Google Cloud Storage [here](https://console.cloud.google.com/storage/browser/_details/calitp-analytics-data/data-analyses/starter_kit/starter_kit_csis_scoring_workbook.xlsx;tab=live_object?project=cal-itp-data-infra). \n", - " * Open it up in Excel and take a look at how many sheets and the data structure.\n", + "* Open the workbook up in Excel and take a look at how many sheets it contains.\n", + "\n", "### Read in the data\n", "* We are reading our Excel Workbook into a Pandas dataframe.\n", "* While there is a very [technical definition](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) of what a dataframe is, you can think of it as an Excel sheet that holds your data. 
\n", - "* Resource: [This page of the Practical Python for Data Science](https://www.practicalpythonfordatascience.com/02_loading_data)" + "* Resource: [Practical Python for Data Science](https://www.practicalpythonfordatascience.com/02_loading_data)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "5950cb87-75ab-4871-ab4b-a8f1c41f0a4a", "metadata": {}, "outputs": [], @@ -146,12 +145,12 @@ "id": "88d79cea-c017-454e-a2aa-85c0bf511d85", "metadata": {}, "source": [ - "* Read in the dataframe without `to_snakecase()` first to see what happens." + "* Read in the dataframe without the function `to_snakecase()` first to see what happens." ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "67ba9264-65d9-453b-a800-a91bd365e43e", "metadata": {}, "outputs": [], @@ -161,78 +160,10 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "e2d886b4-c207-41e5-8325-7275619b60e6", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ct_districtproject_nameScope of WorkProject Costlead agency
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)
14Bunny Hop Bike BoulevardA Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks.6929368Unicorn Fairy Express Bus (UFX)
\n", - "
" - ], - "text/plain": [ - " ct_district project_name \\\n", - "0 1 Meadow Magic Multi-Use Path \n", - "1 4 Bunny Hop Bike Boulevard \n", - "\n", - " Scope of Work \\\n", - "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", - "1 A Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks. \n", - "\n", - " Project Cost lead agency \n", - "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", - "1 6929368 Unicorn Fairy Express Bus (UFX) " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_no_snakecase.head(2)" ] @@ -247,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "e09456e0-dfd2-4388-85de-eb9e95f983fa", "metadata": {}, "outputs": [], @@ -257,78 +188,10 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "54c718b3-eeff-4ec5-b012-1cc612543c60", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ct_districtproject_namescope_of_workproject_costlead_agency
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)
14Bunny Hop Bike BoulevardA Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks.6929368Unicorn Fairy Express Bus (UFX)
\n", - "
" - ], - "text/plain": [ - " ct_district project_name \\\n", - "0 1 Meadow Magic Multi-Use Path \n", - "1 4 Bunny Hop Bike Boulevard \n", - "\n", - " scope_of_work \\\n", - "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", - "1 A Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks. \n", - "\n", - " project_cost lead_agency \n", - "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", - "1 6929368 Unicorn Fairy Express Bus (UFX) " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.head(2)" ] @@ -340,89 +203,19 @@ "source": [ "### Previewing Data \n", "* Often, you want to get a sneak preview of your data. \n", - "* Thankfully, Python provides many methods for you to do so. \n", "* Below are a couple of very common methods we use. \n", " * `.head()` shows the first five rows, while `.tail()` shows the last five.\n", " * `.sample()` shows you a random row.\n", - " * Want to see or less than five? Specify it in the parantheses: `.head(10)` allows you to see the first 10 rows and `.head(2)` allows you to see the first 2.\n", - "* Try everything yourself below.\n", + " * Want to see or less than five? Specify it in the parantheses: `.head(10)` allows you to see the first 10 rows and `.sample(2)` allows you to see two random rows.\n", "* **Resource**: [Practical Python for Data Science: Data Inspection](https://www.practicalpythonfordatascience.com/02_loading_data)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "5e966250-47b1-4f14-802b-c795e44330dd", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ct_districtproject_namescope_of_workproject_costlead_agency
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)
14Bunny Hop Bike BoulevardA Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks.6929368Unicorn Fairy Express Bus (UFX)
\n", - "
" - ], - "text/plain": [ - " ct_district project_name \\\n", - "0 1 Meadow Magic Multi-Use Path \n", - "1 4 Bunny Hop Bike Boulevard \n", - "\n", - " scope_of_work \\\n", - "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", - "1 A Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks. \n", - "\n", - " project_cost lead_agency \n", - "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", - "1 6929368 Unicorn Fairy Express Bus (UFX) " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.head(2)" ] @@ -436,40 +229,11 @@ "* `df.shape` gives you the number of rows and columns in your dataset.\n", "* `df.columns` returns all of the column names.\n", "* `df.info()` per the [pandas docs](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.info.html#pandas.DataFrame.info) prints information about a DataFrame including the index dtype and columns, non-null values and memory usage.\n", - "* Experiment below. \n", + "* **Experiment below.** \n", "* More food for thought:\n", " * `Dtype` is critical. There are integers, objects, booleans, floats...\n", " * Does the `dtype` of each column below make sense to you? \n", - " * The `dtype` of `object` is a catchall term." 
- ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "7f55b33e-d402-473b-815a-92ad935d35d7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 44 entries, 0 to 43\n", - "Data columns (total 5 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 ct_district 44 non-null int64 \n", - " 1 project_name 44 non-null object\n", - " 2 scope_of_work 44 non-null object\n", - " 3 project_cost 44 non-null int64 \n", - " 4 lead_agency 44 non-null object\n", - "dtypes: int64(2), object(3)\n", - "memory usage: 1.8+ KB\n" - ] - } - ], - "source": [ - "df.info()" + " * The `dtype` of `object` is a catchall term. It can either contain all string values like \"muffins\" and \"apples\" or a mix of string and other data types like \"6 muffins\" and \"3 apples.\"" ] }, { @@ -478,46 +242,16 @@ "metadata": {}, "source": [ "### Deeper Dive\n", - "* We now know a good amount about our dataset, but the # of rows and columns are not always so thrilling. \n", "* Let's take a closer look at some columns.\n", "* `.value_counts()` helps you see how many times the same value appears. 
" ] }, - { - "cell_type": "markdown", - "id": "55cece73-c3d5-4cd7-8896-f97d43fc1114", - "metadata": {}, - "source": [] - }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "63f21ab5-0920-4310-afce-2ea657556912", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4 6\n", - "3 6\n", - "8 5\n", - "11 5\n", - "12 4\n", - "5 4\n", - "9 3\n", - "6 3\n", - "7 3\n", - "2 2\n", - "10 2\n", - "1 1\n", - "Name: ct_district, dtype: int64" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.ct_district.value_counts()" ] @@ -534,42 +268,20 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "1d832308-a425-404d-83a0-53ce8bfae279", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "44" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.project_name.nunique()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "55d2140f-feab-496b-b9b1-90bbe5701a9a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(44, 5)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.shape" ] @@ -579,30 +291,48 @@ "id": "7c0c499e-fa7b-4f01-a357-db7b0ec41416", "metadata": {}, "source": [ - "* You can preview a column with brackets [] as well with the column name encased in quotation marks." + "* You can preview a column with brackets [] as well with the column name encased in quotation marks.\n", + "* However, simply using a period . is much easier." 
] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "4e232324-f75f-46a0-962d-76ed9273dac7", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "44" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df[\"scope_of_work\"].nunique()" ] }, + { + "cell_type": "markdown", + "id": "cfbb9b16-b7cd-44d5-af92-6c6351a35022", + "metadata": {}, + "source": [ + "* `.describe()` gives you some descriptive statistics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83536bb4-1939-4ceb-8d62-7aeab1473993", + "metadata": {}, + "outputs": [], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "497ae835-e7eb-45b8-b593-eaf44cb5858e", + "metadata": {}, + "outputs": [], + "source": [ + "df.project_cost.describe()" + ] + }, { "cell_type": "markdown", "id": "06ee15f6-ee2e-4e3e-91b2-115875292042", @@ -639,6 +369,7 @@ "id": "21a32ab4-bfb2-4e7a-b90a-6fa05b7ceb89", "metadata": {}, "source": [ + "### Application of Lists\n", "* I am placing all of the sheets in our Excel Workbook in a list.\n", "* Notice that the items in this list are strings. 
\n", " * Read about strings [here](https://www.practicalpythonfordatascience.com/00_python_crash_course_datatypes.html?highlight=dictionary#string).\n", @@ -649,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "02380fb6-c55b-477f-acfb-8b483e83beac", "metadata": {}, "outputs": [], @@ -660,68 +391,25 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "8a9a1a3e-e10d-4447-96dd-92ecb2fe6357", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(my_sheets)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "a3be037d-b21b-4192-9099-25bfcb660f01", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'projects_auto'" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# Index\n", + "# Index 0 is projects_auto\n", "my_sheets[0]" ] }, - { - "cell_type": "code", - "execution_count": 30, - "id": "ebf91535-a466-446a-9f7a-606503d78b6a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'overall_score'" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_sheets[1]" - ] - }, { "cell_type": "markdown", "id": "75df89d0-92fb-4e4e-aaa3-54f4944c55c3", @@ -733,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "2e2578bc-db1f-41f5-bc07-3cb82998420e", "metadata": {}, "outputs": [], @@ -752,27 +440,8 @@ "### Specificity is beautiful.\n", "* Grab out each individual sheet into its own dataframe using `df2.get(my_sheets[enter in the index number])`. \n", "* Make sure your `dataframe` is titled descriptively.\n", - "* `df` is not exactly very telling. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "4c6f8fdb-33d3-4c44-bb00-6d1447d49feb", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df = to_snakecase(df2.get(my_sheets[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "167af2f1-b09d-476d-87b4-b9374ad445c2", - "metadata": {}, - "outputs": [], - "source": [ - "scores_df = to_snakecase(df2.get(my_sheets[1]))" + "* `df` is not exactly very telling. \n", + "* Use the function `to_snakecase` to clean up your column names" ] }, { @@ -782,28 +451,27 @@ "source": [ "## Add a new column\n", "* Oops! Us analysts were so wrapped up in scoring, we forgot to to total up all the metrics to find the overall_score for the project. \n", - "* Sum up all the metric columns into a column called `overall_score`\n", + "* Using the dataframe you read in from the Excel sheet \"Overall Score\", sum up all the metric columns into a column called `overall_score`\n", "* There are a couple of ways to do this: experiment! \n", "* Here are some resources:\n", " * [Stackoverflow](https://stackoverflow.com/questions/22342285/summing-two-columns-in-a-pandas-dataframe)\n", " * [Statology](https://www.statology.org/pandas-sum-specific-columns/)\n", "* Food for thought:\n", + " * What happens when you create a new column with `scores_df.overall_score` instead of `scores_df[\"overall_score\"]`? \n", " * What does `axis = 1` mean?\n", " * What happens if you do `.sum(axis=0)`?\n", " * You don't always have to save everything into a dataframe. You can do something like `df.sum(axis=0)` just to see what happens. \n", " * Just make sure your dataframe isn't too large or else you will run out of memory!\n", - " * What happens when you create a new column with `scores_df.overall_score` instead of `scores_df[\"overall_score\"]`? 
" + " " ] }, { "cell_type": "code", - "execution_count": 34, - "id": "e9321f90-8c99-46fb-9d50-8571f3d94fc8", + "execution_count": null, + "id": "93085a1c-d479-424d-b2a1-d8e6cba150ab", "metadata": {}, "outputs": [], - "source": [ - "scores_df[\"overall_score\"] = scores_df.select_dtypes(include=['int64', 'float64']).sum(axis=1)" - ] + "source": [] }, { "cell_type": "markdown", @@ -813,27 +481,26 @@ }, "source": [ "## Subsetting\n", - "* Your manager asks for the `overall_score` for each project. \n", + "* Your manager asks for the `overall_score` for each project in Excel format. \n", "* They do not want to see the other metrics, only the project's name and its `overall_score`\n", - "* Subset the dataframe and save it into a new dataframe.\n", - "* Again, there are many ways to do the same thing in Python. \n", + "* Subset the dataframe and saveit into a new dataframe.\n", "* Method 1: Enter in all the columns you want to keep in a list and place the list in another set of brackets." ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "4e6d8e70-ae57-46c5-a5aa-9972be77f415", "metadata": {}, "outputs": [], "source": [ "# Enter in the columns you want to keep\n", - "columns_to_keep = [\"project_name\",\"overall_score\"]" + "columns_to_keep = []" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "48ee899b-3db9-464f-802f-d431189176b7", "metadata": { "scrolled": true, @@ -854,7 +521,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "2c64cdcf-9598-4f4a-b077-5caec0cfe264", "metadata": {}, "outputs": [], @@ -865,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "47a96b86-e5d1-4fcd-ba73-7db5badae28b", "metadata": { "scrolled": true, @@ -874,7 +541,7 @@ "outputs": [], "source": [ "\n", - "# subsetted_df2 = scores_df.drop(columns = columns_to_drop)" + "subsetted_df2 = scores_df.drop(columns = columns_to_drop)" ] }, { @@ 
-884,18 +551,19 @@ "source": [ "## F-Strings\n", "* Save your subsetted dataframe from above back into the `starter_kit` folder. \n", - " * The file path should be something like this `\"gs://calitp-analytics-data/data-analyses/starter_kit/aggregated_csis.xlsx\"`.\n", + "* The file path should be something like this `\"gs://calitp-analytics-data/data-analyses/starter_kit/your_file_name_here.xlsx\"`.\n", "* However, remember our original Excel workbook's file path? It was`\"gs://calitp-analytics-data/data-analyses/starter_kit/starter_kit_csis_scoring_workbook.xlsx\"`\n", - "* Essentially, the **only** difference between these two file paths are `aggregated_csis.xlsx` and `starter_kit_csis_scoring_workbook.xlsx` because the folder path `gs://calitp-analytics-data/data-analyses/starter_kit/` remains the same. \n", - "* This is where f-strings come in. Read more about them [here](https://realpython.com/python-f-strings/#f-strings-a-new-and-improved-way-to-format-strings-in-python).\n", + "* The **only** difference between these two file paths are `your_file_name_here.xlsx` and `starter_kit_csis_scoring_workbook.xlsx` because the folder path `gs://calitp-analytics-data/data-analyses/starter_kit/` remains the same. \n", + "* This is where f-strings come in. \n", "> Python f-strings provide a quick way to interpolate and format strings. 
They’re readable, concise, and less prone to error than traditional string interpolation and formatting tools...\n", - "* Let's practice !\n", - " * My file_path is always going to be `gs://calitp-analytics-data/data-analyses/starter_kit/`.\n" + " * Excerpt from [here](https://realpython.com/python-f-strings/#f-strings-a-new-and-improved-way-to-format-strings-in-python).\n", + "#### Application of F-Strings\n", + "* My file_path is always going to be `gs://calitp-analytics-data/data-analyses/starter_kit/` so I'll set that in its own variable.\n" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "4c9c53a5-dbf3-4dc0-aea0-832f3a91414d", "metadata": {}, "outputs": [], @@ -909,18 +577,18 @@ "metadata": {}, "source": [ "* However the file is going to change.\n", - "* Save the file name in a variable called `FILE`." + "* Save the file name in a new variable called `FILE`." ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "db111f34-08b8-42f9-96fe-6852c4af50ad", "metadata": {}, "outputs": [], "source": [ "\n", - "FILE = \"starter_kit_example_final_scores.xlsx\"" + "FILE = " ] }, { @@ -933,21 +601,10 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "edff403c-ef37-48d8-8c7a-60b388752a51", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'gs://calitp-analytics-data/data-analyses/starter_kit/starter_kit_example_final_scores.xlsx'" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Put them together using a f-string\n", "f\"{GCS_FILE_PATH}{FILE}\"" @@ -965,7 +622,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "bf37fc2d-ac6c-4134-94de-79a9a4141ffc", "metadata": {}, "outputs": [], @@ -979,137 +636,13 @@ "id": "17c17adb-404e-4e54-bdb4-c3295e0e2be2", "metadata": {}, "source": [ + "### Parquets\n", "* Export the entire (not subsetted) 
dataframe with the new `overall_score` column using `df.to_parquet()`. \n", " * We typically prefer saving to `parquets`. Why? Read below. Text taken from [here](https://docs.calitp.org/data-infra/analytics_new_analysts/03-data-management.html#parquet).\n", " * Parquet is an “open source columnar storage format for use in data analysis systems.” Columnar storage is more efficient as it is easily compressed and the data is more homogenous. CSV files utilize a row-based storage format which is harder to compress, a reason why Parquets files are preferable for larger datasets. Parquet files are faster to read than CSVs, as they have a higher querying speed and preserve datatypes (i.e. Number, Timestamps, Points). They are best for intermediate data storage and large datasets (1GB+) on most any on-disk storage. This file format is also good for passing dataframes between Python and R. A similar option is feather.\n", "* Reference\n", - " * [DDS Docs: Saving Code](https://docs.calitp.org/data-infra/analytics_tools/saving_code.html)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "22562f2f-8359-4e44-951c-25e5ac033282", - "metadata": {}, - "outputs": [], - "source": [ - "scores_df.to_parquet(f\"{GCS_FILE_PATH}starter_kit_example_final_scores.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "9bc1a3cb-85e2-4203-bdd4-e45bb6c20ba4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_nameaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_score
0Meadow Magic Multi-Use Path28810235327661072
1Bunny Hop Bike Boulevard3976763221026568
\n", - "
" - ], - "text/plain": [ - " project_name accessibility_score dac_accessibility_score \\\n", - "0 Meadow Magic Multi-Use Path 2 8 \n", - "1 Bunny Hop Bike Boulevard 3 9 \n", - "\n", - " dac_traffic_impacts_score freight_efficiency_score \\\n", - "0 8 10 \n", - "1 7 6 \n", - "\n", - " freight_sustainability_score mode_shift_score lu_natural_resources_score \\\n", - "0 2 3 5 \n", - "1 7 6 3 \n", - "\n", - " safety_score vmt_score zev_score public_engagement_score \\\n", - "0 3 2 7 6 \n", - "1 2 2 10 2 \n", - "\n", - " climate_resilience_score program_fit_score overall_score \n", - "0 6 10 72 \n", - "1 6 5 68 " - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scores_df.head(2)" + " * [DDS Docs: Saving Code](https://docs.calitp.org/data-infra/analytics_tools/saving_code.html)\n", + "* Make sure you use a f-string." ] }, { diff --git a/starter_kit/2024_basics_02.ipynb b/starter_kit/2024_basics_02.ipynb index f9fc0551e..fd3759d45 100644 --- a/starter_kit/2024_basics_02.ipynb +++ b/starter_kit/2024_basics_02.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "6cbbfb96-1e9e-400a-9884-72f08d1191f3", "metadata": {}, "outputs": [], @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "3da62b06-24b4-4791-a073-185ee3765152", "metadata": {}, "outputs": [], @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "e7e4cafe-eb24-477b-a45c-88bfcaff37f3", "metadata": {}, "outputs": [], @@ -53,240 +53,6 @@ "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/starter_kit/\"" ] }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2c4af22f-91ac-4e03-8b80-2121adc9a348", - "metadata": {}, - "outputs": [], - "source": [ - "EXCEL_FILE = \"starter_kit_csis_scoring_workbook.xlsx\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": 
"873bfb72-9b47-472c-a18b-248be7f8c694", - "metadata": {}, - "outputs": [], - "source": [ - "OVERALL_SCORE_FILE = \"starter_kit_example_final_scores.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6cf0c667-b81a-430f-afb8-68f4e0f0a147", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df = to_snakecase(pd.read_excel(f\"{GCS_FILE_PATH}{EXCEL_FILE}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "7de4e3b1-15bb-4f37-a392-36c3c0d3e39d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ct_districtproject_namescope_of_workproject_costlead_agency
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)
14Bunny Hop Bike BoulevardA Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks.6929368Unicorn Fairy Express Bus (UFX)
\n", - "
" - ], - "text/plain": [ - " ct_district project_name \\\n", - "0 1 Meadow Magic Multi-Use Path \n", - "1 4 Bunny Hop Bike Boulevard \n", - "\n", - " scope_of_work \\\n", - "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", - "1 A Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks. \n", - "\n", - " project_cost lead_agency \n", - "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", - "1 6929368 Unicorn Fairy Express Bus (UFX) " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "projects_df.head(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8a5e10d5-f978-408d-87d9-05f930038a47", - "metadata": {}, - "outputs": [], - "source": [ - "overall_scores_df = pd.read_parquet(f\"{GCS_FILE_PATH}{OVERALL_SCORE_FILE}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "898592ba-7655-41c9-a982-251491bd9083", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_nameaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_score
0Meadow Magic Multi-Use Path28810235327661072
1Bunny Hop Bike Boulevard3976763221026568
\n", - "
" - ], - "text/plain": [ - " project_name accessibility_score dac_accessibility_score \\\n", - "0 Meadow Magic Multi-Use Path 2 8 \n", - "1 Bunny Hop Bike Boulevard 3 9 \n", - "\n", - " dac_traffic_impacts_score freight_efficiency_score \\\n", - "0 8 10 \n", - "1 7 6 \n", - "\n", - " freight_sustainability_score mode_shift_score lu_natural_resources_score \\\n", - "0 2 3 5 \n", - "1 7 6 3 \n", - "\n", - " safety_score vmt_score zev_score public_engagement_score \\\n", - "0 3 2 7 6 \n", - "1 2 2 10 2 \n", - "\n", - " climate_resilience_score program_fit_score overall_score \n", - "0 6 10 72 \n", - "1 6 5 68 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "overall_scores_df.head(2)" - ] - }, { "cell_type": "markdown", "id": "4c2dd160-ec10-41ce-b5c0-a9be5934d6ee", @@ -299,11 +65,11 @@ " * Min overall score\n", " * Number of unique projects\n", "* Annoyingly enough, the `overall_score` column and the `ct_district` are in two different dataframes. \n", - "* You'll have to merge it on the common column(s) the two dataframes share.\n", + "* You'll have to merge the dataframes on the common column(s) the two dataframes share.\n", "* Welcome to DDS! This will happen to you all the time starting now. \n", "\n", "### Relevant Resources\n", - "* Read about and practice merges before diving in. \n", + "* Read about and practice merges before continuing on the exercise. 
\n", " * [Resource #1 is a great tutorial for beginners](https://www.practicalpythonfordatascience.com/03_cleaning_data.html?highlight=merge#merging-dataframes-together).\n", " * [Resource #2 is written by our own Tiffany Ku, but it contains some geospatial references so it's a bit more to digest](https://docs.calitp.org/data-infra/analytics_new_analysts/01-data-analysis-intro.html#merge-tabular-and-geospatial-data-for-data-analysis).\n", " " @@ -311,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "2d356494-a12a-4f67-beb3-b6ba92c8135f", "metadata": {}, "outputs": [], @@ -328,7 +94,7 @@ "**Food for Thought**\n", "* Which columns do the two dataframes have in common?\n", "* What type of merge will achieve my goal?\n", - " * Inner, outer, left, or right\n", + " * Inner, outer, left, or right?\n", "* What do I expect out of the merge?\n", " * Do I expect all the values of the merge keys to be 1:1? Or m:1? \n", " * Do I expect a project to correspond with multiple districts? Maybe, projects can and do cross multiple boundaries.\n", @@ -345,80 +111,7 @@ "### Double Checking\n", "* How many rows do you expect?\n", "* How many unique projects are there? 
\n", - "* Hint: check your original dataframes as well" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ad4962ca-ed83-48a3-b1e6-79e5d5b1042b", - "metadata": {}, - "outputs": [], - "source": [ - "m1 = pd.merge(projects_df, overall_scores_df, on=[\"project_name\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "e820d7af-17d4-4b2a-8007-5d958a3f7d9e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(41, 19)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m1.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "3642de14-3bf4-47c0-bd80-3502819ea14d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "41" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m1.project_name.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "4b5be67a-f579-4f22-97cb-b6b31d7b8433", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "44" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "projects_df.project_name.nunique()" + "* Hint: check the lengths of your original dataframes as well" ] }, { @@ -430,44 +123,17 @@ "* As you have noticed, we are missing a couple of projects.\n", "* This is where `outer` joins are very useful.\n", "* Merge your dataframes again using an `outer` join and with `indicator = True` on.\n", - "* Using `.value_counts()` check out how many rows are found in both dataframes, the left only, and the right only" + " * `m2 = pd.merge(df1, df2, on=[column], indicator=True, how=\"outer\")`\n", + "* Using `.value_counts()` on the column named `_merge` created by `indicator=True` to check out how many rows are found in both dataframes, the left only, and the right only" ] }, { "cell_type": "code", - "execution_count": 15, - "id": 
"98e92c3f-ccd4-45f8-b6a6-523ddcb4a7ac", + "execution_count": null, + "id": "dcfb00b5-a08a-49b2-8978-e42abf4538fe", "metadata": {}, "outputs": [], - "source": [ - "m2 = pd.merge(\n", - " projects_df, overall_scores_df, on=[\"project_name\"], indicator=True, how=\"outer\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "f134cddf-5220-44f9-9e15-1c5171cbedfd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "both 41\n", - "left_only 3\n", - "right_only 3\n", - "Name: _merge, dtype: int64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m2._merge.value_counts()" - ] + "source": [] }, { "cell_type": "markdown", @@ -482,181 +148,27 @@ }, { "cell_type": "code", - "execution_count": 17, - "id": "4dd07bab-4d1b-41a0-954e-4c2d59584e57", + "execution_count": null, + "id": "97144120-29b6-4948-aa68-736ddbeb8d38", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_name_merge
10Rainbow Rush hot Lanesleft_only
12Bunny Lane HOV+2 heavenleft_only
26main street muffin topleft_only
44Rainbow Rush HOT Lanesright_only
45Bunny Lane HOV+2 Havenright_only
46Main Street Muffin Top Revitalizationright_only
\n", - "
" - ], - "text/plain": [ - " project_name _merge\n", - "10 Rainbow Rush hot Lanes left_only\n", - "12 Bunny Lane HOV+2 heaven left_only\n", - "26 main street muffin top left_only\n", - "44 Rainbow Rush HOT Lanes right_only\n", - "45 Bunny Lane HOV+2 Haven right_only\n", - "46 Main Street Muffin Top Revitalization right_only" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m2.loc[m2._merge != \"both\"][[\"project_name\", \"_merge\"]]" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", "id": "044330d9-8562-4510-ae62-268f240ec3bc", "metadata": {}, "source": [ - "* You could also use `isin([list of elements you want to keep])`" + "* You could also use `isin([list of elements you want to keep])` to retain multiple elements you want." ] }, { "cell_type": "code", - "execution_count": 18, - "id": "c47ef38d-6db5-4bf1-bd87-62b7d84943b6", + "execution_count": null, + "id": "121e64fa-cf65-4a3b-9a53-43994223d150", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_name_merge
10Rainbow Rush hot Lanesleft_only
12Bunny Lane HOV+2 heavenleft_only
26main street muffin topleft_only
44Rainbow Rush HOT Lanesright_only
45Bunny Lane HOV+2 Havenright_only
46Main Street Muffin Top Revitalizationright_only
\n", - "
" - ], - "text/plain": [ - " project_name _merge\n", - "10 Rainbow Rush hot Lanes left_only\n", - "12 Bunny Lane HOV+2 heaven left_only\n", - "26 main street muffin top left_only\n", - "44 Rainbow Rush HOT Lanes right_only\n", - "45 Bunny Lane HOV+2 Haven right_only\n", - "46 Main Street Muffin Top Revitalization right_only" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m2.loc[m2._merge.isin([\"left_only\",\"right_only\"])][[\"project_name\", \"_merge\"]]" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -668,24 +180,11 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "09ff3055-29ee-4ea1-a164-5d4796aa1807", + "execution_count": null, + "id": "a1ffc2fe-e611-4cd1-8491-957a8f50b523", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(41, 20)" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m2.loc[~m2._merge.isin([\"left_only\",\"right_only\"])].shape" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -695,21 +194,21 @@ "### Dictionaries\n", "* String data is often entered in many different ways. \n", " * BART can be entered in as bart, Bay Area Rapid Transit, BaRT, and more. \n", - "* Often, strings are the reason why your dataframe is not merging properly.\n", + "* Often, differing strings between two dataframes are the reason why your dataframe is not merging properly.\n", "* In Excel, it's easy to go in and manually tweak everything. However, that is not reproducible and time consuming. \n", "* Luckily with Python we can automate this. 
\n", - "* Since there are a couple of names to replace, we can do it using a dictionary.\n", + "* Since there are only a couple of names to replace, we can do it using a dictionary.\n", "\n", "#### What is a dictionary?\n", "* Per Practical Python for Data Science, a dictionary is Dictionaries are used to store data values in key:value pairs. Similar to the list, a dictionary is a collection of objects. It is also mutable, meaning that you can add, remove, change values inside of it...With the list, we access elements using the index. With the dictionary, we access elements using keys..\n", - "* Dictionaries are very important. \n", + "* Dictionaries are very important.\n", "* Read more [here](https://www.practicalpythonfordatascience.com/00_python_crash_course_datatypes.html?highlight=dictionary#dictionary) and **follow its example in the cells below.**\n", " " ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "df6fa95e-cc25-4142-8c2b-ee254863e609", "metadata": {}, "outputs": [], @@ -722,7 +221,7 @@ "id": "76e42f11-fdcb-48f3-8951-2f2cea0384c0", "metadata": {}, "source": [ - "#### Replacing Values\n", + "#### Application of Dictionaries: Replacing Values\n", "* [Resource](https://www.practicalpythonfordatascience.com/03_cleaning_data#recoding-column-values)\n", "* **Step 1**: Filter out for the rows that didn't merge. 
Find the unique values of the `project_name` column using `.unique()`\n", "* Take a look at elements using \n", @@ -734,27 +233,11 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "5601fd36-d221-41da-ab76-b88c616e5e62", + "execution_count": null, + "id": "5cdc18c4-2312-41ae-a027-8c0fe164018b", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['Rainbow Rush hot Lanes', 'Bunny Lane HOV+2 heaven',\n", - " 'main street muffin top ', 'Rainbow Rush HOT Lanes',\n", - " 'Bunny Lane HOV+2 Haven', 'Main Street Muffin Top Revitalization'],\n", - " dtype=object)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m2.loc[m2._merge.isin([\"left_only\",\"right_only\"])].project_name.unique()" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -762,22 +245,18 @@ "metadata": {}, "source": [ "* **Step 2:** Decide whether you want to rename the values in the left dataframe or the right one. \n", - "* **Step 3:** The keys, are the values you want to replace. The values, are what you want to replace these values with. " + "* **Step 3:** The keys, are the values you want to replace. The values, are what you want to replace these values with. 
\n", + " * Let's say my left value is \"AC Transit\" but I want it to be \"Alameda Contra Costa County Transit\", my dictionary would be \n", + " * `my_dict = {\"AC Transit\":\"Alameda Contra Costa County Transit\"}`" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "9dad92fe-87a6-434d-a62f-d269f3ad1054", "metadata": {}, "outputs": [], - "source": [ - "new_names = {\n", - " \"main street muffin top \": \"Main Street Muffin Top Revitalization\",\n", - " \"Bunny Lane HOV+2 heaven\": \"Bunny Lane HOV+2 Haven\",\n", - " \"Rainbow Rush hot Lanes\": \"Rainbow Rush HOT Lanes\",\n", - "}" - ] + "source": [] }, { "cell_type": "markdown", @@ -789,12 +268,12 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "d8532992-771e-446a-b419-55ad757ff45f", "metadata": {}, "outputs": [], "source": [ - "projects_df.project_name = projects_df.project_name.replace(new_names)" + "df.project_name = df.project_name.replace(your_dictionary)" ] }, { @@ -802,157 +281,34 @@ "id": "68562b10-b9bd-4892-8780-a66cad1a06d4", "metadata": {}, "source": [ - "#### Merge your dataframes again. This time the number of unique project names should match the rows of the merged dataframe perfectly." + "#### Merge your dataframes again. \n", + "* This time the number of unique project names should match the rows of the merged dataframe perfectly.\n", + "* Make sure to double check that!" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "db09aa04-7a94-4b94-9ade-10b1a987e006", "metadata": {}, "outputs": [], - "source": [ - "final_m = pd.merge(projects_df, overall_scores_df, how=\"inner\", on=\"project_name\")" - ] - }, - { - "cell_type": "markdown", - "id": "144aa8a8-df59-418a-a0c7-4dbc3537c68f", - "metadata": {}, - "source": [ - "* You can check if two values are equal using `==`." 
- ] + "source": [] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "04f4f6d8-55b6-460c-8a52-8626dcfd1cb9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(final_m) == final_m.project_name.nunique()" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "39d74f54-a72b-4acc-91b0-b3dcb4539a92", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ct_districtproject_namescope_of_workproject_costlead_agencyaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_score
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)28810235327661072
\n", - "
" - ], - "text/plain": [ - " ct_district project_name \\\n", - "0 1 Meadow Magic Multi-Use Path \n", - "\n", - " scope_of_work \\\n", - "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", - "\n", - " project_cost lead_agency \\\n", - "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", - "\n", - " accessibility_score dac_accessibility_score dac_traffic_impacts_score \\\n", - "0 2 8 8 \n", - "\n", - " freight_efficiency_score freight_sustainability_score mode_shift_score \\\n", - "0 10 2 3 \n", - "\n", - " lu_natural_resources_score safety_score vmt_score zev_score \\\n", - "0 5 3 2 7 \n", - "\n", - " public_engagement_score climate_resilience_score program_fit_score \\\n", - "0 6 6 10 \n", - "\n", - " overall_score \n", - "0 72 " - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "final_m.head(1)" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -965,14 +321,11 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "70410a43-62c9-467c-b777-3415f22abe01", "metadata": {}, "outputs": [], - "source": [ - "\n", - "final_m.to_parquet(f\"{GCS_FILE_PATH}starter_kit_example_merge.parquet\")" - ] + "source": [] }, { "cell_type": "markdown", @@ -981,24 +334,24 @@ "source": [ "## Groupby\n", "* You're done merging...Oh wait, that wasn't even part of your manager's request. You still need to aggregate. 
\n", - "* The refresh your memory by Caltrans District to find\n", + "* As a refresher, the request was to find the following by Caltrans District:\n", " * Median overall score\n", " * Max overall score \n", " * Min overall score\n", " * Number of unique projects\n", "* There are many options Some are `groupby / agg`, `pivot_table`, `groupby / transform`\n", - "* Resource: Use the space below to explore this example.\n", + "* Resource: \n", " * [DDS Docs](https://docs.calitp.org/data-infra/analytics_new_analysts/01-data-analysis-intro.html#aggregating)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "02f34bdf-3c17-4674-bdf0-f9982e7fac0a", "metadata": {}, "outputs": [], "source": [ - "# Practice tutorial here" + "# Practice tutorial linked above here" ] }, { @@ -1007,222 +360,19 @@ "metadata": {}, "source": [ "### Apply your new knowledge to the prompt above.\n", - "* Hint: After aggregating, your column name will no longer be relevant. For example, if you use `scope_of_work` to count the number of projects, this column no longer represents `scope_of_work`. 
It should be renamed something like `n_projects`.\n", - " * Rename your columns using this `df.rename(columns={\"old_column_name\":\"new_column_name\"})`" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "7328fcf2-ea52-46b8-8624-a7f3f39428df", - "metadata": {}, - "outputs": [], - "source": [ - "final_m[\"min_score\"] = final_m.overall_score" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "8dc4063c-1150-4b67-a125-16f245f4b9c4", - "metadata": {}, - "outputs": [], - "source": [ - "final_m[\"max_score\"] = final_m.overall_score" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "0892a805-7d7f-47cf-b086-f5e320c5361c", - "metadata": {}, - "outputs": [], - "source": [ - "agg1 = (\n", - " final_m.groupby([\"ct_district\"])\n", - " .agg(\n", - " {\n", - " \"overall_score\": \"median\",\n", - " \"min_score\": \"min\",\n", - " \"max_score\": \"max\",\n", - " \"project_name\": \"nunique\",\n", - " }\n", - " )\n", - " .reset_index()\n", - ")" + "* Hint: After aggregating, some of the column names will no longer be relevant. \n", + "* For example, if you use `scope_of_work` to count the number of projects, this column no longer represents `scope_of_work`.\n", + "* It should be renamed something like `n_projects`.\n", + "* Rename your columns using this `df.rename(columns={\"old_column_name\":\"new_column_name\"})`" ] }, { "cell_type": "code", - "execution_count": 32, - "id": "94c178b1-ff70-4d63-8820-aef101928c75", - "metadata": {}, - "outputs": [], - "source": [ - "agg1 = agg1.rename(\n", - " columns={\"overall_score\": \"median_score\", \"project_name\": \"n_projects\"}\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "70178e81-0d11-4d19-9001-96e466d6dced", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ct_districtmedian_scoremin_scoremax_scoren_projects
0172.0072721
1261.5060632
2380.5054976
3470.5060976
4577.0058984
5672.0063773
6782.0079943
7873.0066855
8975.0067873
91072.5059862
101175.0055895
111272.5060974
\n", - "
" - ], - "text/plain": [ - " ct_district median_score min_score max_score n_projects\n", - "0 1 72.00 72 72 1\n", - "1 2 61.50 60 63 2\n", - "2 3 80.50 54 97 6\n", - "3 4 70.50 60 97 6\n", - "4 5 77.00 58 98 4\n", - "5 6 72.00 63 77 3\n", - "6 7 82.00 79 94 3\n", - "7 8 73.00 66 85 5\n", - "8 9 75.00 67 87 3\n", - "9 10 72.50 59 86 2\n", - "10 11 75.00 55 89 5\n", - "11 12 72.50 60 97 4" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agg1" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -1246,7 +396,7 @@ "* Some ideas:\n", " * Change the font\n", " * Turn off the index\n", - " * Use colors to indicate low-high values\n", + " * Use colors to code low-high values\n", " * Change the alignment of the values" ] }, @@ -1256,7 +406,9 @@ "id": "a0d8b97b-0f34-495a-8dfb-61642c44879a", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Practice here " + ] }, { "cell_type": "markdown", @@ -1266,7 +418,7 @@ "### Altair\n", "* While a table is great, sometimes a chart is a better way to display an insight.\n", "* Our preferred visualization library is `Altair`.\n", - " * Their docs page is [here](https://altair-viz.github.io/).\n", + " * Docs page is [here](https://altair-viz.github.io/).\n", "* The code to create a simple bar chart goes something like this. \n", " * `alt.Chart(source).mark_bar().encode(x='a',y='b')`\n", " * `source` is the dataframe you want to use for your chart.\n", @@ -1277,91 +429,14 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "fdcece32-d053-4b32-9e76-0f5ffed9ff52", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "alt.Chart(agg1).mark_bar().encode(x=\"ct_district\", y=\"n_projects\")" + "alt.Chart(agg1).mark_bar().encode(\n", + " x=\"ct_district\", y=\"n_projects\"\n", + ")" ] }, { @@ -1373,95 +448,17 @@ "* `altair` offers an endless ways to amp up the personality of your chart.\n", "* Additionally, the chart above without a title and legend is a data visualization \"taboo\" and the dull blue is uninspiring. \n", "\n", - "##### Add a title\n", - "* You can do so within `.Chart()`" + "#### Add a title\n", + "* You can do so within `.Chart()`\n", + "`alt.Chart(source, title=\"your_title_here\").mark_bar().encode(x='a',y='b')`" ] }, { "cell_type": "code", - "execution_count": 35, - "id": "88e1dff9-0188-49c9-b6cc-599610aca9a7", + "execution_count": null, + "id": "fc691315-e545-4120-971e-7a8f295a7c19", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "alt.Chart(agg1, title=\"your_title_here\").mark_bar().encode(\n", " x=\"ct_district\", y=\"n_projects\"\n", @@ -1479,180 +476,10 @@ }, { "cell_type": "code", - "execution_count": 36, - "id": "d43cae4f-1faf-48fb-8c21-559feb5243b1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "alt.Chart(agg1, title=\"your_title_here\").mark_circle().encode(\n", - " x=\"ct_district\", y=\"n_projects\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "9b94c3f2-af01-43d4-9f17-98cd863511a3", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "alt.Chart(agg1, title=\"your_title_here\").mark_line().encode(\n", " x=\"ct_district\", y=\"n_projects\"\n", @@ -1670,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "cb8df2a3-bf37-4fe4-833e-1259a6ad7f15", "metadata": {}, "outputs": [], @@ -1690,66 +517,13 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "aa21d088-3360-4d3e-811c-8cc5bdb2d3a8", "metadata": { "scrolled": true, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mType:\u001b[0m module\n", - "\u001b[0;31mString form:\u001b[0m \n", - "\u001b[0;31mFile:\u001b[0m /opt/conda/lib/python3.9/site-packages/calitp_data_analysis/calitp_color_palette.py\n", - "\u001b[0;31mSource:\u001b[0m \n", - "\u001b[0;31m# --------------------------------------------------------------#\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;31m# Cal-ITP style guide\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;31m# Google Drive > Cal-ITP Team > Project Resources >\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;31m# Branded Resources and External Comms Guidelines > Branded Resources > Style Guide\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;31m# --------------------------------------------------------------#\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0mCALITP_CATEGORY_BRIGHT_COLORS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#2EA8CE\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# darker blue\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#EB9F3C\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# orange\u001b[0m\u001b[0;34m\u001b[0m\n", - 
"\u001b[0;34m\u001b[0m \u001b[0;34m\"#F4D837\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# yellow\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#51BF9D\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# green\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#8CBCCB\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# lighter blue\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#9487C0\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# purple\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0mCALITP_CATEGORY_BOLD_COLORS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#136C97\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# darker blue\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#E16B26\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# orange\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#F6BF16\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# yellow\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#00896B\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# green\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#7790A3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# lighter blue\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#5B559C\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# purple\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0mCALITP_DIVERGING_COLORS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#E16B26\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - 
"\u001b[0;34m\u001b[0m \u001b[0;34m\"#EB9F3C\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# oranges\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#f6e7e1\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# linen\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#8CBCCB\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#2EA8CE\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#136C97\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# blues\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0mCALITP_SEQUENTIAL_COLORS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#B9D6DF\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# light blue (lightest)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#8CBCCB\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# lighter blue bright\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#2EA8CE\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# darker blue bright\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#136C97\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# darker blue bold\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"#0B405B\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# indigo dye (darkest)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "calitp_color_palette??" 
] @@ -1759,95 +533,16 @@ "id": "f3971ba8-1c8f-4003-8e34-c3fd31f3f585", "metadata": {}, "source": [ - "* Place your color palette in the `scale` argument `scale=alt.Scale(range=your_color_palette)`.\n", - "* If I'm using a palette from `calitp_color_palette`, I would write `scale=alt.Scale(range=calitp_color_palette.CALITP_DIVERGING_COLORS)`." + "* Place the column you want the colors to be based on in `color=alt.Color(column)`\n", + "* Place your color palette in the `scale` argument `scale=alt.Scale(range=your_color_palette)`." ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "id": "c629f242-9b1b-49d1-b4b0-1bb956782d69", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "alt.Chart(agg1, title=\"your_title_here\").mark_bar().encode(\n", " x=\"ct_district\",\n", @@ -1856,8 +551,8 @@ " \"n_projects\", # This is the column you want the color of your bar to be based on\n", " title=\"legend_title_here\", # This is the legend of your title\n", " scale=alt.Scale(\n", - " range=calitp_color_palette.CALITP_DIVERGING_COLORS # This is where you can customize the colors,\n", - " ), \n", + " range=calitp_color_palette.CALITP_DIVERGING_COLORS # This is where you can customize the colors,\n", + " ),\n", " ),\n", ")" ] @@ -1869,8 +564,7 @@ "source": [ "#### Adjusting the Axis\n", "* Sometimes, we want to adjust the axis to have a min and max value.\n", - "* You do so using the `scale=alt.Scale(domain=[min_value, max_value]))` argument behind the X and Y axis.\n", - "* `alt.X()` and `alt.Y` gives you many more customization options." + "* You do so using the `scale=alt.Scale(domain=[min_value, max_value]))` argument behind the X and Y axis." ] }, { @@ -1898,97 +592,18 @@ "source": [ "### Finishing Touches \n", "* `.properties(width=400, height=250)` adjusts the size of your chart. \n", - "* `tooltip=[columns you want]` gives you additional details on the columns you specify when you hover over each bar/circle/etc.\n", + "* `tooltip=[columns you want]` allows you to create a tooltip that pops up when you hover over each bar/circle/etc.\n", "* `.mark_bar(size=10)` adjusts the size of the bar/circle/etc." ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "id": "8b85dd29-88cb-4b4b-b3b7-20ee1851335e", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "alt.Chart(agg1, title=\"your_title_here\").mark_bar(size = 10).encode(\n", + "alt.Chart(agg1, title=\"your_title_here\").mark_bar(size=10).encode(\n", " x=alt.X(\"ct_district\", scale=alt.Scale(domain=[1, 12])),\n", " y=alt.Y(\"n_projects\", scale=alt.Scale(domain=[0, 10])),\n", " color=alt.Color(\n", @@ -1996,14 +611,16 @@ " title=\"legend_title_here\",\n", " scale=alt.Scale(range=calitp_color_palette.CALITP_DIVERGING_COLORS),\n", " ),\n", - " tooltip=[\"ct_district\", \"n_projects\"]\n", + " tooltip=[\"ct_district\", \"n_projects\"],\n", ").properties(width=400, height=250)" ] }, { "cell_type": "markdown", "id": "281e37d9-8ece-471d-abc2-38e4ad9f9e83", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### We have only visualized one column of data. \n", "* We have only visualized one column of data, but we have a couple of columns above. 
\n", @@ -2014,6 +631,14 @@ " * Altair's [gallery](https://altair-viz.github.io/gallery/index.html)\n", " * DDS's [portfolio](https://analysis.calitp.org/)\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffbfbafb-1055-448c-a889-18d1fe508cab", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/starter_kit/2024_basics_03.ipynb b/starter_kit/2024_basics_03.ipynb index 85796cd3d..58be931be 100644 --- a/starter_kit/2024_basics_03.ipynb +++ b/starter_kit/2024_basics_03.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "ba8a0d90-9d57-4d01-9eb4-0b255970995e", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "ddcdbbc1-2e1b-4797-bd34-07d9a1999cb6", "metadata": {}, "outputs": [], @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "7c52b09e-90b5-4a5d-8fda-ca19cb8fe3cd", "metadata": {}, "outputs": [], @@ -54,133 +54,19 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "e0222b8c-0996-47bb-8639-fc703cfbd249", "metadata": {}, "outputs": [], - "source": [ - "FILE = \"starter_kit_example_merge.parquet\"" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "36bbc1d2-4285-4399-a0fd-1e02c5e5d5a1", "metadata": {}, "outputs": [], - "source": [ - "df = pd.read_parquet(f\"{GCS_FILE_PATH}{FILE}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c97f0ec6-bea0-401a-bb27-f37984a762eb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ct_districtproject_namescope_of_workproject_costlead_agencyaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_score
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)28810235327661072
\n", - "
" - ], - "text/plain": [ - " ct_district project_name \\\n", - "0 1 Meadow Magic Multi-Use Path \n", - "\n", - " scope_of_work \\\n", - "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", - "\n", - " project_cost lead_agency \\\n", - "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", - "\n", - " accessibility_score dac_accessibility_score dac_traffic_impacts_score \\\n", - "0 2 8 8 \n", - "\n", - " freight_efficiency_score freight_sustainability_score mode_shift_score \\\n", - "0 10 2 3 \n", - "\n", - " lu_natural_resources_score safety_score vmt_score zev_score \\\n", - "0 5 3 2 7 \n", - "\n", - " public_engagement_score climate_resilience_score program_fit_score \\\n", - "0 6 6 10 \n", - "\n", - " overall_score \n", - "0 72 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head(1)" - ] + "source": [] }, { "cell_type": "markdown", @@ -188,8 +74,8 @@ "metadata": {}, "source": [ "## Categorizing\n", - "* There are 40+ projects. They all vary in themes, some are transit oriented while others are focused on Active Transportation (ATP).\n", - "* Categorizing data is an important part of data cleaning and analyzing so we can present the data on a more succinct, broader level. \n", + "* There are 40+ projects. They all vary in themes, some contain transit elements while others contain Active Transportation (ATP) components. Some contain both! \n", + "* Categorizing data is an important part of data cleaning and analyzing so we can present the data on a more succinct level. \n", "* Let's organize projects into three categories.\n", " * ATP\n", " * Transit\n", @@ -203,13 +89,13 @@ "source": [ "### Task 1: Strings\n", "* Below are some of the common keywords that fall into the categories detailed above. 
They are held in a `list`.\n", - "* Feel free to add other terms you think are relevant. \n", + "* Add other terms you think are relevant. \n", "* We are going to search the `Scope of Work` column for these keywords. " ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "6a6b817f-15e2-4d1c-aeae-5d7e9661a6f0", "metadata": {}, "outputs": [], @@ -228,29 +114,20 @@ "* Remember in Exercise 2 some of the project names didn't merge between the two dataframes?\n", "* In the real world, you won't have the bandwidth and time to replace each individual string value with a dictionary.\n", "* An easy way to clean most of the values up is by lowercasing, stripping the white spaces, and replacing characters.\n", - "* In our goal of categorizing values, we can search through it easier when we clean up the string values." + "* We can search through a string column more easily when we simplify the values." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "ea4a4df7-61ec-430b-a827-302704857318", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2254/3600759827.py:2: FutureWarning: The default value of regex will change from True to False in a future version. 
In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", - " df.scope_of_work.str.lower()\n" - ] - } - ], + "outputs": [], "source": [ "df.scope_of_work = (\n", - " df.scope_of_work.str.lower()\n", - " .str.strip()\n", - " .str.replace(\"-\", \" \")\n", + " df.scope_of_work.str.lower() # Lowers the strings\n", + " .str.strip() # Strips leading and trailing white spaces\n", + " .str.replace(\"-\", \" \") # Replaces hyphens with a space\n", " .str.replace(\"+\", \" \")\n", " .str.replace(\"_\", \" \")\n", ")" @@ -272,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "be843d6a-b751-4e9f-8820-b521089914d3", "metadata": {}, "outputs": [], @@ -286,36 +163,14 @@ "metadata": {}, "source": [ "* Let's see how many transit projects are in this dataset.\n", - "* Let's read through the Scope of Work to make sure it's what we expect.\n", "* Tip\n", - " * The data we work with tends to be pretty wide. Scrolling horizontally gets tiresome.\n", - " * Placing all the columns you want to temporarily work within a `list` like `preview_subset` below is a good idea. " + " * The data we typically work with tends to be wide (read about wide vs. long data [here](https://www.statology.org/long-vs-wide-data/)). Scrolling horizontally gets tiresome.\n", + " * Placing all the columns you want to temporarily work with in a `list` like `preview_subset` below is a good idea to temporarily narrow down your dataframe while working. 
" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "0d9a6259-8748-41fe-a549-01bdf0e9c273", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "7" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(transit_only_projects)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "315228d8-a72e-4f18-a0e7-2a254c87cc23", "metadata": {}, "outputs": [], @@ -325,100 +180,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "6789307c-5808-4501-a1a6-5a14a12b0219", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_namescope_of_work
11Greenway Gables Managed Lanesmanaged lanes prioritizing carpools, clean vehicles, and public transit, featuring real time traffic updates and incentives for sustainable transportation choices.
16Sparkle City Smart Streets Initiativean intelligent transportation system integrating traffic management, real time transit information, and smart parking solutions to enhance mobility and reduce congestion.
19Rolling Renaissance Rabbit Expressnew, eco friendly rolling stock for public transit, incorporating advanced propulsion systems, comfortable seating, and onboard amenities.
20Transit Treasure Transit Oasistransit supportive features, including shelters, wi fi, and real time information displays, prioritizing passenger convenience and accessibility.
25Trail of Treats and Transit Huba multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations.
27Park and Ride Petal Paradisean attractive park and ride facility with amenities like ev charging, wi fi, and convenient access to nearby transit options.
43Brookside Bus Blossom Laneprioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves.
\n", - "
" - ], - "text/plain": [ - " project_name \\\n", - "11 Greenway Gables Managed Lanes \n", - "16 Sparkle City Smart Streets Initiative \n", - "19 Rolling Renaissance Rabbit Express \n", - "20 Transit Treasure Transit Oasis \n", - "25 Trail of Treats and Transit Hub \n", - "27 Park and Ride Petal Paradise \n", - "43 Brookside Bus Blossom Lane \n", - "\n", - " scope_of_work \n", - "11 managed lanes prioritizing carpools, clean vehicles, and public transit, featuring real time traffic updates and incentives for sustainable transportation choices. \n", - "16 an intelligent transportation system integrating traffic management, real time transit information, and smart parking solutions to enhance mobility and reduce congestion. \n", - "19 new, eco friendly rolling stock for public transit, incorporating advanced propulsion systems, comfortable seating, and onboard amenities. \n", - "20 transit supportive features, including shelters, wi fi, and real time information displays, prioritizing passenger convenience and accessibility. \n", - "25 a multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations. \n", - "27 an attractive park and ride facility with amenities like ev charging, wi fi, and convenient access to nearby transit options. \n", - "43 prioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves. " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "transit_only_projects[preview_subset]" ] @@ -430,16 +195,16 @@ "source": [ "#### Step 2: Filtering\n", "* We've found all the projects that says \"transit\" somewhere in its description. \n", - "* Now there are just many more elements to go. 
We forgot about bikes, bus, rail, so on and so forth.\n", + "* Now there are just many more transit related elements to go. We forgot about bikes, bus, rail, so on and so forth.\n", "* The method above leaves us with multiple dataframes. We actually just want our one original dataframe tagged with categories. \n", "* A faster way: join all the keywords you want into one large string.\n", " * | designates \"or\".\n", - " * You can read `transit_keywords` as \"I want projects that contain the word transit or passenger rai or bus or ferry\"" + " * You can read `transit_keywords` as \"I want projects that contain the word transit or passenger rail or bus or ferry\"" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "c2575f75-44ac-46ba-a334-fdf984546cd3", "metadata": {}, "outputs": [], @@ -449,21 +214,10 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "f6a2a521-c0ae-4c2d-830d-4020a13855f2", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'(transit|passenger rail|bus|ferry)'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Print it out\n", "transit_keywords" @@ -479,122 +233,10 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "e5e23b6f-98b8-4219-bc52-d847ea39d121", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2254/1070197006.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", - " df.loc[df.scope_of_work.str.contains(transit_keywords)][preview_subset]\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_namescope_of_work
11Greenway Gables Managed Lanesmanaged lanes prioritizing carpools, clean vehicles, and public transit, featuring real time traffic updates and incentives for sustainable transportation choices.
16Sparkle City Smart Streets Initiativean intelligent transportation system integrating traffic management, real time transit information, and smart parking solutions to enhance mobility and reduce congestion.
18Coastal Commuter Carousela 30 mile passenger rail line connecting coastal towns, featuring modern train sets, enhanced station amenities, and scenic viewing cars.
19Rolling Renaissance Rabbit Expressnew, eco friendly rolling stock for public transit, incorporating advanced propulsion systems, comfortable seating, and onboard amenities.
20Transit Treasure Transit Oasistransit supportive features, including shelters, wi fi, and real time information displays, prioritizing passenger convenience and accessibility.
21Berry Best Bus Rapid Transitdedicated bus lanes with comfortable stops, featuring off board fare payment, priority traffic signals, and enhanced passenger amenities.
25Trail of Treats and Transit Huba multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations.
27Park and Ride Petal Paradisean attractive park and ride facility with amenities like ev charging, wi fi, and convenient access to nearby transit options.
43Brookside Bus Blossom Laneprioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves.
\n", - "
" - ], - "text/plain": [ - " project_name \\\n", - "11 Greenway Gables Managed Lanes \n", - "16 Sparkle City Smart Streets Initiative \n", - "18 Coastal Commuter Carousel \n", - "19 Rolling Renaissance Rabbit Express \n", - "20 Transit Treasure Transit Oasis \n", - "21 Berry Best Bus Rapid Transit \n", - "25 Trail of Treats and Transit Hub \n", - "27 Park and Ride Petal Paradise \n", - "43 Brookside Bus Blossom Lane \n", - "\n", - " scope_of_work \n", - "11 managed lanes prioritizing carpools, clean vehicles, and public transit, featuring real time traffic updates and incentives for sustainable transportation choices. \n", - "16 an intelligent transportation system integrating traffic management, real time transit information, and smart parking solutions to enhance mobility and reduce congestion. \n", - "18 a 30 mile passenger rail line connecting coastal towns, featuring modern train sets, enhanced station amenities, and scenic viewing cars. \n", - "19 new, eco friendly rolling stock for public transit, incorporating advanced propulsion systems, comfortable seating, and onboard amenities. \n", - "20 transit supportive features, including shelters, wi fi, and real time information displays, prioritizing passenger convenience and accessibility. \n", - "21 dedicated bus lanes with comfortable stops, featuring off board fare payment, priority traffic signals, and enhanced passenger amenities. \n", - "25 a multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations. \n", - "27 an attractive park and ride facility with amenities like ev charging, wi fi, and convenient access to nearby transit options. \n", - "43 prioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves. 
" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.loc[df.scope_of_work.str.contains(transit_keywords)][preview_subset]" ] @@ -604,36 +246,16 @@ "id": "c82ef0b7-d2c9-48d1-a53f-625fb083e196", "metadata": {}, "source": [ - "* Notice how many more projects appear when we filter for 3 additional transit related keywords, compared to only transit?" + "* Count how many more projects appear when we filter for 3 additional transit related keywords, compared to only transit below." ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "7b62f28d-7b28-4258-8efa-74d1f9a41d04", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7\n", - "9\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2254/2770509021.py:2: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", - " print(len(df.loc[df.scope_of_work.str.contains(transit_keywords)]))\n" - ] - } - ], - "source": [ - "print(len(transit_only_projects))\n", - "print(len(df.loc[df.scope_of_work.str.contains(transit_keywords)]))" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -648,19 +270,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "47afb269-672f-44c1-8ab5-d70921c6e703", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2254/653877654.py:2: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", - " (df.scope_of_work.str.contains(transit_keywords)),\n" - ] - } - ], + "outputs": [], "source": [ "df[\"Transit\"] = np.where(\n", " (df.scope_of_work.str.contains(transit_keywords)),\n", @@ -679,26 +292,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "c63f2ff8-3d2f-41c6-96d1-36d35159aef8", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "N 35\n", - "Y 9\n", - "Name: Transit, dtype: int64" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.Transit.value_counts()" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -708,9 +306,9 @@ }, "source": [ "### Task 2: Functions \n", - "* It looks only the 9 transit projects were categorized.\n", - "* We are missing the 2 categories: ATP and General Lane related projects.\n", - "* We could repeat the steps above or we can use a function.\n", + "* It looks like there are only 9 transit projects.\n", + "* We are missing the 2 other categories: ATP and General Lane related projects.\n", + "* We could repeat the steps above or we can use a **function.**\n", " * You can think of a function as a piece of code you write only once but reuse more than once.\n", " * In the long run, functions save you work and look neater when you present your work.\n", "* You may not have realized this but you've been using functions this whole time.\n", @@ -719,21 +317,10 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "8c62fef2-8215-4983-a4e6-c671177b822f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "44" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(df)" ] @@ -748,63 +335,30 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "0659a036-76ad-4251-80a1-323a0a04c912", "metadata": {}, - 
"outputs": [ - { - "data": { - "text/plain": [ - "pandas.core.frame.DataFrame" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "type(df)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "2985ec16-35e1-4eae-b2c5-facb354ce4e5", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "str" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "type(GCS_FILE_PATH)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "65c6b0c7-a314-434f-8304-10afd6c84514", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "list" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "type(transit)" ] @@ -824,7 +378,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "00ead246-8879-4075-a632-d0ded58df558", "metadata": {}, "outputs": [], @@ -840,13 +394,14 @@ }, "source": [ "#### Let's build a function together.\n", - "* This will be repetitive after the tutorials, but you will use functions all the time at DDS and this is a concept we would like to drive home.\n", - "* Start your function with `def():`` and the name you'd like." + "* This will be repetitive after the tutorials, but you will use functions all the time at DDS.\n", + "##### Step 1\n", + "* Start your function with `def` and the name you'd like. 
I'm calling it `categorize():`" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "97e597a2-8625-4f2b-8646-760c0c011208", "metadata": {}, "outputs": [], @@ -859,6 +414,7 @@ "id": "06ccd282-cf21-462b-8930-9a3148671ff1", "metadata": {}, "source": [ + "##### Step 2 \n", "* Now let's think of what are the two elements that we will repeat.\n", "* We merely want to substitute `transit_keywords` with ATP or General Lane related keywords.\n", "* Instead of the `df[\"Transit]\"`, we want to create two new columns called something like `df[\"ATP]\"` and `df[\"General_Lanes]\"` to hold our yes/no results.\n", @@ -869,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "61973dc6-d99b-48f0-842f-a3c8fe74f064", "metadata": {}, "outputs": [], @@ -882,13 +438,14 @@ "id": "ae178f6d-0f76-419c-aab2-9924ba294605", "metadata": {}, "source": [ - "* It's also a nice idea to document what your function will return.\n", + "##### Step 3\n", + "* It's also good to document what your function will return.\n", "* In our case, it's a Pandas dataframe. 
" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "a794693a-3bf2-48ba-b0a7-1ca3a41e03af", "metadata": {}, "outputs": [], @@ -901,6 +458,7 @@ "id": "be820c1a-a0d2-4b2f-bf01-70e753603291", "metadata": {}, "source": [ + "##### Step 4\n", "* Think about the steps we took to categorize transit only.\n", "* Add the sections of the code we will be reusing and sub in the original variables for the arguments.\n", " * First, we joined the keywords from a list into a big string.\n", @@ -910,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "4721b564-726a-4e05-9d27-8035609b5fcf", "metadata": {}, "outputs": [], @@ -935,110 +493,20 @@ "id": "81bbb109-beef-452c-b8d9-eb13e7b9ee03", "metadata": {}, "source": [ - "* Now let's use your function" + "#### Step 5 \n", + "* Now let's use your function: input the arguments in for each of the lists that hold the categorical keywords." ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "23e31c98-17b3-41e2-883a-14dae9d6da7e", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2254/2245515441.py:7: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", - " df[new_column] = np.where((df.scope_of_work.str.contains(joined_keywords)),\n" - ] - } - ], - "source": [ - "df = categorize(df, atp, \"ATP\")" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "d5ec64cf-432c-45e2-b14d-f4ea7ca3de2a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "N 30\n", - "Y 14\n", - "Name: ATP, dtype: int64" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.ATP.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "882a02a6-ce39-4da2-b2be-7e91322624e4", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2254/2245515441.py:7: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", - " df[new_column] = np.where((df.scope_of_work.str.contains(joined_keywords)),\n" - ] - } - ], - "source": [ - "df = categorize(df, transit, \"Transit\")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "ee56ee97-307c-44a4-a2d4-b02eff954f87", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2254/2245515441.py:7: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", - " df[new_column] = np.where((df.scope_of_work.str.contains(joined_keywords)),\n" - ] - } - ], - "source": [ - "df = categorize(df, general_lanes, \"General_Lanes\")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "96f2efba-4179-4a8c-b969-fd2990f8a129", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "N 35\n", - "Y 9\n", - "Name: General_Lanes, dtype: int64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df.General_Lanes.value_counts()" + "df = categorize(df = df, \n", + " keywords = atp, \n", + " new_column = \"ATP\")" ] }, { @@ -1054,111 +522,11 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "62115dcb-ea34-4bb1-9bd1-e678ec015b8c", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
General_LanesTransitATPproject_nameoverall_score
0NNN1573.00
1NNY1172.00
2NYN875.00
3NYY175.00
4YNN773.00
5YNY282.00
\n", - "
" - ], - "text/plain": [ - " General_Lanes Transit ATP project_name overall_score\n", - "0 N N N 15 73.00\n", - "1 N N Y 11 72.00\n", - "2 N Y N 8 75.00\n", - "3 N Y Y 1 75.00\n", - "4 Y N N 7 73.00\n", - "5 Y N Y 2 82.00" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby([\"General_Lanes\", \"Transit\", \"ATP\"]).aggregate(\n", - " {\"project_name\": \"nunique\", \"overall_score\": \"median\"}\n", - ").reset_index()" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -1177,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "6d824c18-4c2b-41c9-950b-866e567ab7f5", "metadata": {}, "outputs": [], @@ -1190,15 +558,13 @@ "id": "212570f5-e8ed-4151-be24-dd0994304334", "metadata": {}, "source": [ - "Goal: \n", + "### Practice #1: \n", "* We are going to write an If-Else function that categorizes projects by whether it scored low, medium, or high based on its `overall_score` and percentiles.\n", - "* For example, if a project scores below the 25% percentile, it is a \"low scoring project\". If a project scores above the 25% percentile but below the 75% percentile, it is a \"medium scoring project\". Anything above the 75% percentile is \"high scoring\".\n", - "* Use the values you find from .describe() as reference.\n", - "* You aren't limited to only the 25th, 50th, and 75th percentile. You can categorize low,medium, and high based on other percentile ranges. \n", - " * You can do so by specifying within `describe` like `.describe(percentiles=[0.05, 0.1, 0.9, 0.95])`.\n", + " * If a project scores below the 25% percentile, it is a \"low scoring project\". If a project scores above the 25% percentile but below the 75% percentile, it is a \"medium scoring project\". 
Anything above the 75% percentile is \"high scoring\".\n", "* In Data Science, we like to save our work into variables.\n", - " * If new projects are added, then what determines the different percentiles will likely switch.\n", - " * As such, you can save whatever percentile you like using `p75 = df.overall_score.quantile(0.75).astype(float)` which will change along with the dataset when you load in the new data." + " * If new projects are added, then the score at each percentile will likely change.\n", + " * As such, you can save whatever percentile you like using `p75 = df.overall_score.quantile(0.75).astype(float)`, which will update automatically when you load in the new data.\n", + "* Write an if-else and set the various percentiles using variables. " ] }, { @@ -1220,38 +586,19 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "d560dad0-de03-4469-99f8-5fadd9b198dc", "metadata": {}, "outputs": [], - "source": [ - "def categorize(row):\n", - "    if (row.General_Lanes == \"N\") & (row.Transit == \"N\") & (row.ATP == \"N\"):\n", - "        return \"Other\"\n", - "    elif (row.General_Lanes == \"N\") & (row.Transit == \"N\") & (row.ATP == \"Y\"):\n", - "        return \"ATP\"\n", - "    elif (row.General_Lanes == \"N\") & (row.Transit == \"Y\") & (row.ATP == \"N\"):\n", - "        return \"Transit\"\n", - "    elif (row.General_Lanes == \"N\") & (row.Transit == \"Y\") & (row.ATP == \"Y\"):\n", - "        return \"Transit and ATP\"\n", - "    elif (row.General_Lanes == \"Y\") & (row.Transit == \"N\") & (row.ATP == \"N\"):\n", - "        return \"General Lanes\"\n", - "    elif (row.General_Lanes == \"Y\") & (row.Transit == \"N\") & (row.ATP == \"Y\"):\n", - "        return \"General Lanes and ATP\"\n", - "    else:\n", - "        return \"Transit, General Lanes, and ATP\"" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "f8b7d946-c724-43cb-9a93-d1003f7f024f", "metadata": {}, "outputs": [], - "source": [ - "# Apply your function\n", - 
"df[\"category\"] = df.apply(categorize, axis=1)" - ] + "source": [] }, { "cell_type": "markdown", @@ -1263,132 +610,19 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "f18a7754-907c-46fa-ad77-4a09abb03206", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ct_districtproject_namescope_of_workproject_costlead_agencyaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_scoreTransitATPGeneral_Lanescategory
01Meadow Magic Multi-Use Patha 2 mile class i bike lane and multi use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)28810235327661072NYNATP
\n", - "
" - ], - "text/plain": [ - " ct_district project_name \\\n", - "0 1 Meadow Magic Multi-Use Path \n", - "\n", - " scope_of_work \\\n", - "0 a 2 mile class i bike lane and multi use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", - "\n", - " project_cost lead_agency \\\n", - "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", - "\n", - " accessibility_score dac_accessibility_score dac_traffic_impacts_score \\\n", - "0 2 8 8 \n", - "\n", - " freight_efficiency_score freight_sustainability_score mode_shift_score \\\n", - "0 10 2 3 \n", - "\n", - " lu_natural_resources_score safety_score vmt_score zev_score \\\n", - "0 5 3 2 7 \n", - "\n", - " public_engagement_score climate_resilience_score program_fit_score \\\n", - "0 6 6 10 \n", - "\n", - " overall_score Transit ATP General_Lanes category \n", - "0 72 N Y N ATP " - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head(1)" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "2245ce0c-97fb-4f08-9791-9fb6b28b49c7", "metadata": {}, "outputs": [], - "source": [ - "\n", - "df.to_parquet(f\"{GCS_FILE_PATH}starter_kit_example_categorized.parquet\")" - ] + "source": [] }, { "cell_type": "markdown", @@ -1402,27 +636,10 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "48495a9f-e29c-41eb-b3e7-de6371fbd182", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0\n", - "1\n", - "2\n", - "3\n", - "4\n", - "5\n", - "6\n", - "7\n", - "8\n", - "9\n" - ] - } - ], + "outputs": [], "source": [ "for i in range(10):\n", " print(i)" @@ -1440,89 +657,24 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "fca9e430-a906-4d0e-8046-36a0687b0636", "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "Statistics for zev_score\n" - ] - }, - { - "data": { - "text/plain": [ - "count 44.00\n", - "mean 6.00\n", - "std 2.96\n", - "min 1.00\n", - "25% 3.75\n", - "50% 6.50\n", - "75% 8.00\n", - "max 10.00\n", - "Name: zev_score, dtype: float64" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Statistics for vmt_score\n" - ] - }, - { - "data": { - "text/plain": [ - "count 44.00\n", - "mean 4.52\n", - "std 2.73\n", - "min 1.00\n", - "25% 2.00\n", - "50% 4.00\n", - "75% 6.00\n", - "max 10.00\n", - "Name: vmt_score, dtype: float64" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Statistics for accessibility_score\n" - ] - }, - { - "data": { - "text/plain": [ - "count 44.00\n", - "mean 5.14\n", - "std 2.66\n", - "min 1.00\n", - "25% 3.00\n", - "50% 5.00\n", - "75% 7.00\n", - "max 10.00\n", - "Name: accessibility_score, dtype: float64" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "for column in [\"zev_score\", \"vmt_score\", \"accessibility_score\"]:\n", " print(f\"Statistics for {column}\")\n", " display(df[column].describe())" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "51cd25c3-2234-4d89-a715-4e5365d7c99a", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "ded54884-4bad-46ae-a82f-2a67936c57dd", @@ -1534,7 +686,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "5b414d3f-71a4-4078-9d98-b9082114e2c5", "metadata": {}, "outputs": [], @@ -1557,99 +709,10 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "1698fe9c-6d1f-412b-a632-826aae1ffc65", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
categorymedian_scoremedian_project_costtotal_projects
0ATP72.004991255.0011
1General Lanes73.007487963.007
2General Lanes and ATP82.005672550.502
3Other73.003708858.0015
4Transit75.004399886.008
5Transit and ATP75.002069143.001
\n", - "
" - ], - "text/plain": [ - " category median_score median_project_cost total_projects\n", - "0 ATP 72.00 4991255.00 11\n", - "1 General Lanes 73.00 7487963.00 7\n", - "2 General Lanes and ATP 82.00 5672550.50 2\n", - "3 Other 73.00 3708858.00 15\n", - "4 Transit 75.00 4399886.00 8\n", - "5 Transit and ATP 75.00 2069143.00 1" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "agg1" ] @@ -1664,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "id": "320bd91e-b9ed-4423-80d4-c1a1aa5ba59f", "metadata": {}, "outputs": [], @@ -1700,92 +763,11 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "a6103703-8131-4ed8-9482-314c7895c279", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "create_chart(agg1, \"median_score\")" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -1793,262 +775,18 @@ "metadata": {}, "source": [ "* We have a couple of other columns left that still need to be visualized. \n", - "* This is the perfect case for using a for loop, since we all we want to do is replace the column above with the two remainig columns. \n", + "* This is the perfect case for using a for loop, since all we want to do is replace the column above with the two remaining columns. \n", "* Try this below! \n", " * Hint: you'll have to wrap the function with `display()` to get your results." ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "id": "ca8659f1-0842-4bb5-a544-9a2a5fb93c02", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "for column in [\"median_score\", \"median_project_cost\", \"total_projects\"]:\n", - "    display(create_chart(agg1, column))" - ] - }, - { - "cell_type": "markdown", - "id": "0f77dcf4-7b19-4e58-b20b-ae59721deb9c", - "metadata": {}, - "source": [ - "### Try it out yourself\n", - "* Think of some other use cases for a for loop and try them out here." - ] + "outputs": [], + "source": [] } ], "metadata": { diff --git a/starter_kit/2024_basics_04.ipynb b/starter_kit/2024_basics_04.ipynb index c36cc6a54..b63e78160 100644 --- a/starter_kit/2024_basics_04.ipynb +++ b/starter_kit/2024_basics_04.ipynb @@ -6,15 +6,15 @@ "metadata": {}, "source": [ "# Exercise 4: Python Scripts, Concept of Grains, Display, Markdown,\n", - "* Cleaning and analyzing data takes a lot of time, patience, and skill.\n", - "* However, presenting the data to stakeholders is also equaly important.\n", - "* At DDS, we often present our work in a Jupyter Notebook.\n", - "* This exercise will walk you through how we do so. " + "* After cleaning and analyzing data, it's time to present the data in a beautiful fashion.\n", + "* At DDS, we often present our work directly in a Jupyter Notebook, which has many benefits, such as:\n", + "  * We save the time it takes to copy and paste our graphs into a PowerPoint \n", + "  * We ensure the accuracy of the data since we aren't manually retyping the data. 
" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "1d4e2cdf-a5b9-4ebb-aa2f-c7abe897a683", "metadata": {}, "outputs": [], @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "a0403b50-d81c-4499-9b69-e164eb38f8cd", "metadata": {}, "outputs": [], @@ -47,15 +47,22 @@ "## Python Scripts\n", "* Up until now, we have been placing all of our code in the Jupyter Notebook.\n", "* While this is convenient, it's not the best practice. \n", - "* A notebook full of code isn't easy for viewers - it gets chaotic, quickly! \n", - "* Jupyter notebooks are also very difficult for Git to version control. \n", + "* A notebook full of code also isn't easy for viewers - it gets chaotic, quickly! \n", "* **The best solution is to move the bulk of your code when you have reached a stopping point to a Python Script.**\n", - " * Read all about the benefits of scripts [here in our DDS docs](https://docs.calitp.org/data-infra/analytics_tools/scripts.html).\n", - " * Summary points from the docs page above:\n", + "* Read all about the benefits of scripts [here in our DDS docs](https://docs.calitp.org/data-infra/analytics_tools/scripts.html). Summary points below: \n", + " * Summary points from the docs page above. What are Python scripts?\n", " * Python scripts (.py) are plain text files. Git tracks plain text changes easily.\n", " * Scripts are robust to scaling and reproducing work.\n", " * Break out scripts by concepts / stages\n", " * All functions used in scripts should have docstrings. 
Type hints are encouraged!\n", + " * Which components should a script contain?\n", + " * 1 script for importing external data and changing it from shapefile/geojson/csv to parquet/geoparquet\n", + " * If only using warehouse data or upstream warehouse data cached in GCS, can skip this first script\n", + " * At least 1 script for data processing to produce processed output for visualization\n", + " * Break out scripts by concepts / stages\n", + " * Include data catalog, README for the project\n", + " * All functions used in scripts should have docstrings. Type hints are encouraged!\n", + "### Sample Script \n", "* Making Python scripts is an art and not straight forward.\n", "* I have already populated a `.py` file called `_starterkit_utils` with some sample functions.\n", "* I imported my Python Script just like how I imported my other dependencies (Pandas, Altair, Numpy)." @@ -63,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "68d8980b-e857-491e-b03a-4648c5f4c5f3", "metadata": {}, "outputs": [], @@ -76,49 +83,19 @@ "id": "6f37fc46-a49e-45b4-92bf-d5b3910b2325", "metadata": {}, "source": [ - "### Breakdown of a Script.\n", + "### Breakdown of the Sample Script\n", "#### Function 1\n", "* You can also preview what a function does by writing `script_name.function_name??`\n", - "\n", "* Following what the DDS docs says, I am creating a new function every time I am processing the data in another stage.\n", "* I have one function that loads in my dataset." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "3454fecc-0b6b-4f1f-b74d-17792165f990", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m \u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSource:\u001b[0m \n", - "\u001b[0;32mdef\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m->\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", - "\u001b[0;34m Load the final dataframe.\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mGCS_FILE_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"gs://calitp-analytics-data/data-analyses/starter_kit/\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mFILE\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"starter_kit_example_categorized.parquet\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Read dataframe in\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{GCS_FILE_PATH}{FILE}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Capitalize the Scope of Work column again since it is all lowercase\u001b[0m\u001b[0;34m\u001b[0m\n", - 
"\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscope_of_work\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscope_of_work\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapitalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Clean up the column names\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreverse_snakecase\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", - "\u001b[0;31mType:\u001b[0m function" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_starterkit_utils.load_dataset??" 
] @@ -135,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "44467ccf-599f-4662-8164-8a58fac85711", "metadata": {}, "outputs": [], @@ -155,58 +132,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "0d82c825-d789-469e-8ae2-c69a94984511", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m \u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maggregate_by_category\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSource:\u001b[0m \n", - "\u001b[0;32mdef\u001b[0m \u001b[0maggregate_by_category\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", - "\u001b[0;34m Find the median overall score and project cost \u001b[0m\n", - "\u001b[0;34m and total unique projects by category.\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0magg1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m 
\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Category\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0maggregate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Overall Score\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"median\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Project Cost\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"median\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"nunique\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Overall Score\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Median Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Project Cost\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Median Project Cost\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Total Projects\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m 
\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Format the Cost column properly\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0magg1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Median Project Cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0magg1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Median Project Cost'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'${:,.0f}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0magg1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", - "\u001b[0;31mType:\u001b[0m function" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_starterkit_utils.aggregate_by_category??" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "f9635fe8-a6c7-4813-9f25-7ba555ce9726", "metadata": {}, "outputs": [], @@ -216,99 +152,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "2bdcb3e6-2add-4af6-a20a-9072b7ba075c", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMedian ScoreMedian Project CostTotal Projects
0ATP72.00$4,991,25511
1General Lanes73.00$7,487,9637
2General Lanes and ATP82.00$5,672,5502
3Other73.00$3,708,85815
4Transit75.00$4,399,8868
5Transit and ATP75.00$2,069,1431
\n", - "
" - ], - "text/plain": [ - " Category Median Score Median Project Cost Total Projects\n", - "0 ATP 72.00 $4,991,255 11\n", - "1 General Lanes 73.00 $7,487,963 7\n", - "2 General Lanes and ATP 82.00 $5,672,550 2\n", - "3 Other 73.00 $3,708,858 15\n", - "4 Transit 75.00 $4,399,886 8\n", - "5 Transit and ATP 75.00 $2,069,143 1" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "aggregated_df" ] @@ -319,64 +166,24 @@ "metadata": {}, "source": [ "#### Function 3\n", - "* I want to swap my dataframe from wide to long. \n", + "* I want to process my data a second way by changing it from wide to long. \n", "* [Read about wide to long.](https://www.statology.org/long-vs-wide-data/)\n", "* [Pandas doc on melt](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "2b0231f0-eb97-46d4-9541-aee43b138755", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m \u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwide_to_long\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSource:\u001b[0m \n", - "\u001b[0;32mdef\u001b[0m 
\u001b[0mwide_to_long\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m->\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", - "\u001b[0;34m Change the dataframe from wide to long based on the project name and\u001b[0m\n", - "\u001b[0;34m Caltrans District.\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdf2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmelt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mid_vars\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"CalTrans District\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mvalue_vars\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Accessibility Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"DAC Accessibility Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"DAC Traffic Impacts Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Freight Efficiency Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Freight Sustainability Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Mode Shift Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m 
\u001b[0;34m\"Landuse Natural Resources Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Safety Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"VMT Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"ZEV Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Public Engagement Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Climate Resilience Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Program Fit Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdf2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'variable'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m'Metric'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m'value'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m'Score'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", - "\u001b[0;31mType:\u001b[0m function" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_starterkit_utils.wide_to_long??" 
] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "82172952-3d59-436e-b08c-7096454b6e04", "metadata": {}, "outputs": [], @@ -386,67 +193,10 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "1bcac91b-b0a1-4efd-8a73-f019c376d030", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CalTrans DistrictProject NameMetricScore
01Meadow Magic Multi-Use PathAccessibility Score2
14Bunny Hop Bike BoulevardAccessibility Score3
\n", - "
" - ], - "text/plain": [ - " CalTrans District Project Name Metric Score\n", - "0 1 Meadow Magic Multi-Use Path Accessibility Score 2\n", - "1 4 Bunny Hop Bike Boulevard Accessibility Score 3" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df2.head(2)" ] @@ -463,76 +213,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "d69a4f91-4e37-4207-93e0-2eaa18f998ff", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMedian ScoreMedian Project CostTotal Projects
ATP72$4,991,25511
General Lanes73$7,487,9637
General Lanes and ATP82$5,672,5502
Other73$3,708,85815
Transit75$4,399,8868
Transit and ATP75$2,069,1431
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_starterkit_utils.style_df(aggregated_df)" ] @@ -543,154 +227,26 @@ "metadata": {}, "source": [ "#### Function 5 \n", + "* After aggregating and reshaping the data, the next function presents the data.\n", "* This is function that creates a chart that shows the scores by metric for each project." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "1e2ec6b7-b494-4db5-a863-91882c77a7a8", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m \u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_metric_chart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0maltair\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvegalite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mv5\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChart\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSource:\u001b[0m \n", - "\u001b[0;32mdef\u001b[0m \u001b[0mcreate_metric_chart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChart\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", - "\u001b[0;34m Create a chart that displays metric scores\u001b[0m\n", - "\u001b[0;34m for each project.\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Create dropdown\u001b[0m\u001b[0;34m\u001b[0m\n", 
- "\u001b[0;34m\u001b[0m \u001b[0mmetrics_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Metric\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mmetrics_dropdown\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbinding_select\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0moptions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Metrics: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Column that controls the bar charts\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mxcol_param\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselection_point\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Metric\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics_dropdown\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mchart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", 
- "\u001b[0;34m\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Metric by Categories\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mmark_circle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m200\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Score\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscale\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mScale\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdomain\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mY\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mColor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mscale\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mScale\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m 
\u001b[0mrange\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcalitp_color_palette\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCALITP_CATEGORY_BRIGHT_COLORS\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mtooltip\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mproperties\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwidth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m400\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m250\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mchart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchart\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxcol_param\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform_filter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxcol_param\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mchart\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", - "\u001b[0;31mType:\u001b[0m function" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_starterkit_utils.create_metric_chart??" 
] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "7f39ca4e-9fb9-497d-bee6-22be385a9d34", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "_starterkit_utils.create_metric_chart(df2)" ] @@ -703,85 +259,16 @@ "## Grains\n", "* This is a light introduction to the concept of grains.\n", "* Grain means the level your dataset is presented at.\n", - "* You can think of it as: what does each row represent?\n", - "* The original dataset is presented on the project-level grain because each row represents a unique project. \n" + "* Basically, what does each row represent?\n", + "* The original dataset is presented on the project-level grain because each row represents a unique project. " ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "bb228391-f907-4d76-a2b5-45b7fd188d21", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Project NameOverall Score
0Meadow Magic Multi-Use Path72
1Bunny Hop Bike Boulevard68
2Strawberry Shortcake Sidewalks87
3River Ramble Rabbit Trail75
4Lilac Lane Dream Complete Street72
\n", - "
" - ], - "text/plain": [ - " Project Name Overall Score\n", - "0 Meadow Magic Multi-Use Path 72\n", - "1 Bunny Hop Bike Boulevard 68\n", - "2 Strawberry Shortcake Sidewalks 87\n", - "3 River Ramble Rabbit Trail 75\n", - "4 Lilac Lane Dream Complete Street 72" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df[[\"Project Name\", \"Overall Score\"]].head()" ] @@ -796,121 +283,10 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "9a332009-4ac5-4eca-9632-6d45c03765a4", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CalTrans DistrictTotal Projects
011
122
236
346
454
563
673
785
893
9102
10115
11124
\n", - "
" - ], - "text/plain": [ - " CalTrans District Total Projects\n", - "0 1 1\n", - "1 2 2\n", - "2 3 6\n", - "3 4 6\n", - "4 5 4\n", - "5 6 3\n", - "6 7 3\n", - "7 8 5\n", - "8 9 3\n", - "9 10 2\n", - "10 11 5\n", - "11 12 4" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.groupby([\"CalTrans District\"]).agg({\"Project Name\": \"nunique\"}).reset_index().rename(\n", " columns={\"Project Name\": \"Total Projects\"}\n", @@ -927,127 +303,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "c0f00432-60ca-4e8a-9c2a-45bba234dbd7", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Lead AgencyTotal Projects
0Bunny's Meadow Hop Transportation (BMHT)3
1Cherry Metro Services (CMS)1
2Dewdrop Ride Transit2
3Elf's Efficient Transportation (EET)3
4Fairy Creek Public Transit (FCPT)5
5Gnome Valley Rail Link (GVRL)3
6Meadow Bunny Public Transportation (MBPT)4
7Morning Dewdrop Transit (MDT)4
8Mushroom Metro Transit Agency (MMTA)5
9Rainbow Mushroom Transportation Corporation (RMTC)5
10Shining Sparkle Transit Systems (SSTS)4
11Strawberry Rainbow Transit Systems (SRTS)4
12Unicorn Fairy Express Bus (UFX)1
\n", - "
" - ], - "text/plain": [ - " Lead Agency Total Projects\n", - "0 Bunny's Meadow Hop Transportation (BMHT) 3\n", - "1 Cherry Metro Services (CMS) 1\n", - "2 Dewdrop Ride Transit 2\n", - "3 Elf's Efficient Transportation (EET) 3\n", - "4 Fairy Creek Public Transit (FCPT) 5\n", - "5 Gnome Valley Rail Link (GVRL) 3\n", - "6 Meadow Bunny Public Transportation (MBPT) 4\n", - "7 Morning Dewdrop Transit (MDT) 4\n", - "8 Mushroom Metro Transit Agency (MMTA) 5\n", - "9 Rainbow Mushroom Transportation Corporation (RMTC) 5\n", - "10 Shining Sparkle Transit Systems (SSTS) 4\n", - "11 Strawberry Rainbow Transit Systems (SRTS) 4\n", - "12 Unicorn Fairy Express Bus (UFX) 1" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.groupby([\"Lead Agency\"]).agg({\"Project Name\": \"nunique\"}).reset_index().rename(\n", " columns={\"Project Name\": \"Total Projects\"}\n", @@ -1059,349 +318,19 @@ "id": "55137a34-1624-4d8a-8ee8-33c773868cde", "metadata": {}, "source": [ - "* Grains can get very complicated. The one below is Lead Agency and Category Grain. " + "* Grains can get very fine-grained. The one below is Lead Agency and Category Grain. " ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "7892454f-5f70-4237-9f04-560405cf1775", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Lead AgencyCategoryTotal Projects
0Bunny's Meadow Hop Transportation (BMHT)Other2
1Bunny's Meadow Hop Transportation (BMHT)Transit1
2Cherry Metro Services (CMS)Other1
3Dewdrop Ride TransitATP1
4Dewdrop Ride TransitOther1
5Elf's Efficient Transportation (EET)ATP1
6Elf's Efficient Transportation (EET)General Lanes1
7Elf's Efficient Transportation (EET)Transit1
8Fairy Creek Public Transit (FCPT)ATP1
9Fairy Creek Public Transit (FCPT)Other2
10Fairy Creek Public Transit (FCPT)Transit2
11Gnome Valley Rail Link (GVRL)ATP1
12Gnome Valley Rail Link (GVRL)Other1
13Gnome Valley Rail Link (GVRL)Transit and ATP1
14Meadow Bunny Public Transportation (MBPT)ATP1
15Meadow Bunny Public Transportation (MBPT)General Lanes and ATP1
16Meadow Bunny Public Transportation (MBPT)Other1
17Meadow Bunny Public Transportation (MBPT)Transit1
18Morning Dewdrop Transit (MDT)General Lanes2
19Morning Dewdrop Transit (MDT)Other1
20Morning Dewdrop Transit (MDT)Transit1
21Mushroom Metro Transit Agency (MMTA)General Lanes1
22Mushroom Metro Transit Agency (MMTA)Other3
23Mushroom Metro Transit Agency (MMTA)Transit1
24Rainbow Mushroom Transportation Corporation (RMTC)ATP2
25Rainbow Mushroom Transportation Corporation (RMTC)General Lanes1
26Rainbow Mushroom Transportation Corporation (RMTC)Other1
27Rainbow Mushroom Transportation Corporation (RMTC)Transit1
28Shining Sparkle Transit Systems (SSTS)ATP1
29Shining Sparkle Transit Systems (SSTS)General Lanes1
30Shining Sparkle Transit Systems (SSTS)General Lanes and ATP1
31Shining Sparkle Transit Systems (SSTS)Other1
32Strawberry Rainbow Transit Systems (SRTS)ATP2
33Strawberry Rainbow Transit Systems (SRTS)General Lanes1
34Strawberry Rainbow Transit Systems (SRTS)Other1
35Unicorn Fairy Express Bus (UFX)ATP1
\n", - "
" - ], - "text/plain": [ - " Lead Agency Category \\\n", - "0 Bunny's Meadow Hop Transportation (BMHT) Other \n", - "1 Bunny's Meadow Hop Transportation (BMHT) Transit \n", - "2 Cherry Metro Services (CMS) Other \n", - "3 Dewdrop Ride Transit ATP \n", - "4 Dewdrop Ride Transit Other \n", - "5 Elf's Efficient Transportation (EET) ATP \n", - "6 Elf's Efficient Transportation (EET) General Lanes \n", - "7 Elf's Efficient Transportation (EET) Transit \n", - "8 Fairy Creek Public Transit (FCPT) ATP \n", - "9 Fairy Creek Public Transit (FCPT) Other \n", - "10 Fairy Creek Public Transit (FCPT) Transit \n", - "11 Gnome Valley Rail Link (GVRL) ATP \n", - "12 Gnome Valley Rail Link (GVRL) Other \n", - "13 Gnome Valley Rail Link (GVRL) Transit and ATP \n", - "14 Meadow Bunny Public Transportation (MBPT) ATP \n", - "15 Meadow Bunny Public Transportation (MBPT) General Lanes and ATP \n", - "16 Meadow Bunny Public Transportation (MBPT) Other \n", - "17 Meadow Bunny Public Transportation (MBPT) Transit \n", - "18 Morning Dewdrop Transit (MDT) General Lanes \n", - "19 Morning Dewdrop Transit (MDT) Other \n", - "20 Morning Dewdrop Transit (MDT) Transit \n", - "21 Mushroom Metro Transit Agency (MMTA) General Lanes \n", - "22 Mushroom Metro Transit Agency (MMTA) Other \n", - "23 Mushroom Metro Transit Agency (MMTA) Transit \n", - "24 Rainbow Mushroom Transportation Corporation (RMTC) ATP \n", - "25 Rainbow Mushroom Transportation Corporation (RMTC) General Lanes \n", - "26 Rainbow Mushroom Transportation Corporation (RMTC) Other \n", - "27 Rainbow Mushroom Transportation Corporation (RMTC) Transit \n", - "28 Shining Sparkle Transit Systems (SSTS) ATP \n", - "29 Shining Sparkle Transit Systems (SSTS) General Lanes \n", - "30 Shining Sparkle Transit Systems (SSTS) General Lanes and ATP \n", - "31 Shining Sparkle Transit Systems (SSTS) Other \n", - "32 Strawberry Rainbow Transit Systems (SRTS) ATP \n", - "33 Strawberry Rainbow Transit Systems (SRTS) General Lanes \n", - "34 Strawberry 
Rainbow Transit Systems (SRTS) Other \n", - "35 Unicorn Fairy Express Bus (UFX) ATP \n", - "\n", - " Total Projects \n", - "0 2 \n", - "1 1 \n", - "2 1 \n", - "3 1 \n", - "4 1 \n", - "5 1 \n", - "6 1 \n", - "7 1 \n", - "8 1 \n", - "9 2 \n", - "10 2 \n", - "11 1 \n", - "12 1 \n", - "13 1 \n", - "14 1 \n", - "15 1 \n", - "16 1 \n", - "17 1 \n", - "18 2 \n", - "19 1 \n", - "20 1 \n", - "21 1 \n", - "22 3 \n", - "23 1 \n", - "24 2 \n", - "25 1 \n", - "26 1 \n", - "27 1 \n", - "28 1 \n", - "29 1 \n", - "30 1 \n", - "31 1 \n", - "32 2 \n", - "33 1 \n", - "34 1 \n", - "35 1 " - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df.groupby([\"Lead Agency\", \"Category\"]).agg({\"Project Name\": \"nunique\"}).reset_index().rename(\n", - " columns={\"Project Name\": \"Total Projects\"}\n", - ")" + "df.groupby([\"Lead Agency\", \"Category\"]).agg(\n", + " {\"Project Name\": \"nunique\"}\n", + ").reset_index().rename(columns={\"Project Name\": \"Total Projects\"})" ] }, { @@ -1410,9 +339,8 @@ "metadata": {}, "source": [ "## Create your own Script\n", - "* **Make sure your functions make sense for the district grain.**\n", - "* You will be using these functions for Exercise 5. \n", - "* Make sure to separate out functions by theme. \n", + "* **Make sure your functions make sense for the district grain. You will be using these functions for Exercise 5.**\n", + "* In your script, separate out functions by step like above. \n", " * One function that loads the dataset and does some light cleaning.\n", " * One (or more) functions that transform your dataframe.\n", " * `melt()`, `.T`, `.groupby()` are just some of the many options available through `pandas`. \n", @@ -1421,10 +349,10 @@ "* Other things to consider\n", " * Our [DDS Docs](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html#narrative) has a great guide on what \"checkboxes\" need to be \"checked\" when presenting data. 
The first 3 sections are the most relevant.\n", " * To summarize the docs, double check:\n", - " * Are the currency columns formatted with $ and commas?\n", + " * Are currency columns formatted with $ and commas?\n", " * Are all the scores formatted with the same number of decimals?\n", " * Are the string columns formatted with the right punctuation and capitalization?\n", - " * Are the column names formatted properly? While `snake_case` is very handy when we are analyzing the dataframe, it is not slightly when presenting the data. We typically reverse the `snake_case` back to something like `Project Name`.\n", + " * Are the column names formatted properly? While `snake_case` is very handy when we are analyzing the dataframe, it is not very nice when presenting the data. We typically reverse the `snake_case` back to something like `Project Name`.\n", " * [CalTrans Districts are currently integers, but they have actual names that can be mapped.](https://cwwp2.dot.ca.gov/documentation/district-map-county-chart.htm) \n", " " ] @@ -1437,24 +365,19 @@ "## Markdown/Display\n", "* Although our code is now neatly stored in a Python script, a Jupyter Notebook on its own is a bit plain, even when we have beautiful charts. \n", "* There are many ways to jazz it up.\n", - "* **Resource**: [Data Camp](https://www.datacamp.com/tutorial/markdown-in-jupyter-notebook)" - ] - }, - { - "cell_type": "markdown", - "id": "ed396a2f-c3f1-40be-aad2-64835be8431b", - "metadata": {}, - "source": [ - "#### Images\n", + "* **Resource**: [Data Camp's Markdown Tutorial](https://www.datacamp.com/tutorial/markdown-in-jupyter-notebook)\n", + "### Images\n", + "#### In a Markdown Cell\n", "* You can add an image in a markdown cell\n", "``

\n", "\n", + "#### In a Code Cell\n", "* You can add an image in a code cell if you import the packages below." ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "4ec41786-491f-46ad-963e-f380d8095ade", "metadata": {}, "outputs": [], @@ -1464,26 +387,10 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "8d0a6849-f178-46fc-919a-f45b5436c423", "metadata": {}, - "outputs": [ - { - "data": { - "image/jpeg": "", - "text/plain": [ - "" - ] - }, - "metadata": { - "image/jpeg": { - "height": 600, - "width": 960 - } - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "display(Image(filename=\"./19319_en_1.jpg\", retina=True))" ] @@ -1496,7 +403,7 @@ "### Display\n", "* Of course, you can write your narratives in a Markdown cell like what I'm doing right now.\n", "* However, what if you want to incorporate values from your dataframe into the narrative?\n", - "* Writing out the values manually in markdown locks you in. If the values change, you'll have to rewrite your narrative.\n", + "* Writing out the values manually in markdown locks you in. If the values change, you'll have to rewrite your narrative, which is time-consuming and error-prone.\n", "* The best way is to use `display` and `markdown` from `from IPython.display`\n", "* We are using District 3 as an example" ] @@ -1507,12 +414,12 @@ "metadata": {}, "source": [ "#### No hard coding\n", - "* Save out your desired value into a new variable if you are manipulating it." + "* Save out your desired value into a new variable whenever you want to reference it in a narrative." 
] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "caecab58-2d26-4604-a3f1-ab4a11400038", "metadata": {}, "outputs": [], @@ -1523,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "995eb899-0397-4f60-b587-18fcf8a4cb0e", "metadata": {}, "outputs": [], @@ -1534,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "bc73cc66-911a-44bf-9710-920328b40609", "metadata": {}, "outputs": [], @@ -1545,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "8a5dafba-c901-412c-bf53-e418dc558787", "metadata": {}, "outputs": [], @@ -1556,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "e183e629-7f79-45e3-810b-294851ca9abf", "metadata": {}, "outputs": [], @@ -1571,43 +478,27 @@ "metadata": {}, "source": [ "#### Long F-String + Headers\n", - "* The f-string has multiple quotation marks. This allows you to write a f-string that goes over multiple lines.\n", - "*

and

displays District 3 in a header. Headers vary in size, 1 being the largest. \n", + "* F-strings can have multiple quotation marks. This allows you to write an f-string that goes over multiple lines.\n", + "* `

` and `

` displays District 3 in a header. \n", + " * Headers vary in size, 1 being the largest. \n", "* `` bolds the text. \n", - " * ` italicizes the text.\n", - "* Notice that you always have to **close** your HTML with `` strikes the text.\n", + "* Notice that you always have to **close** your HTML with ``" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "8d7e68ad-79dd-48b6-8e17-90496d470b69", "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "

District 3

\n", - " The median score for projects in District 3 is 80.5
\n", - " The total number of projects is 6
\n", - " The most expensive project costs $9,448,022.00\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "display(\n", " Markdown(\n", " f\"\"\"

District 3

\n", " The median score for projects in District 3 is {d3_median_score}
\n", " The total number of projects is {d3_total_projects}
\n", - " The most expensive project costs {d3_max_project}\n", + " The most expensive project costs {d3_max_project}\n", " \"\"\"\n", " )\n", ")" @@ -1618,111 +509,20 @@ "id": "4aa41900-7dbf-4c61-b927-4f1e42f6b8da", "metadata": {}, "source": [ - "* You can code in this cell. I'm filtering out for district 3 values.\n", + "#### You can code in this cell. I'm filtering out for district 3 values.\n", "* Notice the header went from `

` to `

`. " ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "38d84ed3-9626-4f91-9aea-e2449aef4cf8", "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "

Metric Scores

\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "display(\n", " Markdown(\n", - " f\"\"\"

Metric Scores

\n", + " f\"\"\"
Metric Scores
\n", " \"\"\"\n", " )\n", ")\n", @@ -1734,549 +534,27 @@ "id": "fe45d252-1d46-4d34-98f4-7118afd96406", "metadata": {}, "source": [ - "### This can be a function too\n", - "* What if I wanted to generate these narratives for every district?\n", - "* I can simply turn this into a function.\n", - "* I only want to print out a couple of districts or else this notebook will become too large" + "### `Markdown` and `Display` can be worked into functions \n", + "* What if I wanted to generate these reports for every district?\n", + "* I can simply turn this into a function." ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "8875a82f-2df3-4777-a115-87ba84ea96a3", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m\n", - "\u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_district_summary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcaltrans_district\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSource:\u001b[0m \n", - "\u001b[0;32mdef\u001b[0m \u001b[0mcreate_district_summary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcaltrans_district\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", - "\u001b[0;34m Create a summary of CSIS 
metrics for one Caltrans District.\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mfiltered_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"CalTrans District\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mcaltrans_district\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Finding the values referenced in the narrative\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mmedian_score\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiltered_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Overall Score\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmedian\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mtotal_projects\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiltered_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnunique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mmax_project\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiltered_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Project Cost\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mmax_project\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"${max_project:,.2f}\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - 
"\u001b[0;34m\u001b[0m \u001b[0;31m# Aggregate the dataframe\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0maggregated_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maggregate_by_category\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiltered_df\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Change the dataframe from wide to long\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdf2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwide_to_long\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiltered_df\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;31m# Create narrative\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mMarkdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34mf\"\"\"The median score for projects in District {caltrans_district} is {median_score}
\u001b[0m\n", - "\u001b[0;34m The total number of projects is {total_projects}
\u001b[0m\n", - "\u001b[0;34m The most expensive project costs {max_project}\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mMarkdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34mf\"\"\"

Metrics aggregated by Categories

\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mstyle_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maggregated_df\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mMarkdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34mf\"\"\"

Overview of Projects

\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mstyle_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiltered_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Overall Score\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Scope Of Work\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mMarkdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34mf\"\"\"

Metric Scores by Project

\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcreate_metric_chart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", - "\u001b[0;31mType:\u001b[0m function" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_starterkit_utils.create_district_summary??" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "6d6f524a-d49b-4729-801f-ccc4bd800149", "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The median score for projects in District 10 is 72.5
\n", - " The total number of projects is 2
\n", - " The most expensive project costs $7,160,933.00\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "

Metrics aggregated by Categories

\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMedian ScoreMedian Project CostTotal Projects
Other59$816,5691
Transit86$7,160,9331
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "

Overview of Projects

\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Project NameOverall ScoreScope Of Work
Countryside Clover Rail Connector59A 20 mile rail improvement project for freight transportation, upgrading track infrastructure, and implementing advanced safety features to reduce derailment risk.
Brookside Bus Blossom Lane86Prioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves.
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "

Metric Scores by Project

\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "The median score for projects in District 11 is 75.0
\n", - " The total number of projects is 5
\n", - " The most expensive project costs $8,956,026.00\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "

Metrics aggregated by Categories

\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMedian ScoreMedian Project CostTotal Projects
ATP79$8,956,0261
General Lanes89$1,557,7511
Other75$5,796,4771
Transit55$5,425,7841
Transit and ATP75$2,069,1431
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "

Overview of Projects

\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Project NameOverall ScoreScope Of Work
Berry Best Bus Rapid Transit55Dedicated bus lanes with comfortable stops, featuring off board fare payment, priority traffic signals, and enhanced passenger amenities.
Trail of Treats and Transit Hub75A multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations.
Fairy Glen Boulevard79Welcome travelers to our enchanted town with a refreshed fairy glen boulevard, featuring sparkling streetlights, lush wildflower medians, and meandering pedestrian paths
Parkside Pixie Carpool Lane75Encourage sustainable transportation and reduce traffic congestion by constructing high occupancy vehicle (hov) lanes along parkside drive, adorned with fairy inspired artwork.
Ridgewood Ride-Share Rainbow Lane89Support environmentally friendly commuting options by building hov lanes on ridgewood highway, featuring designated ride share pickup and drop off zones, and a touch of magic from the meadow.
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "

Metric Scores by Project

\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "for district in range(10, 12):\n", " _starterkit_utils.create_district_summary(df, district)" @@ -2289,7 +567,7 @@ "source": [ "## Your turn to combine all your functions into one function\n", "* Take some inspiration from ` _starterkit_utils.create_district_summary(df, district).`\n", - "* Incorporate concepts from `markdown` and `display`. " + "* Incorporate concepts from `markdown` and `display` to create a polished report. " ] } ], diff --git a/starter_kit/2024_basics_05.ipynb b/starter_kit/2024_basics_05.ipynb index 40e31e42d..f53ee454f 100644 --- a/starter_kit/2024_basics_05.ipynb +++ b/starter_kit/2024_basics_05.ipynb @@ -14,8 +14,8 @@ "* Spend some time exploring our portfolio above. \n", "\n", "**How does the portfolio work?**\n", - "* For the majority of the sites on the portfolio are using a single notebook essentially as a template that is looped one or more variables. \n", - " * This [National Transit Dataset Monthly Ridership by Regional Transit Planning Authority (RTPA) portfolio](https://ntd-monthly-ridership--cal-itp-data-analyses.netlify.app/readme) takes [this notebook](https://github.com/cal-itp/data-analyses/blob/main/ntd/monthly_ridership_report.ipynb) and reruns it for every \n", + "* The majority of the sites on the portfolio use a single notebook essentially as a template that is looped over one or more variables. \n", + " * This [National Transit Dataset Monthly Ridership by Regional Transit Planning Authority (RTPA) portfolio](https://ntd-monthly-ridership--cal-itp-data-analyses.netlify.app/readme) is generated from [this notebook](https://github.com/cal-itp/data-analyses/blob/main/ntd/monthly_ridership_report.ipynb), which is rerun for every \n", "RTPA in this [yml file](https://github.com/cal-itp/data-analyses/blob/main/portfolio/sites/ntd_monthly_ridership.yml). 
\n", " * This process of looping over variables to generate new notebooks is called parameterizing a notebook.\n", " \n", @@ -24,7 +24,7 @@ " * [Publishing to the portfolio](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html)\n", "\n", "**Let's make a portfolio**\n", - "* Feel free to delete all the instructions off once you're done. \n", + "* Feel free to delete all the instruction markdown cells (including this one) once you're done. \n", "* Spoiler alert! Your end result will look something like [this](https://ha-starterkit-district--cal-itp-data-analyses.netlify.app/readme)." ] }, @@ -90,7 +90,7 @@ "* For every notebook you make, **you must copy and paste this block of code below in this exact order.** Otherwise, your notebook won't work.\n", "* What am I importing?\n", " * `%%capture`: Captures the parameter/yml parts.\n", - " * `import warnings warnings.filterwarnings('ignore')`: Sometimes when you are analyzing data, warnings pop up. These warnings are quite unattractive and we don't want them to be displayed in a portfolio so we turn off these warnings. You don't want to turn off the warnings if you are still analyzing your data! \n", + " * `import warnings warnings.filterwarnings('ignore')`: Sometimes when you are analyzing data, harmless warnings pop up. 
These warnings are quite unattractive and we don't want them to be displayed in a portfolio so we turn off these warnings.\n", " * `import calitp_data_analysis.magics`: the library that makes the parameterization magic happen.\n", "* **Resource**: [DDS Getting Notebooks Ready for Parameterization](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html#getting-ready-for-parameterization)" ] @@ -110,7 +110,7 @@ "import calitp_data_analysis.magics\n", "\n", "# All your other packages go here\n", - "# Here I just want pandas and my own utils.\n", + "# Here I only need pandas and my own utils.\n", "import pandas as pd\n", "import _starterkit_utils " ] @@ -136,8 +136,9 @@ "**Step 7: Setting your parameters**\n", "* While these steps have already been done for you, it would still benefit you to re-do these steps and refer to the resource below. \n", "* **Resource**: [DDS Docs Capturing Parameters](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html#capturing-parameters).\n", - "* **Parameter #1:** Set a cell that is commented out with your parameter. Turn on the parameter tag.\n", - " * To turn on the parameter tag: go to the code cell go to the upper right hand corner -> click on the gears -> go to “Cell Tags” -> Add Tag + -> add a tag called “parameters” -> click on the new “parameters” tag to ensure a checkmark shows up and it turns dark gray" + "* **Parameter #1:** Set a cell that is commented out with your parameter. \n", + " * Our parameter is every district in the `yml` file. 
\n", + " * Turn on the parameter tag: go to the code cell go to the upper right hand corner -> click on the gears -> go to “Cell Tags” -> Add Tag + -> add a tag called “parameters” -> click on the new “parameters” tag to ensure a checkmark shows up and it turns dark gray" ] }, { @@ -179,7 +180,9 @@ "id": "f9285e93-924d-4681-b526-97b8e46643b1", "metadata": {}, "source": [ - "* **Parameter #3:** The first markdown cell must include parameters to inject. This line below generates the title District 1 Analysis when it is creating the notebook for District 1. Likewise, it'll say District 2 Analysis for District 2's page. \n", + "* **Parameter #3:** \n", + "* The first markdown cell must include parameters to inject. \n", + "* This line below generates the title District 1 Analysis when it is creating the notebook for District 1. Likewise, it'll say District 2 Analysis for District 2's page. \n", "* Feel free to change this to anything you wish, but make sure this stays a markdown cell.\n", "* This cell is extremely important and read why [here](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html#header)." ] @@ -197,7 +200,7 @@ "id": "64615b63-3848-45b7-af2e-19c7b7346997", "metadata": {}, "source": [ - "**Step 8: Input your functions**\n", + "**Step 8: Input your functions that will create your report**\n", "* I am loading my dataset first.\n", "* Then I am adding in my dataset and the district parameter into `_starterkit_utils.create_district_summary`." 
] @@ -239,10 +242,10 @@ "metadata": {}, "source": [ "**Step 10: Build your portfolio**\n", - "* Double check you are at the root of your repo.\n", + "* Double check that you are at the root of your repo.\n", "* Replace `REPLACE_YML_NAME` with just the name of your `yml` file without the `.yml` extension into the command below.\n", "* Run `python portfolio/portfolio.py build REPLACE_YML_NAME --deploy` to build your portfolio.\n", - " * Example: My yml is called `ha_starterkit_district.yml` so I would run `python portfolio/portfolio.py build ha_starterkit_district --deploy`." + "* Example: My yml is called `ha_starterkit_district.yml` so I would run `python portfolio/portfolio.py build ha_starterkit_district --deploy`." ] }, { @@ -264,9 +267,10 @@ "metadata": {}, "source": [ "**Step 12: Something not right?**\n", - "* What if something is a little off? After updating your code, rerun this line of code to redo your portfolio. You must always `clean` your portfolio before regenerating new notebooks. \n", + "* What if something is a little off? After updating your code, rerun this line of code to redo your portfolio. \n", + " * You must always `clean` your portfolio before regenerating new notebooks. \n", "` python portfolio/portfolio.py clean REPLACE_YML_NAME && python portfolio/portfolio.py build REPLACE_YML_NAME --deploy`\n", - "* There are many other specifications you can add to `python portfolio/portfolio.py build` and they are all detailed on [DDS Other Specifications](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html#other-specifications). " + "* There are many other specifications you can add to `python portfolio/portfolio.py build`, detailed on [DDS Other Specifications](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html#other-specifications). 
" ] }, { @@ -276,12 +280,11 @@ "source": [ "**Step 13: Run a Makefile**\n", "* You can generate all 12 of your notebooks in one swift line of code instead of running the same couple of lines over and over again using a `Makefile`. \n", - "* You can think of a `Makefile` as a coffee machine that does the same thing day in and day out. \n", + "* You can think of a `Makefile` as a machine that does the same thing day in and day out. \n", " * You always install the same packages.\n", " * You always clean out the repo.\n", " * You generally will rerun the notebook in its entirety.\n", " * You always add the `md,yml,ipynb` and other files that the parameterization process creates.\n", - "* Makefiles are great for automating tasks and saving time. \n", "\n", "**Instructions** \n", "* Make sure you are still at the root of our repo `~/data-analyses`.\n",