Skip to content

Commit

Permalink
reorganize, new nb and script for reading in data entry
Browse files Browse the repository at this point in the history
  • Loading branch information
KatrinaMKaiser committed Aug 11, 2023
1 parent 035b03a commit 43d5adb
Show file tree
Hide file tree
Showing 6 changed files with 2,157 additions and 0 deletions.
275 changes: 275 additions & 0 deletions project_prioritization/metrics/landuse_demo_project.ipynb

Large diffs are not rendered by default.

34 changes: 34 additions & 0 deletions project_prioritization/metrics/metrics_catalog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
metadata:
version: 1
sources:
# project shapefiles
shp_demoproj_bikeped:
driver: shapefile
description: Watsonville-SC Bike-Ped Overcrossing
args:
urlpath: gs://calitp-analytics-data/data-analyses/project_prioritization/SantaCruz_Watsonville_Project_SHP_files/sc_bikeped_overcrossing.zip
use_fsspec: true
shp_demoproj_auxlane:
driver: shapefile
description: Watsonville-SC Auxiliary Lane
args:
urlpath: gs://calitp-analytics-data/data-analyses/project_prioritization/SantaCruz_Watsonville_Project_SHP_files/sc_auxiliary_lane.zip
use_fsspec: true
shp_demoproj_busshoulder:
driver: shapefile
description: Watsonville-SC Bus on Shoulder
args:
urlpath: gs://calitp-analytics-data/data-analyses/project_prioritization/SantaCruz_Watsonville_Project_SHP_files/sc_bus_on_shoulder.zip
use_fsspec: true
shp_demoproj_multimodal:
driver: shapefile
description: Watsonville-SC Multimodal Street
args:
urlpath: gs://calitp-analytics-data/data-analyses/project_prioritization/SantaCruz_Watsonville_Project_SHP_files/sc_multimodal_street.zip
use_fsspec: true
shp_demoproj_transitroute:
driver: shapefile
description: Santa Cruz Metro Route 69
args:
urlpath: gs://calitp-analytics-data/data-analyses/project_prioritization/SantaCruz_Watsonville_Project_SHP_files/transit_route.zip
use_fsspec: true
193 changes: 193 additions & 0 deletions project_prioritization/metrics/read_data_entry.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "34104da9-6af5-42e4-981c-c74ccf987005",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"CALITP_BQ_MAX_BYTES\"] = str(1_000_000_000_000) ## 1 TB (10**12 bytes)\n",
"\n",
"import geopandas as gpd\n",
"import pandas as pd\n",
"import numpy as np\n",
"from shared_utils import utils, geography_utils\n",
"\n",
"pd.options.display.max_columns = 100\n",
"\n",
"import gcsfs\n",
"\n",
"from calitp_data_analysis import get_fs\n",
"fs = get_fs()\n",
"from calitp_data_analysis.sql import to_snakecase\n",
"\n",
"GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/project_prioritization/\""
]
},
{
"cell_type": "markdown",
"id": "4c6a51c4-dda9-47e2-8a76-51b8cb183b9b",
"metadata": {},
"source": [
"# Read In CSIS Metrics Testing Data Entry\n",
"\n",
"Process data entry tabs and save as parquets "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05ddff70-f751-4754-ad0f-c0609488d084",
"metadata": {},
"outputs": [],
"source": [
"# safety\n",
"safety_df = to_snakecase(pd.read_excel(f'{GCS_FILE_PATH}Metrics_Scoring_All_Projects.xlsx', sheet_name=\"Safety\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5375388c-9f2c-4a67-b10e-d07619e006e1",
"metadata": {},
"outputs": [],
"source": [
"safety_df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f01bd3fd-c753-476a-b6f3-f3afb92eb0ab",
"metadata": {},
"outputs": [],
"source": [
"safety_df.crf_2.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "807f27a7-7603-464f-9d05-65e5e2842920",
"metadata": {},
"outputs": [],
"source": [
"# replace field that's entirely space (or empty) with NaN\n",
"safety_df=safety_df.replace(r'^\\s*$', np.nan, regex=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5bc6400-415b-402a-9bff-7a305bf597dc",
"metadata": {},
"outputs": [],
"source": [
"safety_df = safety_df.astype({'crf_1':'float','crf_2':'float'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d170ddc-f0a1-4062-a942-650ede2fdffc",
"metadata": {},
"outputs": [],
"source": [
"safety_df.to_parquet(f'{GCS_FILE_PATH}data_entry_raw_safety.parquet')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d864d8a1-f9d9-4a43-ba76-fa28693feab0",
"metadata": {},
"outputs": [],
"source": [
"# DAC Traffic Impacts\n",
"dac_traffic = pd.read_excel(f'{GCS_FILE_PATH}Metrics_Scoring_All_Projects.xlsx', sheet_name=\"DAC Traffic Impacts\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3ba39ef-faa4-4010-b7e2-033b5fb9eac2",
"metadata": {},
"outputs": [],
"source": [
"dac_traffic.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "604f1de6-68b4-4e5b-9ad7-c171338e4c3e",
"metadata": {},
"outputs": [],
"source": [
"dac_traffic.to_parquet(f'{GCS_FILE_PATH}data_entry_raw_dac_traffic.parquet')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f118f97-dcaf-433b-88ee-268557b6ac0c",
"metadata": {},
"outputs": [],
"source": [
"# Land Use\n",
"land_use = pd.read_excel(f'{GCS_FILE_PATH}Metrics_Scoring_All_Projects.xlsx', sheet_name=\"Land Use\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa2ef74e-c60a-4a54-a69d-ca77098cd00e",
"metadata": {},
"outputs": [],
"source": [
"land_use.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "968be7ac-9b13-4100-88d8-c1e6c7c69466",
"metadata": {},
"outputs": [],
"source": [
"print(pd. __version__)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "598992c9-c36a-4702-802d-aa75366c7410",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
28 changes: 28 additions & 0 deletions project_prioritization/metrics/read_data_entry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Read In CSIS Metrics Testing Data Entry
# Process data entry tabs and save as parquets

# header info
import pandas as pd
import numpy as np
from shared_utils import utils
pd.options.display.max_columns = 100
import gcsfs
from calitp_data_analysis.sql import to_snakecase

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"

# Load every data-entry tab in one pass. Passing a list to ``sheet_name``
# makes pd.read_excel return a dict of DataFrames keyed by sheet name, so the
# workbook is fetched from GCS and opened once instead of once per tab.
WORKBOOK_PATH = f'{GCS_FILE_PATH}Metrics_Scoring_All_Projects.xlsx'
sheets = pd.read_excel(
    WORKBOOK_PATH,
    sheet_name=["Safety", "DAC Traffic Impacts", "Land Use"],
)

# safety
safety = to_snakecase(sheets["Safety"])
# fix save error due to spaces in blank values: replace field that's entirely
# space (or empty) with NaN
safety = safety.replace(r'^\s*$', np.nan, regex=True)
# with the blank strings gone, the CRF columns can be cast to float
safety = safety.astype({'crf_1': 'float', 'crf_2': 'float'})
safety.to_parquet(f'{GCS_FILE_PATH}data_entry_raw_safety.parquet')

# DAC Traffic Impacts
dac_traffic = to_snakecase(sheets["DAC Traffic Impacts"])
dac_traffic.to_parquet(f'{GCS_FILE_PATH}data_entry_raw_dac_traffic.parquet')

# Land use
land_use = to_snakecase(sheets["Land Use"])
land_use.to_parquet(f'{GCS_FILE_PATH}data_entry_raw_land_use.parquet')

Loading

0 comments on commit 43d5adb

Please sign in to comment.