# fix(provider-map): handle new datasets (#10)

name: Provider Map Jobs

# Refreshes the provider map data: exports the providers table from BigQuery,
# back-fills missing county data, recomputes per-county provider counts in the
# counties GeoJSON, and opens a pull request with the regenerated files.
on:
  # TODO: Remove this once behavior has been finalized
  push:
    branches:
      - vlad/provider-map-gha
  workflow_dispatch:
  schedule:
    # Minute 0, hour 0, every Sunday (GitHub evaluates cron schedules in UTC).
    - cron: "0 0 * * 0"

jobs:
  fetch_warehouse:
    name: Fetch Warehouse Updates
    runs-on: ubuntu-latest
    # The final step pushes a branch and opens a pull request with the
    # workflow token, which requires these scopes to be granted explicitly
    # when the repository default token permissions are read-only.
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Authenticate with Google
        uses: google-github-actions/auth@v1
        with:
          credentials_json: '${{ secrets.GCP_SA_KEY }}'

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v1

      # `bq query` returns at most 100 rows unless --max_rows is given, so an
      # explicit ceiling is passed to avoid silently truncating the export.
      # NOTE(review): 100000 is an arbitrary upper bound; confirm it is
      # comfortably above the table's row count.
      - name: Download providers.csv from warehouse
        run: |
          bq query \
            --quiet \
            --headless \
            --format=csv \
            --use_legacy_sql=false \
            --max_rows=100000 \
            "SELECT * FROM \`mart_transit_database.dim_mobility_mart_providers\`" \
            > src/metadata/providers/providers.csv

      - name: Fix our CSV file
        run: |
          # Workaround because of...
          # https://github.com/google-github-actions/setup-gcloud/issues/666
          # Keep only the lines from the first one containing "agency_name"
          # through the end of the file, dropping anything printed before it.
          sed -i -n -e '/agency_name/,$p' src/metadata/providers/providers.csv

      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install pandas
        run: |
          pip install pandas

      - name: Update Counties GeoJSON file
        shell: python
        run: |
          import json
          import pandas as pd

          providers_file = 'src/metadata/providers/providers.csv'
          df = pd.read_csv(providers_file)
          city_lookup = pd.read_csv('src/metadata/cities_to_county.csv')
          city_to_county = dict(zip(city_lookup['City'], city_lookup['County']))

          # Fill in the null values for counties served with the HQ county.
          # Iterate over row labels (not ntd_id values) so that a missing or
          # duplicated ntd_id neither crashes the job nor overwrites rows that
          # already have counties_served populated.
          for row in df.index[df['counties_served'].isna()]:
              city = df.loc[row, 'hq_city']
              # Use .get() so the 'City of <name>' fallback is actually tried;
              # a plain [] lookup raises KeyError before `or` is evaluated.
              county = (
                  city_to_county.get(city)
                  or city_to_county.get(f'City of {city}')
              )
              if county is None:
                  print("No county found for city: ", city)
                  continue
              df.loc[row, 'hq_county'] = county
              df.loc[row, 'counties_served'] = county

          # NOTE(review): to_csv writes the DataFrame index as an unnamed
          # first column by default; pass index=False if consumers of
          # providers.csv do not need it - confirm before changing.
          df.to_csv(providers_file)

          # Do a group by for the counties served: split the ';'-separated
          # values, emit one entry per county, then count occurrences.
          county_counts = (
              df['counties_served']
              .str.split(';')
              .explode()
              .value_counts()
          )

          geojson_file = 'src/metadata/providers/counties.geojson'
          with open(geojson_file) as f:
              geojson = json.load(f)

          # Add the county counts to the geojson (0 when a county has no
          # providers listed).
          for feature in geojson['features']:
              county_name = feature['properties']['county']
              if county_name in county_counts:
                  num_providers = int(county_counts[county_name])
              else:
                  num_providers = 0
              feature['properties']['num_providers'] = num_providers

          # Write the geojson back to the file
          with open(geojson_file, 'w') as f:
              json.dump(geojson, f, indent=2)

      # NOTE(review): no step visible in this workflow invokes yq; confirm
      # whether this setup step is still needed.
      - name: Setup yq
        uses: vegardit/gha-setup-yq@v1
        with:
          use-cache: true
          version: '4.40.5'

      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        with:
          title: Provider Map Data Auto Update
          body: |
            It's that time again! The warehouse has delivered new data for us to use. This is an automatic pull request created by the `provider-map-jobs.yml` workflow; it is triggered via a cron that runs every Sunday at midnight UTC.
          commit-message: Auto-update provider data from warehouse
          # Only the two regenerated data files are included in the commit.
          add-paths: |
            src/metadata/providers/providers.csv
            src/metadata/providers/counties.geojson
          base: main