Skip to content

Let's try calculating column diffs #4

Let's try calculating column diffs

Let's try calculating column diffs #4

name: Provider Map Jobs
on:
# TODO: Remove this once behavior has been finalized
push:
branches:
- vlad/provider-map-gha
workflow_dispatch:
schedule:
- cron: "0 0 * * 0"
jobs:
fetch_warehouse:
name: Fetch Warehouse Updates
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Authenticate with Google
uses: google-github-actions/auth@v1
with:
credentials_json: '${{ secrets.GCP_SA_KEY }}'
- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v1
- name: Download providers.csv from warehouse
run: |
bq query \
--quiet \
--headless \
--format=csv \
--use_legacy_sql=false \
"SELECT * FROM \`mart_transit_database.dim_mobility_mart_providers\`" \
> src/metadata/providers/providers.csv
- name: Fix our CSV file
run: |
# Workaround because of...
# https://github.com/google-github-actions/setup-gcloud/issues/666
sed -i -n -e '/agency_name/,$p' src/metadata/providers/providers.csv
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install pandas
run: |
pip install pandas
- name: Update Counties GeoJSON file
shell: python
run: |
import json
import pandas as pd
df = pd.read_csv('src/metadata/providers/providers.csv')
# Drop the null values for TAs that don't have counties served
df = df[df['counties_served'].notnull()]
# Do a group by for the counties served
county_counts = df['counties_served'].str.split(';') \
.explode('counties_served') \
.value_counts()
geojson_file = 'src/metadata/providers/counties.geojson'
geojson = json.load(open(geojson_file))
# Add the county counts to the geojson
for feature in geojson['features']:
county_name = feature['properties']['county']
if county_name in county_counts:
feature['properties']['num_providers'] = int(county_counts[county_name])
else:
feature['properties']['num_providers'] = 0
# Write the geojson back to the file
with open(geojson_file, 'w') as f:
json.dump(geojson, f, indent=2)
- name: Setup yq
uses: vegardit/gha-setup-yq@v1
with:
use-cache: true
version: 4.40.5
- name: Check for column updates
id: column-updates
run: |
curl -O https://raw.githubusercontent.com/cal-itp/data-infra/main/warehouse/models/mart/transit_database/_mart_transit_database.yml
columns_from_dbt=$(yq '.models[] | select(.name == "dim_mobility_mart_providers") | .columns[].name' _mart_transit_database.yml)
columns_from_repo=$(yq '.[].column' src/metadata/providers/dictionary.csv)
column_diff=$(diff <( printf '%s\n' "$columns_from_dbt" ) <( printf '%s\n' "$columns_from_repo" ))
echo "column_diff=$column_diff" >> "$GITHUB_OUTPUT"
- name: Create Pull Request
uses: peter-evans/create-pull-request@v5
with:
title: Provider Map Data Auto Update
body: |
It's that time again! The warehouse has delivered new data for us to use. This is an automatic pull request created by the `provider-map-jobs.yml` workflow; it is triggered via a cron that runs every Sunday at midnight UTC.
## Changed Columns
These are columns that differ between the warehouse and the repository. If you see descrepancies here, please update the `src/metadata/providers/dictionary.csv` file to match the warehouse.
```diff
${{ steps.column-updates.outputs.column_diff }}}
```
commit-message: Auto-update provider data from warehouse
add-paths: |
src/metadata/providers/providers.csv
src/metadata/providers/counties.geojson
base: main