From f6bb956bee5df7cbd8dcf343d8c777e7cbbc9f29 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 15 Jan 2025 13:11:48 -0300 Subject: [PATCH 01/12] Add geocodio as geocoder --- docker-compose.yaml | 1 + requirements.txt | 1 + src/dbcp/transform/geocodio.py | 124 +++++++++++++++++++ src/dbcp/transform/gridstatus.py | 4 +- src/dbcp/transform/helpers.py | 119 +++++++++++++----- src/dbcp/transform/local_opposition.py | 23 +++- src/dbcp/transform/rmi_energy_communities.py | 10 +- src/dbcp/validation/tests.py | 2 +- 8 files changed, 244 insertions(+), 40 deletions(-) create mode 100644 src/dbcp/transform/geocodio.py diff --git a/docker-compose.yaml b/docker-compose.yaml index 7490c482..81b4077e 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -6,6 +6,7 @@ services: environment: - API_KEY_GOOGLE_MAPS=${API_KEY_GOOGLE_MAPS} # get this value from our google account: https://console.cloud.google.com/google/maps-apis/credentials?project=dbcp-dev&supportedpurview=project - AIRTABLE_API_KEY=${AIRTABLE_API_KEY} + - GEOCODIO_API_KEY=${GEOCODIO_API_KEY} depends_on: postgres: condition: service_healthy diff --git a/requirements.txt b/requirements.txt index b513b3cc..1031ae02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ psycopg2~=2.9.3 pytest~=6.2.5 tqdm>=4.64.1,<5.0.0 python-docx~=0.8.11 +pygeocodio~=1.4.0 googlemaps~=4.5.3 pandas-gbq~=0.19.1 pydata-google-auth~=1.7.0 diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py new file mode 100644 index 00000000..83053492 --- /dev/null +++ b/src/dbcp/transform/geocodio.py @@ -0,0 +1,124 @@ +"""Geocodio geocoding functions.""" + +import os +from pathlib import Path + +import pandas as pd +from geocodio import GeocodioClient +from joblib import Memory +from pydantic import BaseModel + +geocoder_local_cache = Path("/app/data/geocodio_cache") +GEOCODER_CACHE = Memory(location=geocoder_local_cache, bytes_limit=2**19) + + +class AddressComponents(BaseModel): + """Address components from Geocodio.""" + + number: str = "" + predirectional: str = "" + street: str = "" + suffix: str = "" + formatted_street: str = "" + city: str = "" + county: str = "" + state: str = "" + zip: str = "" # noqa: A003 + country: str = "" + + +class Location(BaseModel): + """Location from Geocodio.""" + + lat: float = 0.0 + lng: float = 0.0 + + +class AddressData(BaseModel): + """Address data from Geocodio.""" + + address_components: AddressComponents + formatted_address: str = "" + location: Location + accuracy: float = 0.0 + accuracy_type: str = "" + source: str = "" + + +def _geocode_batch( + batch: pd.DataFrame, client: GeocodioClient, state_col: str, locality_col: str +) -> pd.DataFrame: + """Geocode a batch of addresses. 
+ + Args: + batch: dataframe with address components + client: GeocodioClient object + state_col: name of the state column + locality_col: name of the locality column + + Returns: + dataframe with geocoded locality information + """ + batch = batch.rename(columns={locality_col: "city", state_col: "state"}) + batch["country"] = "US" + components_data = batch.to_dict(orient="records") + results = client.geocode(components_data=components_data) + + results_df = [] + for result in results: + if "error" in result: + results_df.append(["", "", ""]) + elif result["results"]: + ad = AddressData.parse_obj(result["results"][0]) + locality_type = ad.accuracy_type + if locality_type == "place": + locality_name = ad.address_components.city + locality_type = "city" + elif locality_type == "county": + locality_name = ad.address_components.county + else: + locality_name = "" + results_df.append( + [locality_name, locality_type, ad.address_components.county] + ) + else: + results_df.append(["", "", ""]) + + results_df = pd.DataFrame( + results_df, + columns=[ + "geocoded_locality_name", + "geocoded_locality_type", + "geocoded_containing_county", + ], + index=batch.index, + ) + return results_df + + +@GEOCODER_CACHE.cache() +def _geocode_locality( + state_locality_df: pd.DataFrame, + state_col: str = "state", + locality_col: str = "county", + batch_size: int = 100, +) -> pd.DataFrame: + """Geocode locality names in a dataframe. + + Args: + state_locality_df: dataframe with state and locality columns + state_col: name of the state column + locality_col: name of the locality column + batch_size: number of rows to geocode at once + Returns: + dataframe with geocoded locality information + """ + GEOCODIO_API_KEY = os.environ["GEOCODIO_API_KEY"] + client = GeocodioClient(GEOCODIO_API_KEY) + + geocoded_df = [] + + for start in range(0, len(state_locality_df), batch_size): + batch = state_locality_df.iloc[start : start + batch_size] # noqa: E203 + geocoded_df.append(_geocode_batch(batch, client, state_col, locality_col)) + return pd.concat(geocoded_df) diff --git a/src/dbcp/transform/gridstatus.py b/src/dbcp/transform/gridstatus.py index e5b5ec2f..c2564317 100644 --- a/src/dbcp/transform/gridstatus.py +++ b/src/dbcp/transform/gridstatus.py @@ -478,7 +478,7 @@ def _clean_resource_type( resource_locations["county_id_fips"].isin(coastal_county_id_fips.keys()) & resource_locations.resource_clean.eq("Onshore Wind") ].project_id - expected_n_coastal_wind_projects = 88 + expected_n_coastal_wind_projects = 81 assert ( len(nyiso_coastal_wind_project_project_ids) == expected_n_coastal_wind_projects ), f"Expected {expected_n_coastal_wind_projects} NYISO coastal wind projects but found {len(nyiso_coastal_wind_project_project_ids)}" @@ -1120,7 +1120,7 @@ def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]: intermediate_creator=_prep_for_deduplication, ) dupes = pre_dedupe - len(deduped_projects) - logger.info(f"Deduplicated {dupes} ({dupes/pre_dedupe:.2%}) projects.") + logger.info(f"Deduplicated {dupes} ({dupes / pre_dedupe:.2%}) projects.") # Normalize data ( diff --git a/src/dbcp/transform/helpers.py b/src/dbcp/transform/helpers.py index 1fa3bb3a..cea52925 100644 --- a/src/dbcp/transform/helpers.py +++ b/src/dbcp/transform/helpers.py @@ -8,6 +8,7 @@ from dbcp.constants import FIPS_CODE_VINTAGE from dbcp.helpers import add_fips_ids +from dbcp.transform import geocodio from dbcp.transform.geocoding import GoogleGeocoder UNIX_EPOCH_ORIGIN = pd.Timestamp("01/01/1970") @@ -299,6 +300,52 @@ def 
_geocode_locality( return new_cols +def test_geocode_and_add_fips( + nan_fips: pd.DataFrame, state_col="state", locality_col="county", api="geocodio" +) -> pd.DataFrame: + """Geocode locality names in a dataframe and add FIPS codes.""" + # Deduplicate on the state and locality columns to minimize API calls + key_cols = [state_col, locality_col] + deduped_nan_fips = nan_fips.loc[:, key_cols].drop_duplicates() + if api == "google": + deduped_geocoded = _geocode_locality( + deduped_nan_fips, + # pass subset to _geocode_locality to maximize chance of a cache hit + # (this way other columns can change but caching still works) + state_col=state_col, + locality_col=locality_col, + ) + elif api == "geocodio": + deduped_geocoded = geocodio._geocode_locality( + deduped_nan_fips, + state_col=state_col, + locality_col=locality_col, + ) + else: + raise ValueError(f"Unknown API: {api}") + + # recombine deduped geocoded data with original nan_fips + geocoded_deduped_nan_fips = pd.concat( + [deduped_nan_fips[key_cols], deduped_geocoded], axis=1 + ) + index_name = nan_fips.index.name + index_name = index_name if index_name is not None else "index" + geocoded = ( + nan_fips.reset_index() + .merge(geocoded_deduped_nan_fips, on=key_cols, how="left", validate="m:1") + .set_index(index_name)[deduped_geocoded.columns] + ) + + nan_fips = pd.concat([nan_fips, geocoded], axis=1) + # add fips using geocoded names + return add_fips_ids( + nan_fips, + state_col=state_col, + county_col="geocoded_containing_county", + vintage=FIPS_CODE_VINTAGE, + ) + + def add_county_fips_with_backup_geocoding( state_locality_df: pd.DataFrame, state_col="state", locality_col="county" ) -> pd.DataFrame: @@ -315,6 +362,14 @@ def add_county_fips_with_backup_geocoding( Returns: pd.DataFrame: copy of state_locality_df with new columns 'geocoded_locality_name', 'geocoded_locality_type', 'geocoded_containing_county' """ + cols_to_keep = [ + "state_id_fips", + "county_id_fips", + "geocoded_locality_name", + "geocoded_locality_type", + "geocoded_containing_county", + ] + filled_state_locality = state_locality_df.loc[:, [state_col, locality_col]].fillna( "" ) # copy @@ -332,7 +387,8 @@ def add_county_fips_with_backup_geocoding( with_fips["geocoded_locality_name"] = with_fips[locality_col] with_fips["geocoded_locality_type"] = "county" with_fips["geocoded_containing_county"] = with_fips[locality_col] - return with_fips + # attach to original df + return pd.concat([state_locality_df, with_fips[cols_to_keep]], axis=1) good_fips = with_fips.loc[~fips_is_nan, :].copy() # standardize output columns @@ -343,45 +399,40 @@ def add_county_fips_with_backup_geocoding( # geocode the lookup failures - they are often city/town names (instead of counties) or simply mis-spelled nan_fips = with_fips.loc[fips_is_nan, :].copy() - # Deduplicate on the state and locality columns to minimize API calls - key_cols = [state_col, locality_col] - deduped_nan_fips = nan_fips.loc[:, key_cols].drop_duplicates() - deduped_geocoded = _geocode_locality( - deduped_nan_fips, - # pass subset to _geocode_locality to maximize chance of a cache hit - # (this way other columns can change but caching still works) - state_col=state_col, - locality_col=locality_col, + google = test_geocode_and_add_fips( + nan_fips, state_col=state_col, locality_col=locality_col, api="google" ) - # recombine deduped geocoded data with original nan_fips - geocoded_deduped_nan_fips = pd.concat( - [deduped_nan_fips[key_cols], deduped_geocoded], axis=1 - ) - index_name = nan_fips.index.name - index_name = 
index_name if index_name is not None else "index"
-    geocoded = (
-        nan_fips.reset_index()
-        .merge(geocoded_deduped_nan_fips, on=key_cols, how="left", validate="m:1")
-        .set_index(index_name)[deduped_geocoded.columns]
+    geocodio = test_geocode_and_add_fips(
+        nan_fips, state_col=state_col, locality_col=locality_col, api="geocodio"
     )

-    nan_fips = pd.concat([nan_fips, geocoded], axis=1)
-    # add fips using geocoded names
-    filled_fips = add_fips_ids(
-        nan_fips,
-        state_col=state_col,
-        county_col="geocoded_containing_county",
-        vintage=FIPS_CODE_VINTAGE,
+    # compare geocoding results
+    comp = geocodio.merge(
+        google,
+        how="left",
+        validate="1:1",
+        left_index=True,
+        right_index=True,
+        suffixes=("_geocodio", "_google"),
+    )
+    print("--------------------------------")
+    print("Geocoding comparison:")
+    print(
+        comp.county_id_fips_google.eq(comp.county_id_fips_geocodio).value_counts(
+            dropna=False
+        )
     )
+    # raw_comp = pd.concat(
+    #     [comp, state_locality_df.loc[comp.index]], axis=1
+    # )  # noqa: F841
+    # eq = comp.geocoded_locality_name_geocodio.eq(
+    #     comp.geocoded_locality_name_google
+    # )  # noqa: F841
+    print("--------------------------------")
+
+    filled_fips = geocodio

     # recombine and restore row order
-    cols_to_keep = [
-        "state_id_fips",
-        "county_id_fips",
-        "geocoded_locality_name",
-        "geocoded_locality_type",
-        "geocoded_containing_county",
-    ]
     recombined = pd.concat([good_fips, filled_fips], axis=0).loc[
         state_locality_df.index, cols_to_keep
     ]
diff --git a/src/dbcp/transform/local_opposition.py b/src/dbcp/transform/local_opposition.py
index ff26d66b..68cb3f1c 100644
--- a/src/dbcp/transform/local_opposition.py
+++ b/src/dbcp/transform/local_opposition.py
@@ -96,19 +96,40 @@ def _transform_local_ordinances(local_ord_df: pd.DataFrame) -> pd.DataFrame:

     # manual corrections
     location_corrections = {
-        "Batavia Township (Clermont County)": "Batavia Township (Branch County)",
+        "Batavia Township (Clermont County)": "Branch County",
         "Town of Albion (Kennebec County)": "Albion (Kennebec County)",
         "Town of Lovell (Oxford County)": "Lovell (Oxford County)",
         "Town of Charlton (Worcester County)": "Charlton (Worcester County)",
         "City of Owasso (Rogers and Tulsa Counties)": "Owasso (Rogers and Tulsa Counties)",
         "City of Burleson (Tarrant and Johnson Counties)": "Burleson (Tarrant and Johnson Counties)",
         "Montrose City (Genesee County)": "Montrose (Genesee County)",
+        "Genoa Township (Livingston County)": "Livingston County",
+        "Maple Valley Township (Montcalm County)": "Montcalm County",
+        "Ellington Township (Tuscola County)": "Tuscola County",
+        "Almer Township (Tuscola County)": "Tuscola County",
+        "Beaver Township (Bay County)": "Bay County",
+        "Matteson Township (Branch County)": "Branch County",
+        "Monitor Township (Bay County)": "Bay County",
+        "Town of Porter (Niagara County)": "Niagara County",
     }
+    raw_locality = local["locality"].copy()
     local.loc[:, "locality"].replace(location_corrections, inplace=True)

+    # Remove (County Name) from localities because geocodio performs better with just the locality name
+    local["locality"] = local["locality"].str.replace(r"\s?\(.*?\)", "", regex=True)
+
+    # Remove "City of" and "Town of" prefixes from localities that have them
+    # Geocodio thinks these prefixes are street names
+    local["locality"] = local["locality"].str.replace(
+        r"^(City of|Town of) ", "", regex=True
+    )
+
     # add fips codes to counties (but many names are cities)
     with_fips = add_county_fips_with_backup_geocoding(local, locality_col="locality")

+    # undo locality corrections so we can view the raw data
+    with_fips.loc[:, "locality"] = raw_locality
+
     year_summaries = _extract_years(local["ordinance_text"])
     local = pd.concat([with_fips, year_summaries], axis=1)
     local.rename(
diff --git a/src/dbcp/transform/rmi_energy_communities.py b/src/dbcp/transform/rmi_energy_communities.py
index 710f78a8..549b7642 100644
--- a/src/dbcp/transform/rmi_energy_communities.py
+++ b/src/dbcp/transform/rmi_energy_communities.py
@@ -1,4 +1,5 @@
 """Transformations for RMI's energy communities analysis."""
+
 import pandas as pd

 from dbcp.transform.helpers import add_county_fips_with_backup_geocoding
@@ -18,6 +19,10 @@ def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
         "percent_of_county_coal_qualified": "coal_qualifying_area_fraction",
     }
     transformed.rename(columns=rename_dict, inplace=True)
+    transformed["raw_county_name"] = transformed.apply(
+        lambda row: row["raw_county_name"].replace(f", {row['raw_state_name']}", ""),
+        axis=1,
+    )
     # fix two counties whose FIPS changed from 2010 to 2015
     transformed = add_county_fips_with_backup_geocoding(
         transformed, state_col="raw_state_name", locality_col="raw_county_name"
@@ -25,9 +30,10 @@ def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:

     # fix one null FIPS (Villalba Municipio, Puerto Rico)
     fips_is_nan = transformed["county_id_fips"].isna()
+    expected_null_fips = 0
     assert (
-        fips_is_nan.sum() == 1
-    ), f"Assumption violation: expected 1 null FIPS, got {fips_is_nan.sum()}"
+        fips_is_nan.sum() == expected_null_fips
+    ), f"Assumption violation: expected {expected_null_fips} null FIPS, got {fips_is_nan.sum()}"
     transformed.loc[:, "county_id_fips"] = transformed.loc[:, "county_id_fips"].fillna(
         transformed.loc[:, "raw_county_id_fips"]
     )
diff --git a/src/dbcp/validation/tests.py b/src/dbcp/validation/tests.py
index 70039990..58757e88 100644
--- a/src/dbcp/validation/tests.py
+++ b/src/dbcp/validation/tests.py
@@ -243,7 +243,7 @@ def test_county_wide_coverage(engine: Engine):
         df.shape[0] == n_counties
     ), "counties_wide_format does not contain all counties"
     notnull = df.notnull()
-    n_expected_counties = 2471
+    n_expected_counties = 2472
     assert notnull.any(axis=1).sum() == n_expected_counties, (
         "counties_wide_format has unexpected county coverage."
f" Expected {n_expected_counties}, found {notnull.any(axis=1).sum()}" From 4724e9ee3995633c9d1c6da18ad08fe0c4263cb5 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 22 Jan 2025 12:13:54 -0300 Subject: [PATCH 02/12] Clean up geocodio and google maps api logic --- .github/workflows/test-full-build.yml | 2 +- .github/workflows/update-data.yml | 2 +- README.md | 5 +- docker-compose.yaml | 2 +- src/dbcp/cli.py | 4 +- src/dbcp/etl.py | 4 +- src/dbcp/transform/geocodio.py | 11 +- .../{geocoding.py => google_maps.py} | 76 +++++++++ src/dbcp/transform/helpers.py | 160 +++++------------- test/unit/test_geocoding.py | 4 +- 10 files changed, 145 insertions(+), 125 deletions(-) rename src/dbcp/transform/{geocoding.py => google_maps.py} (70%) diff --git a/.github/workflows/test-full-build.yml b/.github/workflows/test-full-build.yml index 72a2bfcd..72c4b08d 100644 --- a/.github/workflows/test-full-build.yml +++ b/.github/workflows/test-full-build.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest env: - API_KEY_GOOGLE_MAPS: ${{ secrets.API_KEY_GOOGLE_MAPS }} + GEOCODIO_API_KEY: ${{ secrets.GEOCODIO_API_KEY }} steps: - name: Check out code diff --git a/.github/workflows/update-data.yml b/.github/workflows/update-data.yml index 60211711..d1a76989 100644 --- a/.github/workflows/update-data.yml +++ b/.github/workflows/update-data.yml @@ -93,7 +93,7 @@ jobs: matrix: ${{ fromJSON(needs.matrix_prep.outputs.matrix) }} fail-fast: false env: - API_KEY_GOOGLE_MAPS: ${{ secrets.API_KEY_GOOGLE_MAPS }} + GEOCODIO_API_KEY: ${{ secrets.GEOCODIO_API_KEY }} GITHUB_REF: ${{ github.ref_name }} # This is changed to dev if running on a schedule steps: - name: print matrix diff --git a/README.md b/README.md index 3a9aef39..5c95b72d 100644 --- a/README.md +++ b/README.md @@ -77,10 +77,11 @@ export GOOGLE_GHA_CREDS_PATH= `GOOGLE_GHA_CREDS_PATH` will be mounted into the container so the GCP APIs in the container can access the data stored in GCP. -You'll also need to set an environment variable for the Google Maps API Key: +You'll also need to set an environment variable for the Geocodio API Key. This api key is stored +GCP project Secret Manager as `geocodio-api-key`. 
 ```
-export API_KEY_GOOGLE_MAPS={Google Maps API key for GCP project dbcp-dev-350818}
+export GEOCODIO_API_KEY={Geocodio API key}
 ```

 ## Git Pre-commit Hooks
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 81b4077e..190de645 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -6,7 +6,7 @@ services:
     environment:
       - API_KEY_GOOGLE_MAPS=${API_KEY_GOOGLE_MAPS} # get this value from our google account: https://console.cloud.google.com/google/maps-apis/credentials?project=dbcp-dev&supportedpurview=project
       - AIRTABLE_API_KEY=${AIRTABLE_API_KEY}
-      - GEOCODIO_API_KEY=${GEOCODIO_API_KEY}
+      - GEOCODIO_API_KEY=${GEOCODIO_API_KEY} # This API key is stored in the GCP project's Secret Manager as geocodio-api-key
     depends_on:
       postgres:
         condition: service_healthy
diff --git a/src/dbcp/cli.py b/src/dbcp/cli.py
index 3593010f..36645228 100644
--- a/src/dbcp/cli.py
+++ b/src/dbcp/cli.py
@@ -10,7 +10,7 @@
 from dbcp.commands.publish import publish_outputs
 from dbcp.commands.settings import save_settings
 from dbcp.transform.fips_tables import SPATIAL_CACHE
-from dbcp.transform.helpers import GEOCODER_CACHE
+from dbcp.transform.helpers import GEOCODER_CACHES

 logger = logging.getLogger(__name__)
@@ -53,7 +53,7 @@ def cli(loglevel):
 def etl(data_mart: bool, data_warehouse: bool, clear_cache: bool):
     """Run the ETL process to produce the data warehouse and mart."""
     if clear_cache:
-        GEOCODER_CACHE.clear()
+        GEOCODER_CACHES.clear_caches()
         SPATIAL_CACHE.clear()

     if data_warehouse:
diff --git a/src/dbcp/etl.py b/src/dbcp/etl.py
index 6766e921..2503e295 100644
--- a/src/dbcp/etl.py
+++ b/src/dbcp/etl.py
@@ -16,7 +16,7 @@
 from dbcp.extract.ncsl_state_permitting import NCSLScraper
 from dbcp.helpers import enforce_dtypes, psql_insert_copy
 from dbcp.transform.fips_tables import SPATIAL_CACHE
-from dbcp.transform.helpers import GEOCODER_CACHE
+from dbcp.transform.helpers import GEOCODER_CACHES
 from dbcp.validation.tests import validate_warehouse

 logger = logging.getLogger(__name__)
@@ -244,7 +244,7 @@ def etl():
     """Run dbc ETL."""
     # Reduce size of caches if necessary
-    GEOCODER_CACHE.reduce_size()
+    GEOCODER_CACHES.reduce_cache_sizes()
     SPATIAL_CACHE.reduce_size()

     # Run public ETL functions
diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py
index 83053492..92ccded2 100644
--- a/src/dbcp/transform/geocodio.py
+++ b/src/dbcp/transform/geocodio.py
@@ -8,7 +8,16 @@
 from joblib import Memory
 from pydantic import BaseModel

-geocoder_local_cache = Path("/app/data/geocodio_cache")
+try:  # docker path
+    # 3 directories above current module
+    geocoder_local_cache = Path("/app/data/geocodio_cache")
+    assert geocoder_local_cache.exists()
+except AssertionError:  # local path
+    # 4 directories above current module
+    geocoder_local_cache = Path(__file__).resolve().parents[3] / "data/geocodio_cache"
+    assert geocoder_local_cache.exists()
+# cache needs to be accessed outside this module to call .clear()
+# limit cache size to 100 KB, keeps most recently accessed first
 GEOCODER_CACHE = Memory(location=geocoder_local_cache, bytes_limit=2**19)
diff --git a/src/dbcp/transform/geocoding.py b/src/dbcp/transform/google_maps.py
similarity index 70%
rename from src/dbcp/transform/geocoding.py
rename to src/dbcp/transform/google_maps.py
index c1af3ad6..c71efe9d 100644
--- a/src/dbcp/transform/geocoding.py
+++ b/src/dbcp/transform/google_maps.py
@@ -1,14 +1,32 @@
 """Classes and functions for geocoding address data using Google API."""
+
 import os
 from functools import lru_cache
 from logging import getLogger
+from pathlib import Path
 from typing import Dict, List, Optional
 from warnings import warn

 import googlemaps
+import pandas as pd
+from joblib import Memory

 logger = getLogger("__name__")

+try:  # docker path
+    # 3 directories above current module
+    geocoder_local_cache = Path("/app/data/google_geocoder_cache")
+    assert geocoder_local_cache.exists()
+except AssertionError:  # local path
+    # 4 directories above current module
+    geocoder_local_cache = (
+        Path(__file__).resolve().parents[3] / "data/google_geocoder_cache"
+    )
+    assert geocoder_local_cache.exists()
+# cache needs to be accessed outside this module to call .clear()
+# limit cache size to 100 KB, keeps most recently accessed first
+GEOCODER_CACHE = Memory(location=geocoder_local_cache, bytes_limit=2**19)
+

 class GoogleGeocoder(object):
     """Class to interact with Google's Geocoding API."""

@@ -202,3 +220,61 @@ def _get_geocode_response(
         return response[0]
     except IndexError:  # empty list = not found
         return {}
+
+
+def _geocode_row(
+    ser: pd.Series, client: GoogleGeocoder, state_col="state", locality_col="county"
+) -> List[str]:
+    """Function to pass into pandas df.apply() to geocode state/locality pairs.
+
+    Args:
+        ser (pd.Series): a row of a larger dataframe to geocode
+        client (GoogleGeocoder): client for Google Maps Platform API
+        state_col (str, optional): name of the column of state names. Defaults to 'state'.
+        locality_col (str, optional): name of the column of locality names. Defaults to 'county'.
+
+    Returns:
+        List[str]: geocoded_locality_name, geocoded_locality_type, and geocoded_containing_county
+    """
+    client.geocode_request(name=ser[locality_col], state=ser[state_col])
+    return client.describe()
+
+
+@GEOCODER_CACHE.cache()
+def _geocode_locality(
+    state_locality_df: pd.DataFrame, state_col="state", locality_col="county"
+) -> pd.DataFrame:
+    """Use Google Maps Platform API to look up information about state/locality pairs in a dataframe.
+
+    Args:
+        state_locality_df (pd.DataFrame): dataframe with state and locality columns
+        state_col (str, optional): name of the column of state names. Defaults to 'state'.
+        locality_col (str, optional): name of the column of locality names. Defaults to 'county'.
+
+    Returns:
+        pd.DataFrame: new columns 'geocoded_locality_name', 'geocoded_locality_type', 'geocoded_containing_county'
+    """
+    # NOTE: the purpose of the cache decorator is primarily to
+    # reduce API calls during development. A secondary benefit is to reduce
+    # execution time due to slow synchronous requests.
+    # That's why this is persisted to disk with joblib, not in memory with LRU_cache or something.
+    # Because it is on disk, caching the higher level dataframe function causes less IO overhead
+    # than caching individual API calls would.
+    # Because the entire input dataframe must be identical to the cached version, I
+    # recommend subsetting the dataframe to only state_col and locality_col when calling
+    # this function. That allows other, unrelated columns to change but still use the geocode cache.
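+    # A hypothetical call that follows this advice (names are illustrative):
+    #     subset = df.loc[:, ["state", "county"]].drop_duplicates()
+    #     geocoded = _geocode_locality(subset)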
+ geocoder = GoogleGeocoder() + new_cols = state_locality_df.apply( + _geocode_row, + axis=1, + result_type="expand", + client=geocoder, + state_col=state_col, + locality_col=locality_col, + ) + new_cols.columns = [ + "geocoded_locality_name", + "geocoded_locality_type", + "geocoded_containing_county", + ] + return new_cols diff --git a/src/dbcp/transform/helpers.py b/src/dbcp/transform/helpers.py index cea52925..76748b37 100644 --- a/src/dbcp/transform/helpers.py +++ b/src/dbcp/transform/helpers.py @@ -1,6 +1,6 @@ """Common transform operations.""" -from pathlib import Path +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Sequence import pandas as pd @@ -8,8 +8,7 @@ from dbcp.constants import FIPS_CODE_VINTAGE from dbcp.helpers import add_fips_ids -from dbcp.transform import geocodio -from dbcp.transform.geocoding import GoogleGeocoder +from dbcp.transform import geocodio, google_maps UNIX_EPOCH_ORIGIN = pd.Timestamp("01/01/1970") # Excel parser is simplified and will be one day off for dates < 1900/03/01 @@ -18,17 +17,30 @@ # See xlrd.xldate.py:xldate_as_datetime for complete implementation. EXCEL_EPOCH_ORIGIN = pd.Timestamp("12/30/1899") -try: # docker path - # 3 directories above current module - geocoder_local_cache = Path("/app/data/geocoder_cache") - assert geocoder_local_cache.exists() -except AssertionError: # local path - # 4 directories above current module - geocoder_local_cache = Path(__file__).resolve().parents[3] / "data/geocoder_cache" - assert geocoder_local_cache.exists() -# cache needs to be accessed outside this module to call .clear() -# limit cache size to 100 KB, keeps most recently accessed first -GEOCODER_CACHE = Memory(location=geocoder_local_cache, bytes_limit=2**19) + +@dataclass +class MemoryCaches: + """ + Container for multiple Memory caches. + + Attributes: + caches: list of Memory caches + """ + + caches: list[Memory] + + def reduce_cache_sizes(self): + """Reduce the size of all caches.""" + for cache in self.caches: + cache.reduce_size() + + def clear_caches(self): + """Clear all caches.""" + for cache in self.caches: + cache.clear() + + +GEOCODER_CACHES = MemoryCaches([geocodio.GEOCODER_CACHE, google_maps.GEOCODER_CACHE]) def normalize_multicolumns_to_rows( @@ -242,88 +254,39 @@ def parse_dates(series: pd.Series, expected_mean_year=2020) -> pd.Series: return multiformat_string_date_parser(series) -def _geocode_row( - ser: pd.Series, client: GoogleGeocoder, state_col="state", locality_col="county" -) -> List[str]: - """Function to pass into pandas df.apply() to geocode state/locality pairs. - - Args: - ser (pd.Series): a row of a larger dataframe to geocode - client (GoogleGeocoder): client for Google Maps Platform API - state_col (str, optional): name of the column of state names. Defaults to 'state'. - locality_col (str, optional): name of the column of locality names. Defaults to 'county'. - - Returns: - List[str]: geocoded_locality_name, geocoded_locality_type, and geocoded_containing_county - """ - client.geocode_request(name=ser[locality_col], state=ser[state_col]) - return client.describe() - - -@GEOCODER_CACHE.cache() -def _geocode_locality( - state_locality_df: pd.DataFrame, state_col="state", locality_col="county" +def _geocode_and_add_fips( + nan_fips: pd.DataFrame, state_col="state", locality_col="county", api="geocodio" ) -> pd.DataFrame: - """Use Google Maps Platform API to look up information about state/locality pairs in a dataframe. + """Geocode locality names in a dataframe and add FIPS codes. 
- Args: - state_locality_df (pd.DataFrame): dataframe with state and locality columns - state_col (str, optional): name of the column of state names. Defaults to 'state'. - locality_col (str, optional): name of the column of locality names. Defaults to 'county'. + This is used for records that are not assigned a FIPS code by the initial lookup. + Args: + nan_fips: dataframe with state and locality columns + state_col: name of the state column + locality_col: name of the locality column + api: name of the geocoding API to use Returns: - pd.DataFrame: new columns 'geocoded_locality_name', 'geocoded_locality_type', 'geocoded_containing_county' + dataframe with geocoded locality information """ - # NOTE: the purpose of the cache decorator is primarily to - # reduce API calls during development. A secondary benefit is to reduce - # execution time due to slow synchronous requests. - # That's why this is persisted to disk with joblib, not in memory with LRU_cache or something. - # Because it is on disk, caching the higher level dataframe function causes less IO overhead - # than caching individual API calls would. - # Because the entire input dataframe must be identical to the cached version, I - # recommend subsetting the dataframe to only state_col and locality_col when calling - # this function. That allows other, unrelated columns to change but still use the geocode cache. - geocoder = GoogleGeocoder() - new_cols = state_locality_df.apply( - _geocode_row, - axis=1, - result_type="expand", - client=geocoder, - state_col=state_col, - locality_col=locality_col, - ) - new_cols.columns = [ - "geocoded_locality_name", - "geocoded_locality_type", - "geocoded_containing_county", - ] - return new_cols - - -def test_geocode_and_add_fips( - nan_fips: pd.DataFrame, state_col="state", locality_col="county", api="geocodio" -) -> pd.DataFrame: - """Geocode locality names in a dataframe and add FIPS codes.""" # Deduplicate on the state and locality columns to minimize API calls key_cols = [state_col, locality_col] deduped_nan_fips = nan_fips.loc[:, key_cols].drop_duplicates() if api == "google": - deduped_geocoded = _geocode_locality( - deduped_nan_fips, - # pass subset to _geocode_locality to maximize chance of a cache hit - # (this way other columns can change but caching still works) - state_col=state_col, - locality_col=locality_col, - ) + geocoding_module = google_maps elif api == "geocodio": - deduped_geocoded = geocodio._geocode_locality( - deduped_nan_fips, - state_col=state_col, - locality_col=locality_col, - ) + geocoding_module = geocodio else: raise ValueError(f"Unknown API: {api}") + deduped_geocoded = geocoding_module._geocode_locality( + deduped_nan_fips, + # pass subset to _geocode_locality to maximize chance of a cache hit + # (this way other columns can change but caching still works) + state_col=state_col, + locality_col=locality_col, + ) + # recombine deduped geocoded data with original nan_fips geocoded_deduped_nan_fips = pd.concat( [deduped_nan_fips[key_cols], deduped_geocoded], axis=1 @@ -399,39 +362,10 @@ def add_county_fips_with_backup_geocoding( # geocode the lookup failures - they are often city/town names (instead of counties) or simply mis-spelled nan_fips = with_fips.loc[fips_is_nan, :].copy() - google = test_geocode_and_add_fips( - nan_fips, state_col=state_col, locality_col=locality_col, api="google" - ) - geocodio = test_geocode_and_add_fips( + filled_fips = _geocode_and_add_fips( nan_fips, state_col=state_col, locality_col=locality_col, api="geocodio" ) - # compare geocoding 
results - comp = geocodio.merge( - google, - how="left", - validate="1:1", - left_index=True, - right_index=True, - suffixes=("_geocodio", "_google"), - ) - print("--------------------------------") - print("Geocoding comparison:") - print( - comp.county_id_fips_google.eq(comp.county_id_fips_geocodio).value_counts( - dropna=False - ) - ) - # raw_comp = pd.concat( - # [comp, state_locality_df.loc[comp.index]], axis=1 - # ) # noqa: F841 - # eq = comp.geocoded_locality_name_geocodio.eq( - # comp.geocoded_locality_name_google - # ) # noqa: F841 - print("--------------------------------") - - filled_fips = geocodio - # recombine and restore row order recombined = pd.concat([good_fips, filled_fips], axis=0).loc[ state_locality_df.index, cols_to_keep diff --git a/test/unit/test_geocoding.py b/test/unit/test_geocoding.py index 8df22e9c..beec8703 100644 --- a/test/unit/test_geocoding.py +++ b/test/unit/test_geocoding.py @@ -1,7 +1,7 @@ -"""Test suite for dbcp.transform.geocoding module.""" +"""Test suite for dbcp.transform.google_maps module.""" import pytest -from dbcp.transform.geocoding import GoogleGeocoder +from dbcp.transform.google_maps import GoogleGeocoder class mock_geocoder(GoogleGeocoder): From 8d4c579ba47d52e42c53ce46b37c06e66c53bfd4 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 22 Jan 2025 16:02:03 -0300 Subject: [PATCH 03/12] Add tests, debug option to geocoding and use full address field for geocodio instead of components data --- src/dbcp/transform/geocodio.py | 6 +-- src/dbcp/transform/helpers.py | 39 ++++++++++++++++- test/unit/test_geocoding.py | 80 ++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 6 deletions(-) diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py index 92ccded2..b6fd64cd 100644 --- a/src/dbcp/transform/geocodio.py +++ b/src/dbcp/transform/geocodio.py @@ -68,10 +68,8 @@ def _geocode_batch( Returns: dataframe with geocoded locality information """ - batch = batch.rename(columns={locality_col: "city", state_col: "state"}) - batch["country"] = "US" - components_data = batch.to_dict(orient="records") - results = client.geocode(components_data=components_data) + batch["address"] = batch[locality_col] + ", " + batch[state_col] + results = client.geocode(batch["address"].tolist()) results_df = [] for result in results: diff --git a/src/dbcp/transform/helpers.py b/src/dbcp/transform/helpers.py index 76748b37..1251bc63 100644 --- a/src/dbcp/transform/helpers.py +++ b/src/dbcp/transform/helpers.py @@ -1,5 +1,6 @@ """Common transform operations.""" +import logging from dataclasses import dataclass from typing import Any, Dict, List, Optional, Sequence @@ -10,6 +11,8 @@ from dbcp.helpers import add_fips_ids from dbcp.transform import geocodio, google_maps +logger = logging.getLogger(__name__) + UNIX_EPOCH_ORIGIN = pd.Timestamp("01/01/1970") # Excel parser is simplified and will be one day off for dates < 1900/03/01 # The origin is actually 12/31/1899, but because Excel mistakenly thinks @@ -310,7 +313,10 @@ def _geocode_and_add_fips( def add_county_fips_with_backup_geocoding( - state_locality_df: pd.DataFrame, state_col="state", locality_col="county" + state_locality_df: pd.DataFrame, + state_col="state", + locality_col="county", + debug=False, ) -> pd.DataFrame: """Add state and county FIPS codes to a DataFrame with state and locality columns. 
@@ -362,9 +368,38 @@ def add_county_fips_with_backup_geocoding( # geocode the lookup failures - they are often city/town names (instead of counties) or simply mis-spelled nan_fips = with_fips.loc[fips_is_nan, :].copy() - filled_fips = _geocode_and_add_fips( + # Compare google and geocodio results + geocodio_df = _geocode_and_add_fips( nan_fips, state_col=state_col, locality_col=locality_col, api="geocodio" ) + if debug: + google_df = _geocode_and_add_fips( + nan_fips, state_col=state_col, locality_col=locality_col, api="google" + ) + + # combine the two geocoded dataframes + comp = geocodio_df.merge( + google_df, + left_index=True, + right_index=True, + how="outer", + validate="1:1", + suffixes=("_geocodio", "_google"), + ) + + county_eq = comp.geocoded_containing_county_geocodio.eq( + comp.geocoded_containing_county_google + ) + logger.info("---------------------") + logger.info( + f"---- pct of geocoded fip failures that don't match: {(~county_eq).sum() / len(comp)}" + ) + logger.info( + f"---- pct of all records that don't have the same county: {(~county_eq).sum() / len(state_locality_df)}" + ) + logger.info("---------------------") + + filled_fips = geocodio_df # recombine and restore row order recombined = pd.concat([good_fips, filled_fips], axis=0).loc[ diff --git a/test/unit/test_geocoding.py b/test/unit/test_geocoding.py index beec8703..e1ac4ad3 100644 --- a/test/unit/test_geocoding.py +++ b/test/unit/test_geocoding.py @@ -1,6 +1,8 @@ """Test suite for dbcp.transform.google_maps module.""" +import pandas as pd import pytest +import dbcp.transform.geocodio as geocodio from dbcp.transform.google_maps import GoogleGeocoder @@ -294,3 +296,81 @@ def test_GoogleGeocoder_init_and_properties(): full = GoogleGeocoder() full._response = mock_geocoder_town_and_county()._response assert full.locality_name == "Westport" + + +@pytest.mark.parametrize( + "raw_localities, expected", + [ + pytest.param( + {"state": "ny", "county": "richmond-nj"}, + { + "geocoded_locality_name": "Richmond County", + "geocoded_locality_type": "county", + "geocoded_containing_county": "Richmond County", + }, + ), + ( + {"state": "ny", "county": "renssalear"}, + { + "geocoded_locality_name": "Rensselaer", + "geocoded_locality_type": "city", + "geocoded_containing_county": "Rensselaer County", + }, + ), + ( + {"state": "me", "county": "fairfield"}, + { + "geocoded_locality_name": "Fairfield", + "geocoded_locality_type": "city", + "geocoded_containing_county": "Somerset County", + }, + ), + pytest.param( + {"state": "nc", "county": "northhampton"}, + { + "geocoded_locality_name": "Northampton County", + "geocoded_locality_type": "county", + "geocoded_containing_county": "Northampton County", + }, + ), + pytest.param( + {"state": "co", "county": "rio arriba"}, + { + "geocoded_locality_name": "Rio Arriba County", + "geocoded_locality_type": "county", + "geocoded_containing_county": "Rio Arriba County", + }, + marks=pytest.mark.xfail( + reason="There is an Arriba city in Colorado but there is a Rio Arriba County in NM." 
+ ), + ), + pytest.param( + {"state": "ca", "county": "Sonoma"}, + { + "geocoded_locality_name": "Sonoma", + "geocoded_locality_type": "city", + "geocoded_containing_county": "Sonoma County", + }, + ), + pytest.param( + {"state": "XX", "county": "Random locality name"}, + { + "geocoded_locality_name": "", + "geocoded_locality_type": "", + "geocoded_containing_county": "", + }, + ), + ], +) +def test_geocodio_geocode_locality(raw_localities, expected): + """Test the geocode_locality() method.""" + # create dataframe from raw_localities + df = pd.DataFrame([raw_localities]) + # geocode locality + result = geocodio._geocode_locality.func( + df, state_col="state", locality_col="county" + ) + # create expected dataframe from expected dict + expected_df = pd.DataFrame([expected]) + # test equality + pd.testing.assert_frame_equal(result, expected_df) From 9a0a22f9962d3928bac4dc5462890d47d0b25316 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 22 Jan 2025 16:16:33 -0300 Subject: [PATCH 04/12] Create geocoding cache directories if they don't exist --- src/dbcp/transform/geocodio.py | 12 ++++-------- src/dbcp/transform/google_maps.py | 14 ++++---------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py index b6fd64cd..a72109ac 100644 --- a/src/dbcp/transform/geocodio.py +++ b/src/dbcp/transform/geocodio.py @@ -8,14 +8,10 @@ from joblib import Memory from pydantic import BaseModel -try: # docker path - # 3 directories above current module - geocoder_local_cache = Path("/app/data/geocodio_cache") - assert geocoder_local_cache.exists() -except AssertionError: # local path - # 4 directories above current module - geocoder_local_cache = Path(__file__).resolve().parents[3] / "data/geocodio_cache" - assert geocoder_local_cache.exists() +geocoder_local_cache = Path("/app/data/geocodio_cache") +# create geocoder_local_cache if it doesn't exist +geocoder_local_cache.mkdir(parents=True, exist_ok=True) +assert geocoder_local_cache.exists() # cache needs to be accessed outside this module to call .clear() # limit cache size to 100 KB, keeps most recently accessed first GEOCODER_CACHE = Memory(location=geocoder_local_cache, bytes_limit=2**19) diff --git a/src/dbcp/transform/google_maps.py b/src/dbcp/transform/google_maps.py index c71efe9d..d999701c 100644 --- a/src/dbcp/transform/google_maps.py +++ b/src/dbcp/transform/google_maps.py @@ -13,16 +13,10 @@ logger = getLogger("__name__") -try: # docker path - # 3 directories above current module - geocoder_local_cache = Path("/app/data/google_geocoder_cache") - assert geocoder_local_cache.exists() -except AssertionError: # local path - # 4 directories above current module - geocoder_local_cache = ( - Path(__file__).resolve().parents[3] / "data/google_geocoder_cache" - ) - assert geocoder_local_cache.exists() + +geocoder_local_cache = Path("/app/data/google_geocoder_cache") +geocoder_local_cache.mkdir(parents=True, exist_ok=True) +assert geocoder_local_cache.exists() # cache needs to be accessed outside this module to call .clear() # limit cache size to 100 KB, keeps most recently accessed first GEOCODER_CACHE = Memory(location=geocoder_local_cache, bytes_limit=2**19) From baaa91d6ce4ad38f9834dec303f54d748eb2f5f0 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 22 Jan 2025 16:45:12 -0300 Subject: [PATCH 05/12] Add google maps api key back to github actions --- .github/workflows/test-full-build.yml | 1 + .github/workflows/update-data.yml | 1 + 2 files changed, 2 insertions(+) diff 
--git a/.github/workflows/test-full-build.yml b/.github/workflows/test-full-build.yml index 72c4b08d..8151e941 100644 --- a/.github/workflows/test-full-build.yml +++ b/.github/workflows/test-full-build.yml @@ -14,6 +14,7 @@ jobs: env: GEOCODIO_API_KEY: ${{ secrets.GEOCODIO_API_KEY }} + API_KEY_GOOGLE_MAPS: ${{ secrets.API_KEY_GOOGLE_MAPS }} steps: - name: Check out code diff --git a/.github/workflows/update-data.yml b/.github/workflows/update-data.yml index d1a76989..447585eb 100644 --- a/.github/workflows/update-data.yml +++ b/.github/workflows/update-data.yml @@ -94,6 +94,7 @@ jobs: fail-fast: false env: GEOCODIO_API_KEY: ${{ secrets.GEOCODIO_API_KEY }} + API_KEY_GOOGLE_MAPS: ${{ secrets.API_KEY_GOOGLE_MAPS }} GITHUB_REF: ${{ github.ref_name }} # This is changed to dev if running on a schedule steps: - name: print matrix From 6820ac375b120b857dae3a5e035378ca8039f903 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 22 Jan 2025 17:49:28 -0300 Subject: [PATCH 06/12] Add geocodio exception and rmi comment --- src/dbcp/transform/geocodio.py | 8 +++++++- src/dbcp/transform/rmi_energy_communities.py | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py index a72109ac..067077b4 100644 --- a/src/dbcp/transform/geocodio.py +++ b/src/dbcp/transform/geocodio.py @@ -5,6 +5,7 @@ import pandas as pd from geocodio import GeocodioClient +from geocodio.exceptions import GeocodioAuthError from joblib import Memory from pydantic import BaseModel @@ -65,7 +66,12 @@ def _geocode_batch( dataframe with geocoded locality information """ batch["address"] = batch[locality_col] + ", " + batch[state_col] - results = client.geocode(batch["address"].tolist()) + try: + results = client.geocode(batch["address"].tolist()) + except GeocodioAuthError: + raise GeocodioAuthError( + "Geocodio API key is invalid or you hit the daily geocoding limit which you can change in the Geocodio billing tab." 
+ ) results_df = [] for result in results: diff --git a/src/dbcp/transform/rmi_energy_communities.py b/src/dbcp/transform/rmi_energy_communities.py index 549b7642..decee99d 100644 --- a/src/dbcp/transform/rmi_energy_communities.py +++ b/src/dbcp/transform/rmi_energy_communities.py @@ -19,6 +19,7 @@ def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]: "percent_of_county_coal_qualified": "coal_qualifying_area_fraction", } transformed.rename(columns=rename_dict, inplace=True) + # Some county entries have the state name appended to the county name which confuses the geocoder transformed["raw_county_name"] = transformed.apply( lambda row: row["raw_county_name"].replace(f", {row['raw_state_name']}", ""), axis=1, From e7d7400b2fa20c4ba7955379d632afa4fdc4fb53 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Fri, 24 Jan 2025 13:16:54 -0300 Subject: [PATCH 07/12] Create stronger types for geocodio classes, clean up parsing logic --- src/dbcp/transform/geocodio.py | 95 +++++++++++++++++++++---------- src/dbcp/transform/google_maps.py | 2 +- test/unit/test_geocoding.py | 6 +- 3 files changed, 68 insertions(+), 35 deletions(-) diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py index 067077b4..20e05028 100644 --- a/src/dbcp/transform/geocodio.py +++ b/src/dbcp/transform/geocodio.py @@ -1,54 +1,95 @@ """Geocodio geocoding functions.""" import os +from enum import Enum from pathlib import Path import pandas as pd from geocodio import GeocodioClient from geocodio.exceptions import GeocodioAuthError from joblib import Memory -from pydantic import BaseModel +from pydantic import BaseModel, confloat geocoder_local_cache = Path("/app/data/geocodio_cache") # create geocoder_local_cache if it doesn't exist geocoder_local_cache.mkdir(parents=True, exist_ok=True) assert geocoder_local_cache.exists() # cache needs to be accessed outside this module to call .clear() -# limit cache size to 100 KB, keeps most recently accessed first +# limit cache size to keep most recently accessed first GEOCODER_CACHE = Memory(location=geocoder_local_cache, bytes_limit=2**19) class AddressComponents(BaseModel): """Address components from Geocodio.""" - number: str = "" - predirectional: str = "" - street: str = "" - suffix: str = "" - formatted_street: str = "" - city: str = "" - county: str = "" - state: str = "" - zip: str = "" # noqa: A003 - country: str = "" + number: str | None = None + predirectional: str | None = None + street: str | None = None + suffix: str | None = None + formatted_street: str | None = None + city: str | None = None + county: str | None = None + state: str | None = None + zip: str | None = None # noqa: A003 + country: str | None = None class Location(BaseModel): """Location from Geocodio.""" - lat: float = 0.0 - lng: float = 0.0 + lat: float + lng: float + + +class AccuracyType(str, Enum): + """ + Accuracy types from Geocodio. 
+ + Valid values are documented at https://www.geocod.io/guides/accuracy-types-scores/ + """ + + rooftop = "rooftop" + point = "point" + range_interpolation = "range_interpolation" + nearest_rooftop_match = "nearest_rooftop_match" + intersection = "intersection" + street_center = "street_center" + place = "place" + county = "county" + state = "state" class AddressData(BaseModel): """Address data from Geocodio.""" address_components: AddressComponents - formatted_address: str = "" + formatted_address: str location: Location - accuracy: float = 0.0 - accuracy_type: str = "" - source: str = "" + accuracy: confloat(ge=0, le=1) + accuracy_type: AccuracyType + source: str + + @property + def locality_name(self) -> str: + """Create a locality name based on the accuracy type.""" + if self.accuracy_type == "place": + return self.address_components.city + elif self.accuracy_type == "county": + return self.address_components.county + else: + # We only care about cities and counties. + return None + + @property + def locality_type(self) -> str: + """Geocodio places cities into the generic 'place' type. + + Historically we've only dealt with counties and cities. This function + converts 'place' to 'city' for consistency. + """ + if self.accuracy_type == "place": + return "city" + return self.accuracy_type def _geocode_batch( @@ -75,23 +116,15 @@ def _geocode_batch( results_df = [] for result in results: - if "error" in result: - results_df.append(["", "", ""]) - elif result["results"]: + if result.get("results"): + # The results are always ordered with the most accurate locations first. + # It is therefore always safe to pick the first result in the list. ad = AddressData.parse_obj(result["results"][0]) - locality_type = ad.accuracy_type - if locality_type == "place": - locality_name = ad.address_components.city - locality_type = "city" - elif locality_type == "county": - locality_name = ad.address_components.county - else: - locality_name = "" results_df.append( - [locality_name, locality_type, ad.address_components.county] + [ad.locality_name, ad.locality_type, ad.address_components.county] ) else: - results_df.append(["", "", ""]) + results_df.append([None, None, None]) results_df = pd.DataFrame( results_df, diff --git a/src/dbcp/transform/google_maps.py b/src/dbcp/transform/google_maps.py index d999701c..0e0a62ad 100644 --- a/src/dbcp/transform/google_maps.py +++ b/src/dbcp/transform/google_maps.py @@ -18,7 +18,7 @@ geocoder_local_cache.mkdir(parents=True, exist_ok=True) assert geocoder_local_cache.exists() # cache needs to be accessed outside this module to call .clear() -# limit cache size to 100 KB, keeps most recently accessed first +# limit cache size to keep most recently accessed first GEOCODER_CACHE = Memory(location=geocoder_local_cache, bytes_limit=2**19) diff --git a/test/unit/test_geocoding.py b/test/unit/test_geocoding.py index e1ac4ad3..2b18c51e 100644 --- a/test/unit/test_geocoding.py +++ b/test/unit/test_geocoding.py @@ -355,9 +355,9 @@ def test_GoogleGeocoder_init_and_properties(): pytest.param( {"state": "XX", "county": "Random locality name"}, { - "geocoded_locality_name": "", - "geocoded_locality_type": "", - "geocoded_containing_county": "", + "geocoded_locality_name": None, + "geocoded_locality_type": None, + "geocoded_containing_county": None, }, ), ], From c34d8f4b9db15a160de0b0646c370885a0196eca Mon Sep 17 00:00:00 2001 From: bendnorman Date: Fri, 24 Jan 2025 13:22:08 -0300 Subject: [PATCH 08/12] Clean up variable names in geocodio module --- src/dbcp/transform/geocodio.py 
| 35 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py index 20e05028..ca938141 100644 --- a/src/dbcp/transform/geocodio.py +++ b/src/dbcp/transform/geocodio.py @@ -108,26 +108,27 @@ def _geocode_batch( """ batch["address"] = batch[locality_col] + ", " + batch[state_col] try: - results = client.geocode(batch["address"].tolist()) + responses = client.geocode(batch["address"].tolist()) except GeocodioAuthError: raise GeocodioAuthError( "Geocodio API key is invalid or you hit the daily geocoding limit which you can change in the Geocodio billing tab." ) - results_df = [] - for result in results: - if result.get("results"): + geocoded_localities = [] + for r in responses: + results = r.get("results") + if results: # The results are always ordered with the most accurate locations first. # It is therefore always safe to pick the first result in the list. - ad = AddressData.parse_obj(result["results"][0]) - results_df.append( + ad = AddressData.parse_obj(results[0]) + geocoded_localities.append( [ad.locality_name, ad.locality_type, ad.address_components.county] ) else: - results_df.append([None, None, None]) + geocoded_localities.append([None, None, None]) - results_df = pd.DataFrame( - results_df, + geocoded_localities = pd.DataFrame( + geocoded_localities, columns=[ "geocoded_locality_name", "geocoded_locality_type", @@ -135,12 +136,12 @@ def _geocode_batch( ], index=batch.index, ) - return results_df + return geocoded_localities @GEOCODER_CACHE.cache() def _geocode_locality( - state_locality_df: pd.DataFrame, + localities: pd.DataFrame, state_col: str = "state", locality_col: str = "county", batch_size: int = 100, @@ -148,7 +149,7 @@ def _geocode_locality( """Geocode locality names in a dataframe. 
     Args:
-        state_locality_df: dataframe with state and locality columns
+        localities: dataframe with state and locality columns
         state_col: name of the state column
         locality_col: name of the locality column
         batch_size: number of rows to geocode at once
     Returns:
         dataframe with geocoded locality information
     """
     GEOCODIO_API_KEY = os.environ["GEOCODIO_API_KEY"]
     client = GeocodioClient(GEOCODIO_API_KEY)

-    geocoded_df = []
+    geocoded_results = []

-    for start in range(0, len(state_locality_df), batch_size):
-        batch = state_locality_df.iloc[start : start + batch_size]  # noqa: E203
-        geocoded_df.append(_geocode_batch(batch, client, state_col, locality_col))
-    return pd.concat(geocoded_df)
+    for start in range(0, len(localities), batch_size):
+        batch = localities.iloc[start : start + batch_size]  # noqa: E203
+        geocoded_results.append(_geocode_batch(batch, client, state_col, locality_col))
+    return pd.concat(geocoded_results)

From 8bed81f01b745eeb9fa3a1051b1e7f1956093360 Mon Sep 17 00:00:00 2001
From: bendnorman
Date: Fri, 24 Jan 2025 15:30:14 -0300
Subject: [PATCH 09/12] Set locality type to None if not a city or county

---
 src/dbcp/transform/geocodio.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py
index ca938141..04bdb97c 100644
--- a/src/dbcp/transform/geocodio.py
+++ b/src/dbcp/transform/geocodio.py
@@ -89,7 +89,10 @@ def locality_type(self) -> str:
         """
         if self.accuracy_type == "place":
             return "city"
-        return self.accuracy_type
+        elif self.accuracy_type == "county":
+            return "county"
+        else:
+            return None

From ffc857f41906cf2f46ec58ff11d9c687bb85174e Mon Sep 17 00:00:00 2001
From: bendnorman
Date: Mon, 27 Jan 2025 10:01:41 -0300
Subject: [PATCH 10/12] Add tests for add_county_fips_with_backup_geocoding
 and clean up some duplicate code in the function

---
 docker-compose.yaml           |  4 +--
 src/dbcp/transform/helpers.py | 33 ++++++++++---------
 test/unit/test_geocoding.py   | 61 +++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 18 deletions(-)

diff --git a/docker-compose.yaml b/docker-compose.yaml
index 190de645..4c847710 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -14,7 +14,7 @@ services:
       - ./src/dbcp:/app/dbcp:rw
       - ./notebooks:/app/notebooks:rw
       - ./data:/app/data:rw
-      - ./test:/app/test:ro
+      - ./test:/app/test:rw
       - ${GOOGLE_GHA_CREDS_PATH}:/app/gcloud_application_default_credentials.json:ro
     ports:
       - ${JUPYTER_PORT}:${JUPYTER_PORT}
@@ -27,7 +27,7 @@ services:
     ports:
      - ${POSTGRES_PORT}:5432
    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U youruser"]
+      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5
diff --git a/src/dbcp/transform/helpers.py b/src/dbcp/transform/helpers.py
index 1251bc63..e230c272 100644
--- a/src/dbcp/transform/helpers.py
+++ b/src/dbcp/transform/helpers.py
@@ -289,6 +289,8 @@ def _geocode_and_add_fips(
         state_col=state_col,
         locality_col=locality_col,
     )
+    # convert all columns of deduped_geocoded to dtype string
+    deduped_geocoded = deduped_geocoded.astype("string")

     # recombine deduped geocoded data with original nan_fips
     geocoded_deduped_nan_fips = pd.concat(
         [deduped_nan_fips[key_cols], deduped_geocoded], axis=1
@@ -344,29 +346,28 @@ def add_county_fips_with_backup_geocoding(
     )  # copy
     # first try a simple FIPS lookup and split by valid/invalid fips codes
     # The only purpose of this step is to save API calls on the easy ones (most of them)
-    with_fips = add_fips_ids(
+    add_fips_results = add_fips_ids(
         filled_state_locality,
         state_col=state_col,
         county_col=locality_col,
         vintage=FIPS_CODE_VINTAGE,
     )
-    fips_is_nan = with_fips.loc[:, "county_id_fips"].isna()
-    if not fips_is_nan.any():
-        # standardize output columns
-        with_fips["geocoded_locality_name"] = with_fips[locality_col]
-        with_fips["geocoded_locality_type"] = "county"
-        with_fips["geocoded_containing_county"] = with_fips[locality_col]
-        # attach to original df
-        return pd.concat([state_locality_df, with_fips[cols_to_keep]], axis=1)
-
-    good_fips = with_fips.loc[~fips_is_nan, :].copy()
+    fips_code_is_nan = add_fips_results.loc[:, "county_id_fips"].isna()
+    has_fips_code = add_fips_results.loc[~fips_code_is_nan, :].copy()
     # standardize output columns
-    good_fips["geocoded_locality_name"] = good_fips[locality_col]
-    good_fips["geocoded_locality_type"] = "county"
-    good_fips["geocoded_containing_county"] = good_fips[locality_col]
+    has_fips_code["geocoded_locality_name"] = has_fips_code[locality_col]
+    has_fips_code["geocoded_locality_type"] = "county"
+    has_fips_code["geocoded_locality_type"] = has_fips_code[
+        "geocoded_locality_type"
+    ].astype("string")
+    has_fips_code["geocoded_containing_county"] = has_fips_code[locality_col]
+
+    # if all records have a FIPS code, no need to geocode; combine with the original dataframe
+    if len(has_fips_code) == len(state_locality_df):
+        return pd.concat([state_locality_df, has_fips_code[cols_to_keep]], axis=1)

     # geocode the lookup failures - they are often city/town names (instead of counties) or simply mis-spelled
-    nan_fips = with_fips.loc[fips_is_nan, :].copy()
+    nan_fips = add_fips_results.loc[fips_code_is_nan, :].copy()

     # Compare google and geocodio results
     geocodio_df = _geocode_and_add_fips(
         nan_fips, state_col=state_col, locality_col=locality_col, api="geocodio"
     )
@@ -402,7 +403,7 @@ def add_county_fips_with_backup_geocoding(
     filled_fips = geocodio_df

     # recombine and restore row order
-    recombined = pd.concat([good_fips, filled_fips], axis=0).loc[
+    recombined = pd.concat([has_fips_code, filled_fips], axis=0).loc[
         state_locality_df.index, cols_to_keep
     ]
diff --git a/test/unit/test_geocoding.py b/test/unit/test_geocoding.py
index 2b18c51e..762cb8c8 100644
--- a/test/unit/test_geocoding.py
+++ b/test/unit/test_geocoding.py
@@ -1,9 +1,11 @@
 """Test suite for dbcp.transform.google_maps module."""
+
 import pandas as pd
 import pytest

 import dbcp.transform.geocodio as geocodio
 from dbcp.transform.google_maps import GoogleGeocoder
+from dbcp.transform.helpers import add_county_fips_with_backup_geocoding


 class mock_geocoder(GoogleGeocoder):
@@ -374,3 +376,62 @@ def test_geocodio_geocode_locality(raw_localities, expected):
     expected_df = pd.DataFrame([expected])
     # test equality
     pd.testing.assert_frame_equal(result, expected_df)
+
+
+@pytest.mark.parametrize(
+    "input_data, expected_data",
+    [
+        # Test the dataframe is properly reconstructed when no geocoding is needed
+        pytest.param(
+            {
+                "state": ["NY", "CA"],
+                "county": ["Tompkins", "Alameda"],
+                "metric": [1, 2],
+            },
+            {
+                "state": ["NY", "CA"],
+                "county": ["Tompkins", "Alameda"],
+                "metric": [1, 2],
+                "state_id_fips": ["36", "06"],
+                "county_id_fips": ["36109", "06001"],
+                "geocoded_locality_name": ["Tompkins", "Alameda"],
+                "geocoded_locality_type": ["county", "county"],
+                "geocoded_containing_county": ["Tompkins", "Alameda"],
+            },
+        ),
+        # Test add fips and geocoded records are being combined properly
+        pytest.param(
+            {
+                "state": ["NY", "CA", "NY"],
+                "county": ["Tompkins", "Alameda", "Rchmond"],
+                "metric": [1, 2, 3],
+            },
+            {
+                "state": ["NY", "CA", "NY"],
+                "county": ["Tompkins", "Alameda", "Rchmond"],
+                "metric": [1, 2, 3],
+                "state_id_fips": ["36", "06", "36"],
+                "county_id_fips": ["36109", "06001", "36085"],
+                "geocoded_locality_name": ["Tompkins", "Alameda", "Richmond County"],
+                "geocoded_locality_type": ["county", "county", "county"],
+                "geocoded_containing_county": [
+                    "Tompkins",
+                    "Alameda",
+                    "Richmond County",
+                ],
+            },
+        ),
+    ],
+)
+def test_add_county_fips_with_backup_geocoding(input_data, expected_data):
+    """Test the add_county_fips_with_backup_geocoding() function."""
+    # create dataframe from input_data
+    input_df = pd.DataFrame(input_data).convert_dtypes()
+    # geocode locality
+    result = add_county_fips_with_backup_geocoding(
+        input_df, state_col="state", locality_col="county"
+    )
+    # create expected dataframe from expected_data
+    expected = pd.DataFrame(expected_data).convert_dtypes()
+    # test equality
+    pd.testing.assert_frame_equal(result, expected)

From f0eca901a0ac7bce0fc93cc5fe886ce19c2dea9d Mon Sep 17 00:00:00 2001
From: bendnorman
Date: Mon, 27 Jan 2025 11:57:53 -0300
Subject: [PATCH 11/12] Create DATA_DIR constant that is pulled from env var

---
 .env                                     |  1 +
 default.env                              | 14 ------------
 src/dbcp/constants.py                    | 12 ++++++++++-
 src/dbcp/etl.py                          | 27 ++++++++++++------------
 src/dbcp/extract/local_opposition.py     |  7 +++---
 src/dbcp/helpers.py                      |  3 ++-
 src/dbcp/transform/eip_infrastructure.py |  4 ++--
 src/dbcp/transform/fips_tables.py        |  5 +++--
 src/dbcp/transform/geocodio.py           |  5 +++--
 src/dbcp/transform/google_maps.py        | 10 ++++-----
 10 files changed, 44 insertions(+), 44 deletions(-)
 delete mode 100644 default.env

diff --git a/.env b/.env
index 8dc56d1f..ddf24b41 100644
--- a/.env
+++ b/.env
@@ -8,3 +8,4 @@ JUPYTER_PORT=8890
 PUDL_VERSION=v2024.11.0
 GOOGLE_APPLICATION_CREDENTIALS=/app/gcloud_application_default_credentials.json
 GOOGLE_CLOUD_PROJECT=dbcp-dev-350818
+DATA_DIR=/app/data
diff --git a/default.env b/default.env
deleted file mode 100644
index 6878f299..00000000
--- a/default.env
+++ /dev/null
@@ -1,14 +0,0 @@
-# Make a copy of this file and call it local.env
-# Change the "Local Variables" values below as appropriate.
-# local.env is gitignored to keep credentials secret.
-
-######## Static variables #########
-# No need to customize these
-POSTGRES_USER=postgres
-POSTGRES_PASSWORD=postgres
-POSTGRES_DB=postgres
-PUDL_VERSION=v2024.2.6
-GCP_PROJECT_ID=dbcp-dev-350818
-
-######## Local Variables ##########
-API_KEY_GOOGLE_MAPS= get this value from our google account: https://console.cloud.google.com/google/maps-apis/credentials?project=dbcp-dev&supportedpurview=project
diff --git a/src/dbcp/constants.py b/src/dbcp/constants.py
index 4001d1c8..5c8dae6f 100644
--- a/src/dbcp/constants.py
+++ b/src/dbcp/constants.py
@@ -1,5 +1,6 @@
 """DBCP constants."""

+import os
 from io import StringIO
 from pathlib import Path

@@ -119,4 +120,13 @@
 )
 US_STATES_TERRITORIES = US_STATES.union(US_TERRITORIES)

-OUTPUT_DIR = Path("/app/data/output")
+try:
+    DATA_DIR_ENV_VAR = os.environ["DATA_DIR"]
+except KeyError:
+    raise KeyError(
+        "Please set the DATA_DIR environment variable to the path "
+        "of the data directory.\n"
+        "This is typically set in the .env file."
+ ) +DATA_DIR = Path(DATA_DIR_ENV_VAR) +OUTPUT_DIR = DATA_DIR / "output" diff --git a/src/dbcp/etl.py b/src/dbcp/etl.py index 2503e295..0c3af850 100644 --- a/src/dbcp/etl.py +++ b/src/dbcp/etl.py @@ -1,7 +1,6 @@ """The ETL module create the data warehouse tables.""" import logging -from pathlib import Path from typing import Callable, Dict import pandas as pd @@ -11,7 +10,7 @@ import dbcp from dbcp.archivers.utils import ExtractionSettings -from dbcp.constants import OUTPUT_DIR +from dbcp.constants import DATA_DIR, OUTPUT_DIR from dbcp.extract.fips_tables import CENSUS_URI, TRIBAL_LANDS_URI from dbcp.extract.ncsl_state_permitting import NCSLScraper from dbcp.helpers import enforce_dtypes, psql_insert_copy @@ -25,7 +24,7 @@ def etl_eip_infrastructure() -> Dict[str, pd.DataFrame]: """EIP Infrastructure ETL.""" # Extract - source_path = Path("/app/data/raw/2023.05.24 OGW database.xlsx") + source_path = DATA_DIR / "raw/2023.05.24 OGW database.xlsx" eip_raw_dfs = dbcp.extract.eip_infrastructure.extract(source_path) # Transform @@ -46,9 +45,11 @@ def etl_lbnl_iso_queue() -> Dict[str, pd.DataFrame]: def etl_columbia_local_opp() -> Dict[str, pd.DataFrame]: """Columbia Local Opposition ETL.""" # Extract - source_path = Path( - "/app/data/raw/2023.05.30 Opposition to Renewable Energy Facilities - FINAL.docx" + source_path = ( + DATA_DIR + / "raw/2023.05.30 Opposition to Renewable Energy Facilities - FINAL.docx" ) + extractor = dbcp.extract.local_opposition.ColumbiaDocxParser() extractor.load_docx(source_path) docx_dfs = extractor.extract() @@ -67,7 +68,7 @@ def etl_pudl_tables() -> Dict[str, pd.DataFrame]: def etl_ncsl_state_permitting() -> Dict[str, pd.DataFrame]: """NCSL State Permitting for Wind ETL.""" - source_path = Path("/app/data/raw/ncsl_state_permitting_wind.csv") + source_path = DATA_DIR / "raw/ncsl_state_permitting_wind.csv" if not source_path.exists(): NCSLScraper().scrape_and_save_to_disk(source_path) raw_df = dbcp.extract.ncsl_state_permitting.extract(source_path) @@ -92,7 +93,7 @@ def etl_fips_tables() -> Dict[str, pd.DataFrame]: def etl_justice40() -> dict[str, pd.DataFrame]: """ETL white house environmental justice dataset.""" - source_path = Path("/app/data/raw/1.0-communities.csv") + source_path = DATA_DIR / "raw/1.0-communities.csv" raw = dbcp.extract.justice40.extract(source_path) out = dbcp.transform.justice40.transform(raw) return out @@ -100,8 +101,8 @@ def etl_justice40() -> dict[str, pd.DataFrame]: def etl_nrel_ordinances() -> dict[str, pd.DataFrame]: """ETL NREL state and local ordinances for wind and solar.""" - wind_source_path = Path("/app/data/raw/NREL_Wind_Ordinances.xlsx") - solar_source_path = Path("/app/data/raw/NREL_Solar_Ordinances.xlsx") + wind_source_path = DATA_DIR / "raw/NREL_Wind_Ordinances.xlsx" + solar_source_path = DATA_DIR / "raw/NREL_Solar_Ordinances.xlsx" wind_raw_dfs = dbcp.extract.nrel_wind_solar_ordinances.extract( wind_source_path, wind_or_solar="wind" ) @@ -139,7 +140,7 @@ def etl_offshore_wind() -> dict[str, pd.DataFrame]: def etl_protected_area_by_county() -> dict[str, pd.DataFrame]: """ETL the PAD-US intersection with TIGER county geometries.""" - source_path = Path("/app/data/raw/padus_intersect_counties.parquet") + source_path = DATA_DIR / "raw/padus_intersect_counties.parquet" raw_df = dbcp.extract.protected_area_by_county.extract(source_path) transformed = dbcp.transform.protected_area_by_county.transform(raw_df) return transformed @@ -147,7 +148,7 @@ def etl_protected_area_by_county() -> dict[str, pd.DataFrame]: def 
etl_energy_communities_by_county() -> dict[str, pd.DataFrame]: """ETL RMI's energy communities analysis.""" - source_path = Path("/app/data/raw/rmi_energy_communities_counties.parquet") + source_path = DATA_DIR / "raw/rmi_energy_communities_counties.parquet" raw_df = dbcp.extract.rmi_energy_communities.extract(source_path) transformed = dbcp.transform.rmi_energy_communities.transform(raw_df) return transformed @@ -164,9 +165,9 @@ def etl_ballot_ready() -> dict[str, pd.DataFrame]: def etl_epa_avert() -> dict[str, pd.DataFrame]: """ETL EPA AVERT avoided emissions data.""" # https://github.com/USEPA/AVERT/blob/v4.1.0/utilities/data/county-fips.txt - path_county_region_xwalk = Path("/app/data/raw/avert_county-fips.txt") + path_county_region_xwalk = DATA_DIR / "raw/avert_county-fips.txt" # https://www.epa.gov/avert/avoided-emission-rates-generated-avert - path_emission_rates = Path("/app/data/raw/avert_emission_rates_04-25-23.xlsx") + path_emission_rates = DATA_DIR / "raw/avert_emission_rates_04-25-23.xlsx" raw_dfs = dbcp.extract.epa_avert.extract( county_crosswalk_path=path_county_region_xwalk, emission_rates_path=path_emission_rates, diff --git a/src/dbcp/extract/local_opposition.py b/src/dbcp/extract/local_opposition.py index 446544cf..66a0c745 100644 --- a/src/dbcp/extract/local_opposition.py +++ b/src/dbcp/extract/local_opposition.py @@ -4,13 +4,12 @@ formatting details (paragraph level, font, etc), but is surprisingly consistent. It is infrequently updated by a research group at Columbia University. """ -from pathlib import Path from typing import Dict, List, Optional import docx import pandas as pd -from dbcp.constants import US_STATES +from dbcp.constants import DATA_DIR, US_STATES class ColumbiaDocxParser(object): @@ -63,7 +62,7 @@ def __init__(self) -> None: } def load_docx( - self, source_path=Path("/app/data/raw/RELDI report updated 9.10.21 (1).docx") + self, source_path=DATA_DIR / "raw/RELDI report updated 9.10.21 (1).docx" ) -> None: """Read the .docx file with python-docx. @@ -91,7 +90,7 @@ def _remove_intro( return paragraphs[idx:] raise ValueError("Could not find starting state") - def _parse_values(self, text: str) -> None: + def _parse_values(self, text: str) -> None: # noqa: C901 """Parse and assign values to the correct dataset based on the current hierarchical headings. 
Args: diff --git a/src/dbcp/helpers.py b/src/dbcp/helpers.py index b41e06e7..e45d7b13 100644 --- a/src/dbcp/helpers.py +++ b/src/dbcp/helpers.py @@ -17,6 +17,7 @@ from tqdm import tqdm import dbcp +from dbcp.constants import DATA_DIR logger = logging.getLogger(__name__) @@ -152,7 +153,7 @@ def get_pudl_resource( """ PUDL_VERSION = os.environ["PUDL_VERSION"] - pudl_cache = Path("/app/data/data_cache/pudl/") + pudl_cache = DATA_DIR / "data_cache/pudl/" pudl_cache.mkdir(exist_ok=True) pudl_version_cache = pudl_cache / PUDL_VERSION pudl_version_cache.mkdir(exist_ok=True) diff --git a/src/dbcp/transform/eip_infrastructure.py b/src/dbcp/transform/eip_infrastructure.py index b525406c..6f680d1b 100644 --- a/src/dbcp/transform/eip_infrastructure.py +++ b/src/dbcp/transform/eip_infrastructure.py @@ -5,6 +5,7 @@ import pandas as pd +from dbcp.constants import DATA_DIR from dbcp.transform.helpers import ( add_county_fips_with_backup_geocoding, replace_value_with_count_validation, @@ -447,11 +448,10 @@ def transform(raw_eip_dfs: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]: if __name__ == "__main__": # debugging entry point - from pathlib import Path from dbcp.extract.eip_infrastructure import extract - source_path = Path("/app/data/raw/2023.05.24 OGW database.xlsx") + source_path = DATA_DIR / "raw/2023.05.24 OGW database.xlsx" eip_raw_dfs = extract(source_path) eip_transformed_dfs = transform(eip_raw_dfs) print("yay") diff --git a/src/dbcp/transform/fips_tables.py b/src/dbcp/transform/fips_tables.py index de35319d..87cf5b67 100644 --- a/src/dbcp/transform/fips_tables.py +++ b/src/dbcp/transform/fips_tables.py @@ -1,17 +1,18 @@ """Tranform raw FIPS tables to a database-ready form.""" import logging -from pathlib import Path from typing import Dict, Sequence import geopandas as gpd import pandas as pd from joblib import Memory +from dbcp.constants import DATA_DIR logger = logging.getLogger(__name__) # cache needs to be accessed outside this module to call .clear() # limit cache size to 1 MB, keeps most recently accessed first -SPATIAL_CACHE = Memory(location=Path("/app/data/spatial_cache"), bytes_limit=2**20) +SPATIAL_CACHE = Memory(location=DATA_DIR / "spatial_cache", bytes_limit=2**20) @SPATIAL_CACHE.cache() diff --git a/src/dbcp/transform/geocodio.py b/src/dbcp/transform/geocodio.py index 04bdb97c..a8a7027c 100644 --- a/src/dbcp/transform/geocodio.py +++ b/src/dbcp/transform/geocodio.py @@ -2,7 +2,6 @@ import os from enum import Enum -from pathlib import Path import pandas as pd from geocodio import GeocodioClient @@ -10,7 +9,9 @@ from joblib import Memory from pydantic import BaseModel, confloat -geocoder_local_cache = Path("/app/data/geocodio_cache") +from dbcp.constants import DATA_DIR + +geocoder_local_cache = DATA_DIR / "geocodio_cache" # create geocoder_local_cache if it doesn't exist geocoder_local_cache.mkdir(parents=True, exist_ok=True) assert geocoder_local_cache.exists() diff --git a/src/dbcp/transform/google_maps.py b/src/dbcp/transform/google_maps.py index 0e0a62ad..0d5f36e8 100644 --- a/src/dbcp/transform/google_maps.py +++ b/src/dbcp/transform/google_maps.py @@ -3,7 +3,6 @@ import os from functools import lru_cache from logging import getLogger -from pathlib import Path from typing import Dict, List, Optional from warnings import warn @@ -11,10 +10,12 @@ import pandas as pd from joblib import Memory +from dbcp.constants import DATA_DIR + logger = getLogger("__name__") -geocoder_local_cache = Path("/app/data/google_geocoder_cache") +geocoder_local_cache = DATA_DIR /
"google_geocoder_cache" geocoder_local_cache.mkdir(parents=True, exist_ok=True) assert geocoder_local_cache.exists() # cache needs to be accessed outside this module to call .clear() @@ -41,10 +42,9 @@ def __init__(self, key=None) -> None: key = os.environ["API_KEY_GOOGLE_MAPS"] except ValueError as e: if "google.com" in e.args[0]: - # local.env wasn't updated properly raise ValueError( - "API_KEY_GOOGLE_MAPS must be defined in your local.env file." - " See README.md for instructions." + "API_KEY_GOOGLE_MAPS environment variable not set. " + "See README.md for how to set it." ) else: raise e From 46724d6f5a6f7591ab015373e9bad24a975e84e2 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 29 Jan 2025 09:18:06 -0300 Subject: [PATCH 12/12] Compare FIPS codes instead of county names in geocoding debug --- src/dbcp/transform/helpers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/dbcp/transform/helpers.py b/src/dbcp/transform/helpers.py index e230c272..f78d6f81 100644 --- a/src/dbcp/transform/helpers.py +++ b/src/dbcp/transform/helpers.py @@ -388,9 +388,7 @@ def add_county_fips_with_backup_geocoding( suffixes=("_geocodio", "_google"), ) - county_eq = comp.geocoded_containing_county_geocodio.eq( - comp.geocoded_containing_county_google - ) + county_eq = comp.county_id_fips_geocodio.eq(comp.county_id_fips_google) logger.info("---------------------") logger.info( f"---- pct of geocoded fip failures that don't match: {(~county_eq).sum() / len(comp)}"