Create DATA_DIR constant that is pulled from env var
bendnorman committed Jan 27, 2025
1 parent ffc857f commit f0eca90
Showing 10 changed files with 44 additions and 44 deletions.
1 change: 1 addition & 0 deletions .env
@@ -8,3 +8,4 @@ JUPYTER_PORT=8890
 PUDL_VERSION=v2024.11.0
 GOOGLE_APPLICATION_CREDENTIALS=/app/gcloud_application_default_credentials.json
 GOOGLE_CLOUD_PROJECT=dbcp-dev-350818
+DATA_DIR=/app/data
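
Note: inside the container these values presumably arrive via docker compose's env_file; when running code outside Docker, the same file can be loaded with python-dotenv. A minimal sketch (an assumption for illustration, not part of this commit):

```python
# Hypothetical local-development loader; assumes the python-dotenv package.
import os

from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from ./.env into os.environ
print(os.environ["DATA_DIR"])  # -> /app/data
```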
14 changes: 0 additions & 14 deletions default.env

This file was deleted.

12 changes: 11 additions & 1 deletion src/dbcp/constants.py
@@ -1,5 +1,6 @@
"""DBCP constants."""

import os
from io import StringIO
from pathlib import Path

@@ -119,4 +120,13 @@
 )
 US_STATES_TERRITORIES = US_STATES.union(US_TERRITORIES)
 
-OUTPUT_DIR = Path("/app/data/output")
+try:
+    DATA_DIR_ENV_VAR = os.environ["DATA_DIR"]
+except KeyError:
+    raise KeyError(
+        "Please set the DATA_DIR environment variable to the path "
+        "of the data directory.\n"
+        "This is typically set in the .env file."
+    )
+DATA_DIR = Path(DATA_DIR_ENV_VAR)
+OUTPUT_DIR = DATA_DIR / "output"
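
Because the lookup happens at import time, DATA_DIR must be set before dbcp.constants is first imported; rebinding the environment variable afterwards does not affect the already-bound constant. A sketch of that behavior (illustrative, assumes the dbcp package is importable; not part of the commit):

```python
# Illustrative: the env var is read once, when dbcp.constants is imported.
import os

os.environ["DATA_DIR"] = "/tmp/dbcp-data"  # must precede the first import

from dbcp.constants import DATA_DIR, OUTPUT_DIR  # noqa: E402

assert str(DATA_DIR) == "/tmp/dbcp-data"
assert OUTPUT_DIR == DATA_DIR / "output"

os.environ["DATA_DIR"] = "/elsewhere"  # too late; the constant keeps its value
assert str(DATA_DIR) == "/tmp/dbcp-data"
```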
27 changes: 14 additions & 13 deletions src/dbcp/etl.py
@@ -1,7 +1,6 @@
"""The ETL module create the data warehouse tables."""

import logging
from pathlib import Path
from typing import Callable, Dict

import pandas as pd
@@ -11,7 +10,7 @@
 
 import dbcp
 from dbcp.archivers.utils import ExtractionSettings
-from dbcp.constants import OUTPUT_DIR
+from dbcp.constants import DATA_DIR, OUTPUT_DIR
 from dbcp.extract.fips_tables import CENSUS_URI, TRIBAL_LANDS_URI
 from dbcp.extract.ncsl_state_permitting import NCSLScraper
 from dbcp.helpers import enforce_dtypes, psql_insert_copy
@@ -25,7 +24,7 @@
 def etl_eip_infrastructure() -> Dict[str, pd.DataFrame]:
     """EIP Infrastructure ETL."""
     # Extract
-    source_path = Path("/app/data/raw/2023.05.24 OGW database.xlsx")
+    source_path = DATA_DIR / "raw/2023.05.24 OGW database.xlsx"
     eip_raw_dfs = dbcp.extract.eip_infrastructure.extract(source_path)
 
     # Transform
@@ -46,9 +45,11 @@ def etl_lbnl_iso_queue() -> Dict[str, pd.DataFrame]:
 def etl_columbia_local_opp() -> Dict[str, pd.DataFrame]:
     """Columbia Local Opposition ETL."""
     # Extract
-    source_path = Path(
-        "/app/data/raw/2023.05.30 Opposition to Renewable Energy Facilities - FINAL.docx"
+    source_path = (
+        DATA_DIR
+        / "raw/2023.05.30 Opposition to Renewable Energy Facilities - FINAL.docx"
     )
+
     extractor = dbcp.extract.local_opposition.ColumbiaDocxParser()
     extractor.load_docx(source_path)
     docx_dfs = extractor.extract()
@@ -67,7 +68,7 @@ def etl_pudl_tables() -> Dict[str, pd.DataFrame]:
 
 def etl_ncsl_state_permitting() -> Dict[str, pd.DataFrame]:
     """NCSL State Permitting for Wind ETL."""
-    source_path = Path("/app/data/raw/ncsl_state_permitting_wind.csv")
+    source_path = DATA_DIR / "raw/ncsl_state_permitting_wind.csv"
     if not source_path.exists():
         NCSLScraper().scrape_and_save_to_disk(source_path)
     raw_df = dbcp.extract.ncsl_state_permitting.extract(source_path)
@@ -92,16 +93,16 @@ def etl_fips_tables() -> Dict[str, pd.DataFrame]:
 
 def etl_justice40() -> dict[str, pd.DataFrame]:
     """ETL white house environmental justice dataset."""
-    source_path = Path("/app/data/raw/1.0-communities.csv")
+    source_path = DATA_DIR / "raw/1.0-communities.csv"
     raw = dbcp.extract.justice40.extract(source_path)
     out = dbcp.transform.justice40.transform(raw)
     return out
 
 
 def etl_nrel_ordinances() -> dict[str, pd.DataFrame]:
     """ETL NREL state and local ordinances for wind and solar."""
-    wind_source_path = Path("/app/data/raw/NREL_Wind_Ordinances.xlsx")
-    solar_source_path = Path("/app/data/raw/NREL_Solar_Ordinances.xlsx")
+    wind_source_path = DATA_DIR / "raw/NREL_Wind_Ordinances.xlsx"
+    solar_source_path = DATA_DIR / "raw/NREL_Solar_Ordinances.xlsx"
     wind_raw_dfs = dbcp.extract.nrel_wind_solar_ordinances.extract(
         wind_source_path, wind_or_solar="wind"
     )
@@ -139,15 +140,15 @@ def etl_offshore_wind() -> dict[str, pd.DataFrame]:
 
 def etl_protected_area_by_county() -> dict[str, pd.DataFrame]:
     """ETL the PAD-US intersection with TIGER county geometries."""
-    source_path = Path("/app/data/raw/padus_intersect_counties.parquet")
+    source_path = DATA_DIR / "raw/padus_intersect_counties.parquet"
     raw_df = dbcp.extract.protected_area_by_county.extract(source_path)
     transformed = dbcp.transform.protected_area_by_county.transform(raw_df)
     return transformed
 
 
 def etl_energy_communities_by_county() -> dict[str, pd.DataFrame]:
     """ETL RMI's energy communities analysis."""
-    source_path = Path("/app/data/raw/rmi_energy_communities_counties.parquet")
+    source_path = DATA_DIR / "raw/rmi_energy_communities_counties.parquet"
     raw_df = dbcp.extract.rmi_energy_communities.extract(source_path)
     transformed = dbcp.transform.rmi_energy_communities.transform(raw_df)
     return transformed
@@ -164,9 +165,9 @@ def etl_ballot_ready() -> dict[str, pd.DataFrame]:
 def etl_epa_avert() -> dict[str, pd.DataFrame]:
     """ETL EPA AVERT avoided emissions data."""
     # https://github.com/USEPA/AVERT/blob/v4.1.0/utilities/data/county-fips.txt
-    path_county_region_xwalk = Path("/app/data/raw/avert_county-fips.txt")
+    path_county_region_xwalk = DATA_DIR / "raw/avert_county-fips.txt"
     # https://www.epa.gov/avert/avoided-emission-rates-generated-avert
-    path_emission_rates = Path("/app/data/raw/avert_emission_rates_04-25-23.xlsx")
+    path_emission_rates = DATA_DIR / "raw/avert_emission_rates_04-25-23.xlsx"
     raw_dfs = dbcp.extract.epa_avert.extract(
         county_crosswalk_path=path_county_region_xwalk,
         emission_rates_path=path_emission_rates,
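
The replacements above rely on pathlib's / operator accepting multi-segment strings, so DATA_DIR / "raw/file.csv" is equivalent to joining each segment separately. A quick sketch:

```python
# The two join styles used in this diff produce identical paths.
from pathlib import PurePosixPath

DATA_DIR = PurePosixPath("/app/data")  # stand-in for dbcp.constants.DATA_DIR

assert DATA_DIR / "raw/1.0-communities.csv" == DATA_DIR / "raw" / "1.0-communities.csv"
assert str(DATA_DIR / "raw/1.0-communities.csv") == "/app/data/raw/1.0-communities.csv"
```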
7 changes: 3 additions & 4 deletions src/dbcp/extract/local_opposition.py
@@ -4,13 +4,12 @@
 formatting details (paragraph level, font, etc), but is surprisingly consistent. It is
 infrequently updated by a research group at Columbia University.
 """
-from pathlib import Path
 from typing import Dict, List, Optional
 
 import docx
 import pandas as pd
 
-from dbcp.constants import US_STATES
+from dbcp.constants import DATA_DIR, US_STATES
 
 
 class ColumbiaDocxParser(object):
@@ -63,7 +62,7 @@ def __init__(self) -> None:
         }
 
     def load_docx(
-        self, source_path=Path("/app/data/raw/RELDI report updated 9.10.21 (1).docx")
+        self, source_path=DATA_DIR / "raw/RELDI report updated 9.10.21 (1).docx"
     ) -> None:
         """Read the .docx file with python-docx.
 
@@ -91,7 +90,7 @@ def _remove_intro(
                 return paragraphs[idx:]
         raise ValueError("Could not find starting state")
 
-    def _parse_values(self, text: str) -> None:
+    def _parse_values(self, text: str) -> None:  # noqa: C901
        """Parse and assign values to the correct dataset based on the current hierarchical headings.
 
         Args:
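
Worth noting for the load_docx signature above: Python evaluates default argument values once, at definition time, so the DATA_DIR / ... default is fixed when the module is imported (harmless here, since DATA_DIR is a constant). A sketch:

```python
# Defaults bind at definition time, not at call time.
from pathlib import PurePosixPath

DATA_DIR = PurePosixPath("/app/data")

def load_docx(source_path=DATA_DIR / "raw/report.docx") -> None:
    print(source_path)

DATA_DIR = PurePosixPath("/elsewhere")  # rebinding the name changes nothing
load_docx()  # still prints /app/data/raw/report.docx
```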
3 changes: 2 additions & 1 deletion src/dbcp/helpers.py
@@ -17,6 +17,7 @@
 from tqdm import tqdm
 
 import dbcp
+from dbcp.constants import DATA_DIR
 
 logger = logging.getLogger(__name__)
 
@@ -152,7 +153,7 @@ def get_pudl_resource(
     """
     PUDL_VERSION = os.environ["PUDL_VERSION"]
 
-    pudl_cache = Path("/app/data/data_cache/pudl/")
+    pudl_cache = DATA_DIR / "data_cache/pudl/"
     pudl_cache.mkdir(exist_ok=True)
     pudl_version_cache = pudl_cache / PUDL_VERSION
     pudl_version_cache.mkdir(exist_ok=True)
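
One observation on the hunk above (pre-existing behavior, not introduced by this commit): Path.mkdir(exist_ok=True) still raises FileNotFoundError if an intermediate directory such as data_cache/ is missing; parents=True creates the whole chain, as the geocoder caches below already do. A sketch:

```python
# mkdir with and without parents=True, assuming a fresh DATA_DIR.
from pathlib import Path

DATA_DIR = Path("/tmp/dbcp-demo")  # stand-in for dbcp.constants.DATA_DIR
pudl_cache = DATA_DIR / "data_cache/pudl"

try:
    pudl_cache.mkdir(exist_ok=True)  # fails if data_cache/ does not exist yet
except FileNotFoundError:
    pudl_cache.mkdir(parents=True, exist_ok=True)  # creates the full chain
```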
4 changes: 2 additions & 2 deletions src/dbcp/transform/eip_infrastructure.py
@@ -5,6 +5,7 @@
 
 import pandas as pd
 
+from dbcp.constants import DATA_DIR
 from dbcp.transform.helpers import (
     add_county_fips_with_backup_geocoding,
     replace_value_with_count_validation,
@@ -447,11 +448,10 @@ def transform(raw_eip_dfs: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
 
 if __name__ == "__main__":
     # debugging entry point
-    from pathlib import Path
 
     from dbcp.extract.eip_infrastructure import extract
 
-    source_path = Path("/app/data/raw/2023.05.24 OGW database.xlsx")
+    source_path = DATA_DIR / "raw/2023.05.24 OGW database.xlsx"
     eip_raw_dfs = extract(source_path)
     eip_transformed_dfs = transform(eip_raw_dfs)
     print("yay")
5 changes: 3 additions & 2 deletions src/dbcp/transform/fips_tables.py
@@ -1,17 +1,18 @@
 """Transform raw FIPS tables to a database-ready form."""
 import logging
-from pathlib import Path
 from typing import Dict, Sequence
 
 import geopandas as gpd
 import pandas as pd
 from joblib import Memory
 
+from dbcp.constants import DATA_DIR
+
 logger = logging.getLogger(__name__)
 
 # cache needs to be accessed outside this module to call .clear()
 # limit cache size to 1 MB, keeps most recently accessed first
-SPATIAL_CACHE = Memory(location=Path("/app/data/spatial_cache"), bytes_limit=2**20)
+SPATIAL_CACHE = Memory(location=DATA_DIR / "spatial_cache", bytes_limit=2**20)
 
 
 @SPATIAL_CACHE.cache()
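
For context on SPATIAL_CACHE (standard joblib behavior, not new code in this commit): Memory memoizes a function's return values on disk, keyed by its arguments, so repeated spatial joins are served from the cache directory that now lives under DATA_DIR. A sketch:

```python
# Sketch of the joblib pattern used above; the toy function stands in for an
# expensive spatial operation.
from joblib import Memory

SPATIAL_CACHE = Memory(location="/tmp/spatial_cache")

@SPATIAL_CACHE.cache()
def expensive_join(x: int) -> int:
    print("computing...")  # runs only on a cache miss
    return x * 2

expensive_join(21)  # computed and persisted under /tmp/spatial_cache
expensive_join(21)  # served from the on-disk cache
SPATIAL_CACHE.clear()  # the module-level handle lets callers clear the cache
```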
5 changes: 3 additions & 2 deletions src/dbcp/transform/geocodio.py
@@ -2,15 +2,16 @@
 
 import os
 from enum import Enum
-from pathlib import Path
 
 import pandas as pd
 from geocodio import GeocodioClient
 from geocodio.exceptions import GeocodioAuthError
 from joblib import Memory
 from pydantic import BaseModel, confloat
 
-geocoder_local_cache = Path("/app/data/geocodio_cache")
+from dbcp.constants import DATA_DIR
+
+geocoder_local_cache = DATA_DIR / "geocodio_cache"
 # create geocoder_local_cache if it doesn't exist
 geocoder_local_cache.mkdir(parents=True, exist_ok=True)
 assert geocoder_local_cache.exists()
10 changes: 5 additions & 5 deletions src/dbcp/transform/google_maps.py
@@ -3,18 +3,19 @@
 import os
 from functools import lru_cache
 from logging import getLogger
-from pathlib import Path
 from typing import Dict, List, Optional
 from warnings import warn
 
 import googlemaps
 import pandas as pd
 from joblib import Memory
 
+from dbcp.constants import DATA_DIR
+
 logger = getLogger("__name__")
 
 
-geocoder_local_cache = Path("/app/data/google_geocoder_cache")
+geocoder_local_cache = DATA_DIR / "google_geocoder_cache"
 geocoder_local_cache.mkdir(parents=True, exist_ok=True)
 assert geocoder_local_cache.exists()
 # cache needs to be accessed outside this module to call .clear()
@@ -41,10 +42,9 @@ def __init__(self, key=None) -> None:
             key = os.environ["API_KEY_GOOGLE_MAPS"]
         except ValueError as e:
             if "google.com" in e.args[0]:
-                # local.env wasn't updated properly
                 raise ValueError(
-                    "API_KEY_GOOGLE_MAPS must be defined in your local.env file."
-                    " See README.md for instructions."
+                    "API_KEY_GOOGLE_MAPS environment variable not set. "
+                    "See README.md for how to set it."
                 )
             else:
                 raise e
