Create DATA_DIR constant that is pulled from env var
bendnorman committed Jan 27, 2025
1 parent ffc857f commit f0eca90
Showing 10 changed files with 44 additions and 44 deletions.
1 change: 1 addition & 0 deletions .env
@@ -8,3 +8,4 @@ JUPYTER_PORT=8890
 PUDL_VERSION=v2024.11.0
 GOOGLE_APPLICATION_CREDENTIALS=/app/gcloud_application_default_credentials.json
 GOOGLE_CLOUD_PROJECT=dbcp-dev-350818
+DATA_DIR=/app/data
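
Note: inside the container these values presumably arrive via docker compose's env_file; when running code outside Docker, the same file can be loaded with python-dotenv. A minimal sketch (an assumption for illustration, not part of this commit):

```python
# Hypothetical local-development loader; assumes the python-dotenv package.
import os

from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from ./.env into os.environ
print(os.environ["DATA_DIR"])  # -> /app/data
```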
14 changes: 0 additions & 14 deletions default.env

This file was deleted.

12 changes: 11 additions & 1 deletion src/dbcp/constants.py
@@ -1,5 +1,6 @@
"""DBCP constants."""

import os
from io import StringIO
from pathlib import Path

@@ -119,4 +120,13 @@
 )
 US_STATES_TERRITORIES = US_STATES.union(US_TERRITORIES)
 
-OUTPUT_DIR = Path("/app/data/output")
+try:
+    DATA_DIR_ENV_VAR = os.environ["DATA_DIR"]
+except KeyError:
+    raise KeyError(
+        "Please set the DATA_DIR environment variable to the path "
+        "of the data directory.\n"
+        "This is typically set in the .env file."
+    )
+DATA_DIR = Path(DATA_DIR_ENV_VAR)
+OUTPUT_DIR = DATA_DIR / "output"
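
Because the lookup happens at import time, DATA_DIR must be set before dbcp.constants is first imported; rebinding the environment variable afterwards does not affect the already-bound constant. A sketch of that behavior (illustrative, assumes the dbcp package is importable; not part of the commit):

```python
# Illustrative: the env var is read once, when dbcp.constants is imported.
import os

os.environ["DATA_DIR"] = "/tmp/dbcp-data"  # must precede the first import

from dbcp.constants import DATA_DIR, OUTPUT_DIR  # noqa: E402

assert str(DATA_DIR) == "/tmp/dbcp-data"
assert OUTPUT_DIR == DATA_DIR / "output"

os.environ["DATA_DIR"] = "/elsewhere"  # too late; the constant keeps its value
assert str(DATA_DIR) == "/tmp/dbcp-data"
```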
27 changes: 14 additions & 13 deletions src/dbcp/etl.py
@@ -1,7 +1,6 @@
"""The ETL module create the data warehouse tables."""

import logging
from pathlib import Path
from typing import Callable, Dict

import pandas as pd
@@ -11,7 +10,7 @@
 
 import dbcp
 from dbcp.archivers.utils import ExtractionSettings
-from dbcp.constants import OUTPUT_DIR
+from dbcp.constants import DATA_DIR, OUTPUT_DIR
 from dbcp.extract.fips_tables import CENSUS_URI, TRIBAL_LANDS_URI
 from dbcp.extract.ncsl_state_permitting import NCSLScraper
 from dbcp.helpers import enforce_dtypes, psql_insert_copy
@@ -25,7 +24,7 @@
 def etl_eip_infrastructure() -> Dict[str, pd.DataFrame]:
     """EIP Infrastructure ETL."""
     # Extract
-    source_path = Path("/app/data/raw/2023.05.24 OGW database.xlsx")
+    source_path = DATA_DIR / "raw/2023.05.24 OGW database.xlsx"
     eip_raw_dfs = dbcp.extract.eip_infrastructure.extract(source_path)
 
     # Transform
@@ -46,9 +45,11 @@ def etl_lbnl_iso_queue() -> Dict[str, pd.DataFrame]:
 def etl_columbia_local_opp() -> Dict[str, pd.DataFrame]:
     """Columbia Local Opposition ETL."""
     # Extract
-    source_path = Path(
-        "/app/data/raw/2023.05.30 Opposition to Renewable Energy Facilities - FINAL.docx"
+    source_path = (
+        DATA_DIR
+        / "raw/2023.05.30 Opposition to Renewable Energy Facilities - FINAL.docx"
     )
+
     extractor = dbcp.extract.local_opposition.ColumbiaDocxParser()
     extractor.load_docx(source_path)
     docx_dfs = extractor.extract()
@@ -67,7 +68,7 @@ def etl_pudl_tables() -> Dict[str, pd.DataFrame]:
 
 def etl_ncsl_state_permitting() -> Dict[str, pd.DataFrame]:
     """NCSL State Permitting for Wind ETL."""
-    source_path = Path("/app/data/raw/ncsl_state_permitting_wind.csv")
+    source_path = DATA_DIR / "raw/ncsl_state_permitting_wind.csv"
     if not source_path.exists():
         NCSLScraper().scrape_and_save_to_disk(source_path)
     raw_df = dbcp.extract.ncsl_state_permitting.extract(source_path)
@@ -92,16 +93,16 @@ def etl_fips_tables() -> Dict[str, pd.DataFrame]:
 
 def etl_justice40() -> dict[str, pd.DataFrame]:
     """ETL white house environmental justice dataset."""
-    source_path = Path("/app/data/raw/1.0-communities.csv")
+    source_path = DATA_DIR / "raw/1.0-communities.csv"
     raw = dbcp.extract.justice40.extract(source_path)
     out = dbcp.transform.justice40.transform(raw)
     return out
 
 
 def etl_nrel_ordinances() -> dict[str, pd.DataFrame]:
     """ETL NREL state and local ordinances for wind and solar."""
-    wind_source_path = Path("/app/data/raw/NREL_Wind_Ordinances.xlsx")
-    solar_source_path = Path("/app/data/raw/NREL_Solar_Ordinances.xlsx")
+    wind_source_path = DATA_DIR / "raw/NREL_Wind_Ordinances.xlsx"
+    solar_source_path = DATA_DIR / "raw/NREL_Solar_Ordinances.xlsx"
     wind_raw_dfs = dbcp.extract.nrel_wind_solar_ordinances.extract(
         wind_source_path, wind_or_solar="wind"
     )
@@ -139,15 +140,15 @@ def etl_offshore_wind() -> dict[str, pd.DataFrame]:
 
 def etl_protected_area_by_county() -> dict[str, pd.DataFrame]:
     """ETL the PAD-US intersection with TIGER county geometries."""
-    source_path = Path("/app/data/raw/padus_intersect_counties.parquet")
+    source_path = DATA_DIR / "raw/padus_intersect_counties.parquet"
     raw_df = dbcp.extract.protected_area_by_county.extract(source_path)
     transformed = dbcp.transform.protected_area_by_county.transform(raw_df)
     return transformed
 
 
 def etl_energy_communities_by_county() -> dict[str, pd.DataFrame]:
     """ETL RMI's energy communities analysis."""
-    source_path = Path("/app/data/raw/rmi_energy_communities_counties.parquet")
+    source_path = DATA_DIR / "raw/rmi_energy_communities_counties.parquet"
     raw_df = dbcp.extract.rmi_energy_communities.extract(source_path)
     transformed = dbcp.transform.rmi_energy_communities.transform(raw_df)
     return transformed
@@ -164,9 +165,9 @@ def etl_ballot_ready() -> dict[str, pd.DataFrame]:
 def etl_epa_avert() -> dict[str, pd.DataFrame]:
     """ETL EPA AVERT avoided emissions data."""
     # https://github.com/USEPA/AVERT/blob/v4.1.0/utilities/data/county-fips.txt
-    path_county_region_xwalk = Path("/app/data/raw/avert_county-fips.txt")
+    path_county_region_xwalk = DATA_DIR / "raw/avert_county-fips.txt"
     # https://www.epa.gov/avert/avoided-emission-rates-generated-avert
-    path_emission_rates = Path("/app/data/raw/avert_emission_rates_04-25-23.xlsx")
+    path_emission_rates = DATA_DIR / "raw/avert_emission_rates_04-25-23.xlsx"
     raw_dfs = dbcp.extract.epa_avert.extract(
         county_crosswalk_path=path_county_region_xwalk,
         emission_rates_path=path_emission_rates,
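
The replacements above rely on pathlib's / operator accepting multi-segment strings, so DATA_DIR / "raw/file.csv" is equivalent to joining each segment separately. A quick sketch:

```python
# The two join styles used in this diff produce identical paths.
from pathlib import PurePosixPath

DATA_DIR = PurePosixPath("/app/data")  # stand-in for dbcp.constants.DATA_DIR

assert DATA_DIR / "raw/1.0-communities.csv" == DATA_DIR / "raw" / "1.0-communities.csv"
assert str(DATA_DIR / "raw/1.0-communities.csv") == "/app/data/raw/1.0-communities.csv"
```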
7 changes: 3 additions & 4 deletions src/dbcp/extract/local_opposition.py
@@ -4,13 +4,12 @@
 formatting details (paragraph level, font, etc), but is surprisingly consistent. It is
 infrequently updated by a research group at Columbia University.
 """
-from pathlib import Path
 from typing import Dict, List, Optional
 
 import docx
 import pandas as pd
 
-from dbcp.constants import US_STATES
+from dbcp.constants import DATA_DIR, US_STATES
 
 
 class ColumbiaDocxParser(object):
@@ -63,7 +62,7 @@ def __init__(self) -> None:
         }
 
     def load_docx(
-        self, source_path=Path("/app/data/raw/RELDI report updated 9.10.21 (1).docx")
+        self, source_path=DATA_DIR / "raw/RELDI report updated 9.10.21 (1).docx"
     ) -> None:
         """Read the .docx file with python-docx.
 
@@ -91,7 +90,7 @@ def _remove_intro(
                 return paragraphs[idx:]
         raise ValueError("Could not find starting state")
 
-    def _parse_values(self, text: str) -> None:
+    def _parse_values(self, text: str) -> None:  # noqa: C901
        """Parse and assign values to the correct dataset based on the current hierarchical headings.
 
         Args:
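
Worth noting for the load_docx signature above: Python evaluates default argument values once, at definition time, so the DATA_DIR / ... default is fixed when the module is imported (harmless here, since DATA_DIR is a constant). A sketch:

```python
# Defaults bind at definition time, not at call time.
from pathlib import PurePosixPath

DATA_DIR = PurePosixPath("/app/data")

def load_docx(source_path=DATA_DIR / "raw/report.docx") -> None:
    print(source_path)

DATA_DIR = PurePosixPath("/elsewhere")  # rebinding the name changes nothing
load_docx()  # still prints /app/data/raw/report.docx
```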
3 changes: 2 additions & 1 deletion src/dbcp/helpers.py
@@ -17,6 +17,7 @@
 from tqdm import tqdm
 
 import dbcp
+from dbcp.constants import DATA_DIR
 
 logger = logging.getLogger(__name__)
 
@@ -152,7 +153,7 @@ def get_pudl_resource(
     """
     PUDL_VERSION = os.environ["PUDL_VERSION"]
 
-    pudl_cache = Path("/app/data/data_cache/pudl/")
+    pudl_cache = DATA_DIR / "data_cache/pudl/"
     pudl_cache.mkdir(exist_ok=True)
     pudl_version_cache = pudl_cache / PUDL_VERSION
     pudl_version_cache.mkdir(exist_ok=True)
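
One observation on the hunk above (pre-existing behavior, not introduced by this commit): Path.mkdir(exist_ok=True) still raises FileNotFoundError if an intermediate directory such as data_cache/ is missing; parents=True creates the whole chain, as the geocoder caches below already do. A sketch:

```python
# mkdir with and without parents=True, assuming a fresh DATA_DIR.
from pathlib import Path

DATA_DIR = Path("/tmp/dbcp-demo")  # stand-in for dbcp.constants.DATA_DIR
pudl_cache = DATA_DIR / "data_cache/pudl"

try:
    pudl_cache.mkdir(exist_ok=True)  # fails if data_cache/ does not exist yet
except FileNotFoundError:
    pudl_cache.mkdir(parents=True, exist_ok=True)  # creates the full chain
```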
4 changes: 2 additions & 2 deletions src/dbcp/transform/eip_infrastructure.py
@@ -5,6 +5,7 @@
 
 import pandas as pd
 
+from dbcp.constants import DATA_DIR
 from dbcp.transform.helpers import (
     add_county_fips_with_backup_geocoding,
     replace_value_with_count_validation,
@@ -447,11 +448,10 @@ def transform(raw_eip_dfs: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
 
 if __name__ == "__main__":
     # debugging entry point
-    from pathlib import Path
 
     from dbcp.extract.eip_infrastructure import extract
 
-    source_path = Path("/app/data/raw/2023.05.24 OGW database.xlsx")
+    source_path = DATA_DIR / "raw/2023.05.24 OGW database.xlsx"
     eip_raw_dfs = extract(source_path)
     eip_transformed_dfs = transform(eip_raw_dfs)
     print("yay")
5 changes: 3 additions & 2 deletions src/dbcp/transform/fips_tables.py
@@ -1,17 +1,18 @@
 """Transform raw FIPS tables to a database-ready form."""
 import logging
-from pathlib import Path
 from typing import Dict, Sequence
 
 import geopandas as gpd
 import pandas as pd
 from joblib import Memory
 
+from dbcp.constants import DATA_DIR
+
 logger = logging.getLogger(__name__)
 
 # cache needs to be accessed outside this module to call .clear()
 # limit cache size to 1 MB, keeps most recently accessed first
-SPATIAL_CACHE = Memory(location=Path("/app/data/spatial_cache"), bytes_limit=2**20)
+SPATIAL_CACHE = Memory(location=DATA_DIR / "spatial_cache", bytes_limit=2**20)
 
 
 @SPATIAL_CACHE.cache()
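
For context on SPATIAL_CACHE (standard joblib behavior, not new code in this commit): Memory memoizes a function's return values on disk, keyed by its arguments, so repeated spatial joins are served from the cache directory that now lives under DATA_DIR. A sketch:

```python
# Sketch of the joblib pattern used above; the toy function stands in for an
# expensive spatial operation.
from joblib import Memory

SPATIAL_CACHE = Memory(location="/tmp/spatial_cache")

@SPATIAL_CACHE.cache()
def expensive_join(x: int) -> int:
    print("computing...")  # runs only on a cache miss
    return x * 2

expensive_join(21)  # computed and persisted under /tmp/spatial_cache
expensive_join(21)  # served from the on-disk cache
SPATIAL_CACHE.clear()  # the module-level handle lets callers clear the cache
```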
5 changes: 3 additions & 2 deletions src/dbcp/transform/geocodio.py
@@ -2,15 +2,16 @@
 
 import os
 from enum import Enum
-from pathlib import Path
 
 import pandas as pd
 from geocodio import GeocodioClient
 from geocodio.exceptions import GeocodioAuthError
 from joblib import Memory
 from pydantic import BaseModel, confloat
 
-geocoder_local_cache = Path("/app/data/geocodio_cache")
+from dbcp.constants import DATA_DIR
+
+geocoder_local_cache = DATA_DIR / "geocodio_cache"
 # create geocoder_local_cache if it doesn't exist
 geocoder_local_cache.mkdir(parents=True, exist_ok=True)
 assert geocoder_local_cache.exists()
10 changes: 5 additions & 5 deletions src/dbcp/transform/google_maps.py
@@ -3,18 +3,19 @@
 import os
 from functools import lru_cache
 from logging import getLogger
-from pathlib import Path
 from typing import Dict, List, Optional
 from warnings import warn
 
 import googlemaps
 import pandas as pd
 from joblib import Memory
 
+from dbcp.constants import DATA_DIR
+
 logger = getLogger("__name__")
 
 
-geocoder_local_cache = Path("/app/data/google_geocoder_cache")
+geocoder_local_cache = DATA_DIR / "google_geocoder_cache"
 geocoder_local_cache.mkdir(parents=True, exist_ok=True)
 assert geocoder_local_cache.exists()
 # cache needs to be accessed outside this module to call .clear()
@@ -41,10 +42,9 @@ def __init__(self, key=None) -> None:
             key = os.environ["API_KEY_GOOGLE_MAPS"]
         except ValueError as e:
             if "google.com" in e.args[0]:
-                # local.env wasn't updated properly
                 raise ValueError(
-                    "API_KEY_GOOGLE_MAPS must be defined in your local.env file."
-                    " See README.md for instructions."
+                    "API_KEY_GOOGLE_MAPS environment variable not set. "
+                    "See README.md for how to set it."
                 )
             else:
                 raise e
