WIP data update notebook, add raw data and update extraction

deployment-gap-model-education-fund · Dec 13, 2024 · b2cc3df · b2cc3df
1 parent 31a66cf
commit b2cc3df
Show file tree

Hide file tree

Showing 12 changed files with 30,241 additions and 34 deletions.
diff --git a/data/raw/eip_infrastructure/eip_air_construction_permits_2024-12-13.csv b/data/raw/eip_infrastructure/eip_air_construction_permits_2024-12-13.csv
diff --git a/data/raw/eip_infrastructure/eip_air_construction_project_assn_2024-12-13.csv b/data/raw/eip_infrastructure/eip_air_construction_project_assn_2024-12-13.csv
diff --git a/data/raw/eip_infrastructure/eip_facilities_2024-12-13.csv b/data/raw/eip_infrastructure/eip_facilities_2024-12-13.csv
diff --git a/data/raw/eip_infrastructure/eip_facility_project_assn_2024-12-13.csv b/data/raw/eip_infrastructure/eip_facility_project_assn_2024-12-13.csv
diff --git a/data/raw/eip_infrastructure/eip_projects_2024-12-13.csv b/data/raw/eip_infrastructure/eip_projects_2024-12-13.csv
diff --git a/notebooks/data_updates/eip_infrastructure/eip_air_construction_permits_2024-12-13.csv b/notebooks/data_updates/eip_infrastructure/eip_air_construction_permits_2024-12-13.csv
diff --git a/notebooks/data_updates/eip_infrastructure/eip_air_construction_project_assn_2024-12-13.csv b/notebooks/data_updates/eip_infrastructure/eip_air_construction_project_assn_2024-12-13.csv
diff --git a/notebooks/data_updates/eip_infrastructure/eip_facilities_2024-12-13.csv b/notebooks/data_updates/eip_infrastructure/eip_facilities_2024-12-13.csv
diff --git a/notebooks/data_updates/eip_infrastructure/eip_facility_project_assn_2024-12-13.csv b/notebooks/data_updates/eip_infrastructure/eip_facility_project_assn_2024-12-13.csv
diff --git a/notebooks/data_updates/eip_infrastructure/eip_projects_2024-12-13.csv b/notebooks/data_updates/eip_infrastructure/eip_projects_2024-12-13.csv
diff --git a/notebooks/data_updates/eip_infrastructure/eip_update.ipynb b/notebooks/data_updates/eip_infrastructure/eip_update.ipynb
diff --git a/src/dbcp/extract/eip_infrastructure.py b/src/dbcp/extract/eip_infrastructure.py
@@ -1,9 +1,26 @@
 """Retrieve data from EIP Infrastructure spreadsheets for analysis.
 
-This data was updated by contacting EIP directly for the latest version, but now they
-host an Excel file at oilandgaswatch.org -> Resources -> Downloads, which points to:
-https://drive.google.com/drive/folders/1udtw3XeezA5Lkb8Mfc_cntNTcV4oPuKi
-Note that this new data version has changed structure from the one extracted below.
+This data is accessed through a Xata API hosted by EIP. Each entity (facility, project,
+air construction permit) has its own CSV file, and then there are two additional files
+with IDs linking facilities to projects and projects to air construction permits.
+
+The following datasets are available but not currently downloaded:
+# TODO - update this list! (It was carried over from the old Excel sheet names.)
+# 'Pipelines',
+# 'NGA',
+# 'NAICS',
+# 'CWA-NPDES',
+# 'CWA Wetland',
+# 'Air Operating',
+# 'Glossary',  # useful for data dictionary
+# 'Data Sources',
+# 'Map Layers',
+# 'Other Permits',
+# 'Test Collection',
+# 'Featured Facility Descriptors',
+# 'MARAD',
+# 'TEST',
+# 'Pipeline Digitization',
 """
 from pathlib import Path
 from typing import Dict
@@ -29,44 +46,22 @@ def _downcast_ints(df: pd.DataFrame) -> None:
 
 
 def extract(path: Path) -> Dict[str, pd.DataFrame]:
-    """Read EIP excel database.
+    """Read in EIP CSV files from a provided path to a folder.
 
     Args:
         path (Path): filepath
 
     Returns:
         Dict[str, pd.DataFrame]: output dictionary of dataframes
     """
-    sheets_to_read = [
-        "Facility",
-        # 'Company',
-        "Project",
-        "Air Construction",  # permit status is key to identifying actionable projects
-        # 'Pipelines',
-        # 'NGA',
-        # 'NAICS',
-        # 'CWA-NPDES',
-        # 'CWA Wetland',
-        # 'Air Operating',
-        # 'Glossary',  # useful for data dictionary
-        # 'Data Sources',
-        # 'Map Layers',
-        # 'Other Permits',
-        # 'Test Collection',
-        # 'Featured Facility Descriptors',
-        # 'MARAD',
-        # 'TEST',
-        # 'Pipeline Digitization',
-    ]
-    raw_dfs = pd.read_excel(path, sheet_name=sheets_to_read)
-    rename_dict = {
-        "Facility": "eip_facilities",
-        "Project": "eip_projects",
-        "Air Construction": "eip_air_constr_permits",
-    }
-    raw_dfs = {rename_dict[key]: df for key, df in raw_dfs.items()}
-    for df in raw_dfs.values():
+    files = Path(path).glob("*.csv")  # Get all CSV files in folder
+    raw_dfs = {}
+
+    for file in files:
+        df = pd.read_csv(file)
         _convert_object_to_string_dtypes(df)
         _downcast_ints(df)
+        # Key: file name minus its date suffix (e.g. eip_air_construction_permits)
+        raw_dfs[file.name.rsplit("_", 1)[0]] = df
 
     return raw_dfs