catalyst-cooperative · aesharpe · Feb 23, 2023 · Feb 23, 2023 · Feb 25, 2023 · Feb 25, 2023
diff --git a/docs/conf.py b/docs/conf.py
@@ -14,7 +14,7 @@
 
 import pkg_resources
 
-from pudl.metadata.classes import CodeMetadata, DataSource, Package
+from pudl.metadata.classes import CodeMetadata, DataSource, ETLGroup, Package
 from pudl.metadata.codes import CODE_METADATA
 from pudl.metadata.resources import RESOURCE_METADATA
 
@@ -136,18 +136,32 @@ def data_dictionary_metadata_to_rst(app):
     # Sort fields within each resource by name:
     for resource in package.resources:
         resource.schema.fields = sorted(resource.schema.fields, key=lambda x: x.name)
-    package.to_rst(docs_dir=DOCS_DIR, path=DOCS_DIR / "data_dictionaries/pudl_db.rst")
+    package.to_rst(
+        docs_dir=DOCS_DIR,
+        path=DOCS_DIR / "data_dictionaries/pudl_db.rst",
+        template="etl_group.rst.jinja",
+    )
+    package.to_rst(
+        docs_dir=DOCS_DIR,
+        path=DOCS_DIR / "data_dictionaries/pudl_db_fields.rst",
+        template="package.rst.jinja",
+    )
 
 
 def data_sources_metadata_to_rst(app):
     """Export data source metadata to RST for inclusion in the documentation."""
     print("Exporting data source metadata to RST.")
     included_sources = ["eia860", "eia923", "ferc1", "epacems"]
     package = Package.from_resource_ids()
-    extra_etl_groups = {"eia860": ["entity_eia"], "ferc1": ["glue"]}
+    extra_etl_groups = {
+        "eia860": [ETLGroup.from_id("entity_eia")],
+        "ferc1": [ETLGroup.from_id("glue")],
+    }
     for name in included_sources:
         source = DataSource.from_id(name)
-        source_resources = [res for res in package.resources if res.etl_group == name]
+        source_resources = [
+            res for res in package.resources if res.etl_group == ETLGroup.from_id(name)
+        ]
         extra_resources = None
         if name in extra_etl_groups:
             # get resources for this source from extra etl groups
@@ -183,6 +197,7 @@ def static_dfs_to_rst(app):
 def cleanup_rsts(app, exception):
     """Remove generated RST files when the build is finished."""
     (DOCS_DIR / "data_dictionaries/pudl_db.rst").unlink()
+    (DOCS_DIR / "data_dictionaries/pudl_db_fields.rst").unlink()
     (DOCS_DIR / "data_dictionaries/codes_and_labels.rst").unlink()
     (DOCS_DIR / "data_sources/eia860.rst").unlink()
     (DOCS_DIR / "data_sources/eia923.rst").unlink()

diff --git a/docs/data_dictionaries/index.rst b/docs/data_dictionaries/index.rst
@@ -5,7 +5,7 @@ Data Dictionaries
 
 .. toctree::
    :caption: Data Processed & Cleaned by PUDL
-   :maxdepth: 1
+   :maxdepth: 2
 
    pudl_db
 

diff --git a/docs/templates/etl_group.rst.jinja b/docs/templates/etl_group.rst.jinja
@@ -0,0 +1,19 @@
+===============================================================================
+PUDL Data Dictionary
+===============================================================================
+
+The following data tables have been cleaned and transformed by our ETL process.
+They're separated into categories based on their original source and the type of
+content they contain.
+
+{% for group, resources in package.get_resource_by_etl_group().items() %}
+{{ package.get_etl_group()[group].title }}
+-------------------------------------------------------------------------------
+
+{{ package.get_etl_group()[group].description }}
+
+{% for resource in resources %}
+- :ref:`{{ resource.name }}`
+
+{% endfor %}
+{% endfor %}
diff --git a/docs/templates/package.rst.jinja b/docs/templates/package.rst.jinja
@@ -1,11 +1,24 @@
+:orphan:
+
 ===============================================================================
 PUDL Data Dictionary
 ===============================================================================
 
 The following data tables have been cleaned and transformed by our ETL process.
 
-{% for resource in package.resources %}
+{% for group, resources in package.get_resource_by_etl_group().items() %}
+{% for resource in resources %}
+
 .. _{{ resource.name }}:
 
+{{ resource.name }}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+{{ resource.description | wordwrap(78) if resource.description else
+'No table description available.' }}
+`Browse or query this table in Datasette. <https://data.catalyst.coop/pudl/{{ resource.name }}>`__
+
 {% include 'resource.rst.jinja' %}
+
+{% endfor %}
 {% endfor %}
diff --git a/docs/templates/resource.rst.jinja b/docs/templates/resource.rst.jinja
@@ -1,11 +1,3 @@
--------------------------------------------------------------------------------
-{{ resource.name }}
--------------------------------------------------------------------------------
-
-{{ resource.description | wordwrap(78) if resource.description else
-'No table description available.' }}
-`Browse or query this table in Datasette. <https://data.catalyst.coop/pudl/{{ resource.name }}>`__
-
 .. list-table::
   :widths: auto
   :header-rows: 1
@@ -17,4 +9,5 @@
   * - {{ field.name }}
     - {{ field.type }}
     - {{ field.description if field.description else "N/A" }}
+
 {%- endfor %}
diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py
@@ -22,6 +22,7 @@
 from pudl.metadata.constants import (
     CONSTRAINT_DTYPES,
     CONTRIBUTORS,
+    ETL_GROUPS,
     FIELD_DTYPES_PANDAS,
     FIELD_DTYPES_PYARROW,
     FIELD_DTYPES_SQL,
@@ -1021,6 +1022,24 @@ def from_id(cls, x: str) -> "DataSource":
         return cls(**cls.dict_from_id(x))
 
 
+class ETLGroup(Base):
+    """A table category (ETL group) carrying a display title and a description."""
+
+    # name: SnakeCase
+    title: String = None
+    description: String = None
+
+    @staticmethod
+    def dict_from_id(x: str) -> dict:
+        """Return a deep copy of the ``ETL_GROUPS`` entry for ETL group ID ``x``."""
+        return copy.deepcopy(ETL_GROUPS[x])
+
+    @classmethod
+    def from_id(cls, x: str) -> "ETLGroup":
+        """Construct an ETLGroup from its identifier (a key of ``ETL_GROUPS``)."""
+        return cls(**cls.dict_from_id(x))
+
+
 class ResourceHarvest(Base):
     """Resource harvest parameters (`resource.harvest`)."""
 
@@ -1046,7 +1065,8 @@ class Resource(Base):
         >>> fields = [{'name': 'x', 'type': 'year'}, {'name': 'y', 'type': 'string'}]
         >>> fkeys = [{'fields': ['x', 'y'], 'reference': {'resource': 'b', 'fields': ['x', 'y']}}]
         >>> schema = {'fields': fields, 'primary_key': ['x'], 'foreign_keys': fkeys}
-        >>> resource = Resource(name='a', schema=schema)
+        >>> etl_group = ETLGroup(title='a', description='a')
+        >>> resource = Resource(name='a', schema=schema, etl_group=etl_group)
         >>> table = resource.to_sql()
         >>> table.columns.x
         Column('x', Integer(), ForeignKey('b.x'), CheckConstraint(...), table=<a>, primary_key=True, nullable=False)
@@ -1065,7 +1085,8 @@ class Resource(Base):
         >>> resource = Resource(**{
         ...     'name': 'a',
         ...     'harvest': {'harvest': True},
-        ...     'schema': {'fields': fields, 'primary_key': ['id']}
+        ...     'schema': {'fields': fields, 'primary_key': ['id']},
+        ...     'etl_group': ETLGroup(title='a', description='b')
         ... })
         >>> dfs = {
         ...     'a': pd.DataFrame({'id': [1, 1, 2, 2], 'x': [1, 1, 2, 2]}),
@@ -1143,7 +1164,8 @@ class Resource(Base):
         >>> fields = [{'name': 'report_year', 'type': 'year'}]
         >>> resource = Resource(**{
         ...     'name': 'table', 'harvest': {'harvest': True},
-        ...     'schema': {'fields': fields, 'primary_key': ['report_year']}
+        ...     'schema': {'fields': fields, 'primary_key': ['report_year']},
+        ...     'etl_group': ETLGroup(title='a', description='b')
         ... })
         >>> df = pd.DataFrame({'report_date': ['2000-02-02', '2000-03-03']})
         >>> resource.format_df(df)
@@ -1182,23 +1204,7 @@ class Resource(Base):
         "ppe",
         "eia_bulk_elec",
     ] = None
-    etl_group: Literal[
-        "eia860",
-        "eia861",
-        "eia923",
-        "entity_eia",
-        "epacems",
-        "ferc1",
-        "ferc1_disabled",
-        "ferc714",
-        "glue",
-        "outputs",
-        "static_ferc1",
-        "static_eia",
-        "static_eia_disabled",
-        "eia_bulk_elec",
-        "static_pudl",
-    ] = None
+    etl_group: ETLGroup
 
     _check_unique = _validator(
         "contributors", "keywords", "licenses", "sources", fn=_check_unique
@@ -1279,6 +1285,7 @@ def dict_from_id(x: str) -> dict:  # noqa: C901
         # Delete foreign key rules
         if "foreign_key_rules" in schema:
             del schema["foreign_key_rules"]
+        obj["etl_group"] = ETLGroup.from_id(obj["etl_group"])
 
         # Add encoders to columns as appropriate, based on FKs.
         # Foreign key relationships determine the set of codes to use
@@ -1379,7 +1386,8 @@ def match_primary_key(self, names: Iterable[str]) -> dict[str, str] | None:
         Examples:
             >>> fields = [{'name': 'x_year', 'type': 'year'}]
             >>> schema = {'fields': fields, 'primary_key': ['x_year']}
-            >>> resource = Resource(name='r', schema=schema)
+            >>> etl_group = ETLGroup(title='r', description='r')
+            >>> resource = Resource(name='r', schema=schema, etl_group=etl_group)
 
             By default, when :attr:`harvest` .`harvest=False`,
             exact matches are required.
@@ -1691,8 +1699,9 @@ class Package(Base):
         >>> fields = [{'name': 'x', 'type': 'year'}, {'name': 'y', 'type': 'string'}]
         >>> fkey = {'fields': ['x', 'y'], 'reference': {'resource': 'b', 'fields': ['x', 'y']}}
         >>> schema = {'fields': fields, 'primary_key': ['x'], 'foreign_keys': [fkey]}
-        >>> a = Resource(name='a', schema=schema)
-        >>> b = Resource(name='b', schema=Schema(fields=fields, primary_key=['x']))
+        >>> etl_group = ETLGroup(title='a', description='b')
+        >>> a = Resource(name='a', schema=schema, etl_group=etl_group)
+        >>> b = Resource(name='b', schema=Schema(fields=fields, primary_key=['x']), etl_group=etl_group)
         >>> Package(name='ab', resources=[a, b])
         Traceback (most recent call last):
         ValidationError: ...
@@ -1792,17 +1801,34 @@ def from_resource_ids(  # noqa: C901
                 i = len(resources)
                 if len(names) > i:
                     resources += [Resource.dict_from_id(x) for x in names[i:]]
-
         return cls(name="pudl", resources=resources)
 
     def get_resource(self, name: str) -> Resource:
         """Return the resource with the given name if it is in the Package."""
         names = [resource.name for resource in self.resources]
         return self.resources[names.index(name)]
 
-    def to_rst(self, docs_dir: DirectoryPath, path: str) -> None:
+    def get_etl_group(self) -> dict[str, ETLGroup]:
+        """Return an ETLGroup for every key of ``ETL_GROUPS``, keyed by that key."""
+        etl_group_dict = {}
+        for etl_group in ETL_GROUPS.keys():
+            etl_group_dict[etl_group] = ETLGroup.from_id(etl_group)
+        return etl_group_dict
+
+    def get_resource_by_etl_group(self) -> dict[str, list[Resource]]:
+        """Map each ``ETL_GROUPS`` key to the resources whose etl_group matches it."""
+        resource_dict = {}
+        for etl_group in ETL_GROUPS.keys():
+            resource_dict[etl_group] = [
+                resource
+                for resource in self.resources
+                if resource.etl_group == ETLGroup.from_id(etl_group)
+            ]
+        return resource_dict
+
+    def to_rst(self, docs_dir: DirectoryPath, path: str, template: str) -> None:
         """Output to an RST file."""
-        template = _get_jinja_environment(docs_dir).get_template("package.rst.jinja")
+        template = _get_jinja_environment(docs_dir).get_template(template)
         rendered = template.render(package=self)
         if path:
             Path(path).write_text(rendered)

diff --git a/src/pudl/metadata/constants.py b/src/pudl/metadata/constants.py
@@ -153,6 +153,93 @@
 }
 """PUDL Contributors for attribution."""
 
+ETL_GROUPS: dict[str, dict[str, str]] = {
+    "entity_eia": {
+        "title": "EIA Entity Tables",
+        "description": """EIA entity tables combine information reported in multiple EIA
+tables into a single source of truth. "Entities" (boilers, generators, plants and
+utilities) are referred to repeatedly throughout EIA resulting in data duplication and
+human-error. For example, one year a plant's latitude is ``55.339722`` and
+another year it's ``55.339725`` or in 860 a plant is called ``Barry`` but in 923 it's
+once referred to as ``Bary``. We use a "harvesting" process to extract this static (not
+changing on an annual basis) information, determine the true value, and rehome it in our
+entity tables. This reduces the amount of data we have to store and provides users with
+a master list of all entity types and their characteristics that are reported to a
+given source. Read more about our harvesting process in :mod:`pudl.transform.eia`.
+""",
+    },
+    "static_eia": {
+        "title": "EIA Static Tables",
+        "description": """Static EIA tables are like EIA entity tables in that they pull
+from multiple EIA tables, but their purpose is to elucidate encoded language. This is
+where acronyms are connected to their full spelling and descriptions. These tables did
+not originate as raw EIA tables, rather they are created by us to link commonly used
+codes to important descriptors.""",
+    },
+    "static_eia_disabled": {
+        "title": "EIA Static Tables (disabled)",
+        "description": "Disabled tables are no longer included in the PUDL DB.",
+    },
+    "eia860": {
+        "title": "EIA 860",
+        "description": """Tables derived from the EIA Form 860. See our
+:doc:`../data_sources/eia860` page for more information.""",
+    },
+    "eia861": {
+        "title": "EIA 861",
+        "description": "Tables derived from the EIA Form 861.",
+    },
+    "eia923": {
+        "title": "EIA 923",
+        "description": """Tables derived from the EIA Form 923. See our
+:doc:`../data_sources/eia923` page for more information.""",
+    },
+    "eia_bulk_elec": {
+        "title": "EIA Bulk Electricity Tables",
+        "description": "Blah",
+    },
+    "epacems": {
+        "title": "EPA CEMS",
+        "description": """Tables derived from the EPA CEMS data. See our
+:doc:`../data_sources/epacems` page for more information.""",
+    },
+    "static_ferc1": {
+        "title": "FERC Form 1 Static Tables",
+        "description": """Static FERC Form 1 tables elucidate encoded language. This is
+where acronyms are connected to their full spelling and descriptions. These tables did
+not originate as raw FERC Form 1 tables, rather they are created by us to link commonly
+used codes to important descriptors.""",
+    },
+    "ferc1": {
+        "title": "FERC Form 1",
+        "description": """Tables derived from the FERC Form 1 tables. See our
+:doc:`../data_sources/ferc1` page for more information.""",
+    },
+    "ferc1_disabled": {
+        "title": "FERC Form 1 disabled",
+        "description": "Disabled tables are no longer included in the PUDL DB.",
+    },
+    "ferc714": {
+        "title": "FERC Form 714",
+        "description": "Tables derived from the FERC Form 714 tables.",
+    },
+    "glue": {
+        "title": "Glue Tables",
+        "description": "Tables connecting information from multiple sources",
+    },
+    "outputs": {
+        "title": "Output Tables",
+        "description": "Blah",
+    },
+    "static_pudl": {
+        "title": "Static PUDL Tables",
+        "description": """Static PUDL tables elucidate encoded language. This is
+where acronyms are connected to their full spelling and descriptions. They're created
+to link commonly used codes to important descriptors.""",
+    },
+}
+"""Table categorization by ETL group."""
+
 KEYWORDS: dict[str, list[str]] = {
     "electricity": [
         "electricity",