diff --git a/docs/conf.py b/docs/conf.py index 034cc70b16..f03a6be1b4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,7 +14,7 @@ import pkg_resources -from pudl.metadata.classes import CodeMetadata, DataSource, Package +from pudl.metadata.classes import CodeMetadata, DataSource, ETLGroup, Package from pudl.metadata.codes import CODE_METADATA from pudl.metadata.resources import RESOURCE_METADATA @@ -136,7 +136,16 @@ def data_dictionary_metadata_to_rst(app): # Sort fields within each resource by name: for resource in package.resources: resource.schema.fields = sorted(resource.schema.fields, key=lambda x: x.name) - package.to_rst(docs_dir=DOCS_DIR, path=DOCS_DIR / "data_dictionaries/pudl_db.rst") + package.to_rst( + docs_dir=DOCS_DIR, + path=DOCS_DIR / "data_dictionaries/pudl_db.rst", + template="etl_group.rst.jinja", + ) + package.to_rst( + docs_dir=DOCS_DIR, + path=DOCS_DIR / "data_dictionaries/pudl_db_fields.rst", + template="package.rst.jinja", + ) def data_sources_metadata_to_rst(app): @@ -144,10 +153,15 @@ def data_sources_metadata_to_rst(app): print("Exporting data source metadata to RST.") included_sources = ["eia860", "eia923", "ferc1", "epacems"] package = Package.from_resource_ids() - extra_etl_groups = {"eia860": ["entity_eia"], "ferc1": ["glue"]} + extra_etl_groups = { + "eia860": [ETLGroup.from_id("entity_eia")], + "ferc1": [ETLGroup.from_id("glue")], + } for name in included_sources: source = DataSource.from_id(name) - source_resources = [res for res in package.resources if res.etl_group == name] + source_resources = [ + res for res in package.resources if res.etl_group == ETLGroup.from_id(name) + ] extra_resources = None if name in extra_etl_groups: # get resources for this source from extra etl groups @@ -183,6 +197,7 @@ def static_dfs_to_rst(app): def cleanup_rsts(app, exception): """Remove generated RST files when the build is finished.""" (DOCS_DIR / "data_dictionaries/pudl_db.rst").unlink() + (DOCS_DIR / 
"data_dictionaries/pudl_db_fields.rst").unlink() (DOCS_DIR / "data_dictionaries/codes_and_labels.rst").unlink() (DOCS_DIR / "data_sources/eia860.rst").unlink() (DOCS_DIR / "data_sources/eia923.rst").unlink() diff --git a/docs/data_dictionaries/index.rst b/docs/data_dictionaries/index.rst index 0a5c8eda60..59da5a54d5 100644 --- a/docs/data_dictionaries/index.rst +++ b/docs/data_dictionaries/index.rst @@ -5,7 +5,7 @@ Data Dictionaries .. toctree:: :caption: Data Processed & Cleaned by PUDL - :maxdepth: 1 + :maxdepth: 2 pudl_db diff --git a/docs/templates/etl_group.rst.jinja b/docs/templates/etl_group.rst.jinja new file mode 100644 index 0000000000..8da45bee8d --- /dev/null +++ b/docs/templates/etl_group.rst.jinja @@ -0,0 +1,19 @@ +=============================================================================== +PUDL Data Dictionary +=============================================================================== + +The following data tables have been cleaned and transformed by our ETL process. +They're separated into categories based on their original source and the type of +content they contain. + +{% for group, resources in package.get_resource_by_etl_group().items() %} +{{ package.get_etl_group()[group].title }} +------------------------------------------------------------------------------- + +{{ package.get_etl_group()[group].description }} + +{% for resource in resources %} +- :ref:`{{ resource.name }}` + +{% endfor %} +{% endfor %} diff --git a/docs/templates/package.rst.jinja b/docs/templates/package.rst.jinja index 0184b70c89..d933a76072 100644 --- a/docs/templates/package.rst.jinja +++ b/docs/templates/package.rst.jinja @@ -1,11 +1,24 @@ +:orphan: + =============================================================================== PUDL Data Dictionary =============================================================================== The following data tables have been cleaned and transformed by our ETL process.
-{% for resource in package.resources %} +{% for group, resources in package.get_resource_by_etl_group().items() %} +{% for resource in resources %} + .. _{{ resource.name }}: +{{ resource.name }} +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +{{ resource.description | wordwrap(78) if resource.description else +'No table description available.' }} +`Browse or query this table in Datasette. `__ + {% include 'resource.rst.jinja' %} + +{% endfor %} {% endfor %} diff --git a/docs/templates/resource.rst.jinja b/docs/templates/resource.rst.jinja index 9a986bf5da..0c0f06f62a 100644 --- a/docs/templates/resource.rst.jinja +++ b/docs/templates/resource.rst.jinja @@ -1,11 +1,3 @@ -------------------------------------------------------------------------------- -{{ resource.name }} -------------------------------------------------------------------------------- - -{{ resource.description | wordwrap(78) if resource.description else -'No table description available.' }} -`Browse or query this table in Datasette. `__ - .. 
list-table:: :widths: auto :header-rows: 1 @@ -17,4 +9,5 @@ * - {{ field.name }} - {{ field.type }} - {{ field.description if field.description else "N/A" }} + {%- endfor %} diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index a1b445a4bb..a20728d4c9 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -22,6 +22,7 @@ from pudl.metadata.constants import ( CONSTRAINT_DTYPES, CONTRIBUTORS, + ETL_GROUPS, FIELD_DTYPES_PANDAS, FIELD_DTYPES_PYARROW, FIELD_DTYPES_SQL, @@ -1021,6 +1022,24 @@ def from_id(cls, x: str) -> "DataSource": return cls(**cls.dict_from_id(x)) +class ETLGroup(Base): + """An ETL group: a titled, described category of PUDL tables.""" + + # name: SnakeCase + title: String = None + description: String = None + + @staticmethod + def dict_from_id(x: str) -> dict: + """Construct dictionary from PUDL identifier.""" + return copy.deepcopy(ETL_GROUPS[x]) + + @classmethod + def from_id(cls, x: str) -> "ETLGroup": + """Construct ETLGroup by ETL group ID in the metadata.""" + return cls(**cls.dict_from_id(x)) + + class ResourceHarvest(Base): """Resource harvest parameters (`resource.harvest`).""" @@ -1046,7 +1065,8 @@ class Resource(Base): >>> fields = [{'name': 'x', 'type': 'year'}, {'name': 'y', 'type': 'string'}] >>> fkeys = [{'fields': ['x', 'y'], 'reference': {'resource': 'b', 'fields': ['x', 'y']}}] >>> schema = {'fields': fields, 'primary_key': ['x'], 'foreign_keys': fkeys} - >>> resource = Resource(name='a', schema=schema) + >>> etl_group = ETLGroup(title='a', description='a') + >>> resource = Resource(name='a', schema=schema, etl_group=etl_group) >>> table = resource.to_sql() >>> table.columns.x Column('x', Integer(), ForeignKey('b.x'), CheckConstraint(...), table=, primary_key=True, nullable=False) @@ -1065,7 +1085,8 @@ class Resource(Base): >>> resource = Resource(**{ ... 'name': 'a', ... 'harvest': {'harvest': True}, - ... 'schema': {'fields': fields, 'primary_key': ['id']} + ... 'schema': {'fields': fields, 'primary_key': ['id']}, + ...
'etl_group': ETLGroup(title='a', description='b') ... }) >>> dfs = { ... 'a': pd.DataFrame({'id': [1, 1, 2, 2], 'x': [1, 1, 2, 2]}), @@ -1143,7 +1164,8 @@ class Resource(Base): >>> fields = [{'name': 'report_year', 'type': 'year'}] >>> resource = Resource(**{ ... 'name': 'table', 'harvest': {'harvest': True}, - ... 'schema': {'fields': fields, 'primary_key': ['report_year']} + ... 'schema': {'fields': fields, 'primary_key': ['report_year']}, + ... 'etl_group': ETLGroup(title='a', description='b') ... }) >>> df = pd.DataFrame({'report_date': ['2000-02-02', '2000-03-03']}) >>> resource.format_df(df) @@ -1182,23 +1204,7 @@ class Resource(Base): "ppe", "eia_bulk_elec", ] = None - etl_group: Literal[ - "eia860", - "eia861", - "eia923", - "entity_eia", - "epacems", - "ferc1", - "ferc1_disabled", - "ferc714", - "glue", - "outputs", - "static_ferc1", - "static_eia", - "static_eia_disabled", - "eia_bulk_elec", - "static_pudl", - ] = None + etl_group: ETLGroup _check_unique = _validator( "contributors", "keywords", "licenses", "sources", fn=_check_unique @@ -1279,6 +1285,7 @@ def dict_from_id(x: str) -> dict: # noqa: C901 # Delete foreign key rules if "foreign_key_rules" in schema: del schema["foreign_key_rules"] + obj["etl_group"] = ETLGroup.from_id(obj["etl_group"]) # Add encoders to columns as appropriate, based on FKs. # Foreign key relationships determine the set of codes to use @@ -1379,7 +1386,8 @@ def match_primary_key(self, names: Iterable[str]) -> dict[str, str] | None: Examples: >>> fields = [{'name': 'x_year', 'type': 'year'}] >>> schema = {'fields': fields, 'primary_key': ['x_year']} - >>> resource = Resource(name='r', schema=schema) + >>> etl_group = ETLGroup(title='r', description='r') + >>> resource = Resource(name='r', schema=schema, etl_group=etl_group) By default, when :attr:`harvest` .`harvest=False`, exact matches are required. 
@@ -1691,8 +1699,9 @@ class Package(Base): >>> fields = [{'name': 'x', 'type': 'year'}, {'name': 'y', 'type': 'string'}] >>> fkey = {'fields': ['x', 'y'], 'reference': {'resource': 'b', 'fields': ['x', 'y']}} >>> schema = {'fields': fields, 'primary_key': ['x'], 'foreign_keys': [fkey]} - >>> a = Resource(name='a', schema=schema) - >>> b = Resource(name='b', schema=Schema(fields=fields, primary_key=['x'])) + >>> etl_group = ETLGroup(title='a', description='b') + >>> a = Resource(name='a', schema=schema, etl_group=etl_group) + >>> b = Resource(name='b', schema=Schema(fields=fields, primary_key=['x']), etl_group=etl_group) >>> Package(name='ab', resources=[a, b]) Traceback (most recent call last): ValidationError: ... @@ -1792,7 +1801,6 @@ def from_resource_ids( # noqa: C901 i = len(resources) if len(names) > i: resources += [Resource.dict_from_id(x) for x in names[i:]] - return cls(name="pudl", resources=resources) def get_resource(self, name: str) -> Resource: @@ -1800,9 +1808,27 @@ def get_resource(self, name: str) -> Resource: names = [resource.name for resource in self.resources] return self.resources[names.index(name)] - def to_rst(self, docs_dir: DirectoryPath, path: str) -> None: + def get_etl_group(self) -> dict[str, ETLGroup]: + """Return all ETL groups defined in the metadata, keyed by ID.""" + etl_group_dict = {} + for etl_group in ETL_GROUPS.keys(): + etl_group_dict[etl_group] = ETLGroup.from_id(etl_group) + return etl_group_dict + + def get_resource_by_etl_group(self) -> dict[str, list[Resource]]: + """Return this package's resources grouped by ETL group ID.""" + resource_dict = {} + for etl_group in ETL_GROUPS.keys(): + resource_dict[etl_group] = [ + resource + for resource in self.resources + if resource.etl_group == ETLGroup.from_id(etl_group) + ] + return resource_dict + + def to_rst(self, docs_dir: DirectoryPath, path: str, template: str) -> None: """Output to an RST file.""" - template = _get_jinja_environment(docs_dir).get_template("package.rst.jinja") + template = _get_jinja_environment(docs_dir).get_template(template) rendered =
template.render(package=self) if path: Path(path).write_text(rendered) diff --git a/src/pudl/metadata/constants.py b/src/pudl/metadata/constants.py index 6ba46dc6bb..eaf6c2eb30 100644 --- a/src/pudl/metadata/constants.py +++ b/src/pudl/metadata/constants.py @@ -153,6 +153,93 @@ } """PUDL Contributors for attribution.""" +ETL_GROUPS: dict[str, dict[str, str]] = { + "entity_eia": { + "title": "EIA Entity Tables", + "description": """EIA entity tables combine information reported in multiple EIA +tables into a single source of truth. "Entities" (boilers, generators, plants and +utilities) are referred to repeatedly throughout EIA resulting in data duplication and +human-error. For example, one year a plant's latitude is ``55.339722`` and +another year it's ``55.339725`` or in 860 a plant is called ``Barry`` but in 923 it's +once referred to as ``Bary``. We use a "harvesting" process to extract this static (not +changing on an annual basis) information, determine the true value, and rehome it in our +entity tables. This reduces the amount of data we have to store and provides users with +a master list of all entity types and their characteristics that are reported to a +given source. Read more about our harvesting process in :mod:`pudl.transform.eia`. +""", + }, + "static_eia": { + "title": "EIA Static Tables", + "description": """Static EIA tables are like EIA entity tables in that they pull +from multiple EIA tables, but their purpose is to elucidate encoded language. This is +where acronyms are connected to their full spelling and descriptions. These tables did +not originate as raw EIA tables, rather they are created by us to link commonly used +codes to important descriptors.""", + }, + "static_eia_disabled": { + "title": "EIA Static Tables (disabled)", + "description": "Disabled tables are no longer included in the PUDL DB.", + }, + "eia860": { + "title": "EIA 860", + "description": """Tables derived from the EIA Form 860.
See our +:doc:`../data_sources/eia860` page for more information.""", + }, + "eia861": { + "title": "EIA 861", + "description": "Tables derived from the EIA Form 861.", + }, + "eia923": { + "title": "EIA 923", + "description": """Tables derived from the EIA Form 923. See our +:doc:`../data_sources/eia923` page for more information.""", + }, + "eia_bulk_elec": { + "title": "EIA Bulk Electricity Tables", + "description": "Blah", + }, + "epacems": { + "title": "EPA CEMS", + "description": """Tables derived from the EPA CEMS data. See our +:doc:`../data_sources/epacems` page for more information""", + }, + "static_ferc1": { + "title": "FERC Form 1 Static Tables", + "description": """Static FERC Form 1 tables elucidate encoded language. This is +where acronyms are connected to their full spelling and descriptions. These tables did +not originate as raw FERC Form 1 tables, rather they are created by us to link commonly +used codes to important descriptors.""", + }, + "ferc1": { + "title": "FERC Form 1", + "description": """Tables derived from the FERC Form 1 tables. See our +:doc:`../data_sources/ferc1` page for more information""", + }, + "ferc1_disabled": { + "title": "FERC Form 1 disabled", + "description": "Disabled tables are no longer included in the PUDL DB.", + }, + "ferc714": { + "title": "FERC Form 714", + "description": "Tables derived from the FERC Form 714 tables", + }, + "glue": { + "title": "Glue Tables", + "description": "Tables connecting information from multiple sources", + }, + "outputs": { + "title": "Output Tables", + "description": "Blah", + }, + "static_pudl": { + "title": "Static PUDL Tables", + "description": """Static PUDL tables elucidate encoded language. This is +where acronyms are connected to their full spelling and descriptions.
They're created +to link commonly used codes to important descriptors.""", + }, +} +"""Table categorization by ETL group.""" + KEYWORDS: dict[str, list[str]] = { "electricity": [ "electricity", diff --git a/test/unit/harvest_test.py b/test/unit/harvest_test.py index 6b6a158b3e..23c17b66f9 100644 --- a/test/unit/harvest_test.py +++ b/test/unit/harvest_test.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from pudl.metadata.classes import Resource +from pudl.metadata.classes import ETLGroup, Resource from pudl.metadata.helpers import most_frequent # ---- Helpers ---- # @@ -35,6 +35,7 @@ def _assert_frame_equal(a: pd.DataFrame, b: pd.DataFrame, **kwargs: Any) -> None ], "primary_key": ["i", "j"], }, + "etl_group": ETLGroup.from_id("static_pudl"), } HARVEST: dict[str, Any] = {**STANDARD, "harvest": {"harvest": True}} @@ -264,6 +265,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["plant_id_eia", "state", "balancing_authority_code_eia"], "primary_key": ["plant_id_eia"], }, + "etl_group": "eia860", }, { "name": "generator_entity_eia860", @@ -277,6 +279,7 @@ def test_resource_with_only_key_fields_harvests() -> None: ], "primary_key": ["plant_id_eia", "generator_id"], }, + "etl_group": "eia860", }, { "name": "generators_eia860", @@ -285,6 +288,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["plant_id_eia", "generator_id", "report_year", "capacity_mw"], "primary_key": ["plant_id_eia", "generator_id", "report_year"], }, + "etl_group": "eia860", }, { "name": "utility_entity_eia", @@ -293,6 +297,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["utility_id_eia", "utility_name_eia"], "primary_key": ["utility_id_eia"], }, + "etl_group": "entity_eia", }, { "name": "utility_assn_eia", @@ -301,6 +306,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["utility_id_eia", "report_year", "state", "county"], "primary_key": ["utility_id_eia", "report_year", "state", "county"], }, + 
"etl_group": "static_eia", }, { "name": "generation_eia923", @@ -314,6 +320,7 @@ def test_resource_with_only_key_fields_harvests() -> None: ], "primary_key": ["plant_id_eia", "generator_id", "report_month"], }, + "etl_group": "eia923", }, { "name": "sales_eia861", @@ -322,6 +329,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["utility_id_eia", "report_year", "state", "county", "sales"], "primary_key": ["utility_id_eia", "report_year", "state", "county"], }, + "etl_group": "eia861", }, { "name": "boiler_generator_assn_eia860", @@ -330,6 +338,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["plant_id_eia", "generator_id", "report_year", "boiler_id"], "primary_key": ["plant_id_eia", "generator_id", "report_year", "boiler_id"], }, + "etl_group": "eia860", }, ] @@ -338,6 +347,7 @@ def test_resource_with_only_key_fields_harvests() -> None: d["schema"]["fields"] = [ {"name": name, "type": FIELD_DTYPES[name]} for name in d["schema"]["fields"] ] + d["etl_group"] = ETLGroup.from_id(d["etl_group"]) RESOURCES[i] = Resource(**d) EXPECTED_DFS: dict[str, pd.DataFrame] = dict(