From 26aa7412b7aede32751014da53d4a945a7a9340e Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Wed, 22 Feb 2023 22:57:31 -0700 Subject: [PATCH 1/6] Update data dictionary docs page so that there are groups of etl_group types not just one long alphabetical list of tables. Took out the table fields for now, want to add them back in later --- docs/data_dictionaries/index.rst | 2 +- docs/templates/package.rst.jinja | 16 ++++++++++++++-- src/pudl/metadata/classes.py | 11 +++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/docs/data_dictionaries/index.rst b/docs/data_dictionaries/index.rst index 0a5c8eda60..59da5a54d5 100644 --- a/docs/data_dictionaries/index.rst +++ b/docs/data_dictionaries/index.rst @@ -5,7 +5,7 @@ Data Dictionaries .. toctree:: :caption: Data Processed & Cleaned by PUDL - :maxdepth: 1 + :maxdepth: 2 pudl_db diff --git a/docs/templates/package.rst.jinja b/docs/templates/package.rst.jinja index 0184b70c89..1a33d56f20 100644 --- a/docs/templates/package.rst.jinja +++ b/docs/templates/package.rst.jinja @@ -4,8 +4,20 @@ PUDL Data Dictionary The following data tables have been cleaned and transformed by our ETL process. -{% for resource in package.resources %} +{% for group, resources in package.get_resource_by_etl_group().items() %} +{{ group }} +------------------------------------------------------------------------------- + +{% for resource in resources %} + .. _{{ resource.name }}: -{% include 'resource.rst.jinja' %} +{{ resource.name }} +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +{{ resource.description | wordwrap(78) if resource.description else +'No table description available.' }} +`Browse or query this table in Datasette. 
`__ + +{% endfor %} {% endfor %} diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index a1b445a4bb..5ce23443ff 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -1800,6 +1800,17 @@ def get_resource(self, name: str) -> Resource: names = [resource.name for resource in self.resources] return self.resources[names.index(name)] + def get_resource_by_etl_group(self) -> dict[str, list[Resource]]: + """blah.""" + resource_dict = {} + for etl_group in list(self.resources[0].__annotations__["etl_group"].__args__): + resource_dict[etl_group] = [ + resource + for resource in self.resources + if resource.etl_group == etl_group + ] + return resource_dict + def to_rst(self, docs_dir: DirectoryPath, path: str) -> None: """Output to an RST file.""" template = _get_jinja_environment(docs_dir).get_template("package.rst.jinja") From ed394ed7ce678757782e5ec5c0b6d7268335ebe8 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Thu, 23 Feb 2023 12:37:24 -0700 Subject: [PATCH 2/6] Add pudl_db_fields.rst file and links that allow you to click back and forth between version with columns and version with just table descriptions --- docs/conf.py | 12 +++++++++++- docs/templates/package.rst.jinja | 11 +++++++++++ docs/templates/resource.rst.jinja | 8 -------- src/pudl/metadata/classes.py | 4 ++-- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 034cc70b16..548169b2a0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -136,7 +136,16 @@ def data_dictionary_metadata_to_rst(app): # Sort fields within each resource by name: for resource in package.resources: resource.schema.fields = sorted(resource.schema.fields, key=lambda x: x.name) - package.to_rst(docs_dir=DOCS_DIR, path=DOCS_DIR / "data_dictionaries/pudl_db.rst") + package.to_rst( + docs_dir=DOCS_DIR, + path=DOCS_DIR / "data_dictionaries/pudl_db_fields.rst", + add_fields=True, + ) + package.to_rst( + docs_dir=DOCS_DIR, + path=DOCS_DIR / 
"data_dictionaries/pudl_db.rst", + add_fields=False, + ) def data_sources_metadata_to_rst(app): @@ -183,6 +192,7 @@ def static_dfs_to_rst(app): def cleanup_rsts(app, exception): """Remove generated RST files when the build is finished.""" (DOCS_DIR / "data_dictionaries/pudl_db.rst").unlink() + (DOCS_DIR / "data_dictionaries/pudl_db_fields.rst").unlink() (DOCS_DIR / "data_dictionaries/codes_and_labels.rst").unlink() (DOCS_DIR / "data_sources/eia860.rst").unlink() (DOCS_DIR / "data_sources/eia923.rst").unlink() diff --git a/docs/templates/package.rst.jinja b/docs/templates/package.rst.jinja index 1a33d56f20..6512869a31 100644 --- a/docs/templates/package.rst.jinja +++ b/docs/templates/package.rst.jinja @@ -1,3 +1,5 @@ +:orphan: + =============================================================================== PUDL Data Dictionary =============================================================================== @@ -10,14 +12,23 @@ The following data tables have been cleaned and transformed by our ETL process. {% for resource in resources %} +{% if add_fields %} .. _{{ resource.name }}: +{% endif %} {{ resource.name }} ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ {{ resource.description | wordwrap(78) if resource.description else 'No table description available.' }} +{% if not add_fields %} +:ref:`View Columns <{{ resource.name }}>` +{% endif %} `Browse or query this table in Datasette. 
`__ +{% if add_fields %} +{% include 'resource.rst.jinja' %} +{% endif %} + {% endfor %} {% endfor %} diff --git a/docs/templates/resource.rst.jinja b/docs/templates/resource.rst.jinja index 9a986bf5da..a2dca14038 100644 --- a/docs/templates/resource.rst.jinja +++ b/docs/templates/resource.rst.jinja @@ -1,11 +1,3 @@ -------------------------------------------------------------------------------- -{{ resource.name }} -------------------------------------------------------------------------------- - -{{ resource.description | wordwrap(78) if resource.description else -'No table description available.' }} -`Browse or query this table in Datasette. `__ - .. list-table:: :widths: auto :header-rows: 1 diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index 5ce23443ff..379afc7cef 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -1811,10 +1811,10 @@ def get_resource_by_etl_group(self) -> dict[str, list[Resource]]: ] return resource_dict - def to_rst(self, docs_dir: DirectoryPath, path: str) -> None: + def to_rst(self, docs_dir: DirectoryPath, path: str, add_fields: bool) -> None: """Output to an RST file.""" template = _get_jinja_environment(docs_dir).get_template("package.rst.jinja") - rendered = template.render(package=self) + rendered = template.render(package=self, add_fields=add_fields) if path: Path(path).write_text(rendered) else: From 01a670df073ef06ad6c83cb752fcc75aa9bf92bd Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Fri, 24 Feb 2023 17:20:02 -0700 Subject: [PATCH 3/6] Make ETLGroup class and add some descriptions to ETL_GROUPS constant. Add etl_group description to the docs, fix the harvest_test.py tests so that etl_groups are accurately represented. Add etl_group to the docstrings in the classes module. 
--- docs/conf.py | 11 +++-- docs/templates/package.rst.jinja | 4 +- src/pudl/metadata/classes.py | 67 ++++++++++++++++----------- src/pudl/metadata/constants.py | 77 ++++++++++++++++++++++++++++++++ test/unit/harvest_test.py | 12 ++++- 5 files changed, 140 insertions(+), 31 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 548169b2a0..039a5ca877 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,7 +14,7 @@ import pkg_resources -from pudl.metadata.classes import CodeMetadata, DataSource, Package +from pudl.metadata.classes import CodeMetadata, DataSource, ETLGroup, Package from pudl.metadata.codes import CODE_METADATA from pudl.metadata.resources import RESOURCE_METADATA @@ -153,10 +153,15 @@ def data_sources_metadata_to_rst(app): print("Exporting data source metadata to RST.") included_sources = ["eia860", "eia923", "ferc1", "epacems"] package = Package.from_resource_ids() - extra_etl_groups = {"eia860": ["entity_eia"], "ferc1": ["glue"]} + extra_etl_groups = { + "eia860": [ETLGroup.from_id("entity_eia")], + "ferc1": [ETLGroup.from_id("glue")], + } for name in included_sources: source = DataSource.from_id(name) - source_resources = [res for res in package.resources if res.etl_group == name] + source_resources = [ + res for res in package.resources if res.etl_group == ETLGroup.from_id(name) + ] extra_resources = None if name in extra_etl_groups: # get resources for this source from extra etl groups diff --git a/docs/templates/package.rst.jinja b/docs/templates/package.rst.jinja index 6512869a31..7bda7d56f1 100644 --- a/docs/templates/package.rst.jinja +++ b/docs/templates/package.rst.jinja @@ -7,9 +7,11 @@ PUDL Data Dictionary The following data tables have been cleaned and transformed by our ETL process. 
{% for group, resources in package.get_resource_by_etl_group().items() %} -{{ group }} +{{ package.get_etl_group()[group].title }} ------------------------------------------------------------------------------- +{{ package.get_etl_group()[group].description }} + {% for resource in resources %} {% if add_fields %} diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index 379afc7cef..c0192d511b 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -22,6 +22,7 @@ from pudl.metadata.constants import ( CONSTRAINT_DTYPES, CONTRIBUTORS, + ETL_GROUPS, FIELD_DTYPES_PANDAS, FIELD_DTYPES_PYARROW, FIELD_DTYPES_SQL, @@ -1021,6 +1022,24 @@ def from_id(cls, x: str) -> "DataSource": return cls(**cls.dict_from_id(x)) +class ETLGroup(Base): + """Blah.""" + + # name: SnakeCase + title: String = None + description: String = None + + @staticmethod + def dict_from_id(x: str) -> dict: + """Construct dictionary from PUDL identifier.""" + return copy.deepcopy(ETL_GROUPS[x]) + + @classmethod + def from_id(cls, x: str) -> "ETLGroup": + """Construct Source by source name in the metadata.""" + return cls(**cls.dict_from_id(x)) + + class ResourceHarvest(Base): """Resource harvest parameters (`resource.harvest`).""" @@ -1046,7 +1065,8 @@ class Resource(Base): >>> fields = [{'name': 'x', 'type': 'year'}, {'name': 'y', 'type': 'string'}] >>> fkeys = [{'fields': ['x', 'y'], 'reference': {'resource': 'b', 'fields': ['x', 'y']}}] >>> schema = {'fields': fields, 'primary_key': ['x'], 'foreign_keys': fkeys} - >>> resource = Resource(name='a', schema=schema) + >>> etl_group = ETLGroup(title='a', description='a') + >>> resource = Resource(name='a', schema=schema, etl_group=etl_group) >>> table = resource.to_sql() >>> table.columns.x Column('x', Integer(), ForeignKey('b.x'), CheckConstraint(...), table=, primary_key=True, nullable=False) @@ -1065,7 +1085,8 @@ class Resource(Base): >>> resource = Resource(**{ ... 'name': 'a', ... 
'harvest': {'harvest': True}, - ... 'schema': {'fields': fields, 'primary_key': ['id']} + ... 'schema': {'fields': fields, 'primary_key': ['id']}, + ... 'etl_group': ETLGroup(title='a', description='b') ... }) >>> dfs = { ... 'a': pd.DataFrame({'id': [1, 1, 2, 2], 'x': [1, 1, 2, 2]}), @@ -1143,7 +1164,8 @@ class Resource(Base): >>> fields = [{'name': 'report_year', 'type': 'year'}] >>> resource = Resource(**{ ... 'name': 'table', 'harvest': {'harvest': True}, - ... 'schema': {'fields': fields, 'primary_key': ['report_year']} + ... 'schema': {'fields': fields, 'primary_key': ['report_year']}, + ... 'etl_group': ETLGroup(title='a', description='b') ... }) >>> df = pd.DataFrame({'report_date': ['2000-02-02', '2000-03-03']}) >>> resource.format_df(df) @@ -1182,23 +1204,7 @@ class Resource(Base): "ppe", "eia_bulk_elec", ] = None - etl_group: Literal[ - "eia860", - "eia861", - "eia923", - "entity_eia", - "epacems", - "ferc1", - "ferc1_disabled", - "ferc714", - "glue", - "outputs", - "static_ferc1", - "static_eia", - "static_eia_disabled", - "eia_bulk_elec", - "static_pudl", - ] = None + etl_group: ETLGroup _check_unique = _validator( "contributors", "keywords", "licenses", "sources", fn=_check_unique @@ -1279,6 +1285,7 @@ def dict_from_id(x: str) -> dict: # noqa: C901 # Delete foreign key rules if "foreign_key_rules" in schema: del schema["foreign_key_rules"] + obj["etl_group"] = ETLGroup.from_id(obj["etl_group"]) # Add encoders to columns as appropriate, based on FKs. 
# Foreign key relationships determine the set of codes to use @@ -1379,7 +1386,8 @@ def match_primary_key(self, names: Iterable[str]) -> dict[str, str] | None: Examples: >>> fields = [{'name': 'x_year', 'type': 'year'}] >>> schema = {'fields': fields, 'primary_key': ['x_year']} - >>> resource = Resource(name='r', schema=schema) + >>> etl_group = ETLGroup(title='r', description='r') + >>> resource = Resource(name='r', schema=schema, etl_group=etl_group) By default, when :attr:`harvest` .`harvest=False`, exact matches are required. @@ -1691,8 +1699,9 @@ class Package(Base): >>> fields = [{'name': 'x', 'type': 'year'}, {'name': 'y', 'type': 'string'}] >>> fkey = {'fields': ['x', 'y'], 'reference': {'resource': 'b', 'fields': ['x', 'y']}} >>> schema = {'fields': fields, 'primary_key': ['x'], 'foreign_keys': [fkey]} - >>> a = Resource(name='a', schema=schema) - >>> b = Resource(name='b', schema=Schema(fields=fields, primary_key=['x'])) + >>> etl_group = ETLGroup(title='a', description='b') + >>> a = Resource(name='a', schema=schema, etl_group=etl_group) + >>> b = Resource(name='b', schema=Schema(fields=fields, primary_key=['x']), etl_group=etl_group) >>> Package(name='ab', resources=[a, b]) Traceback (most recent call last): ValidationError: ... 
@@ -1792,7 +1801,6 @@ def from_resource_ids( # noqa: C901 i = len(resources) if len(names) > i: resources += [Resource.dict_from_id(x) for x in names[i:]] - return cls(name="pudl", resources=resources) def get_resource(self, name: str) -> Resource: @@ -1800,14 +1808,21 @@ def get_resource(self, name: str) -> Resource: names = [resource.name for resource in self.resources] return self.resources[names.index(name)] + def get_etl_group(self) -> dict[str, ETLGroup]: + """blah.""" + etl_group_dict = {} + for etl_group in ETL_GROUPS.keys(): + etl_group_dict[etl_group] = ETLGroup.from_id(etl_group) + return etl_group_dict + def get_resource_by_etl_group(self) -> dict[str, list[Resource]]: """blah.""" resource_dict = {} - for etl_group in list(self.resources[0].__annotations__["etl_group"].__args__): + for etl_group in ETL_GROUPS.keys(): resource_dict[etl_group] = [ resource for resource in self.resources - if resource.etl_group == etl_group + if resource.etl_group == ETLGroup.from_id(etl_group) ] return resource_dict diff --git a/src/pudl/metadata/constants.py b/src/pudl/metadata/constants.py index 6ba46dc6bb..a1996b8987 100644 --- a/src/pudl/metadata/constants.py +++ b/src/pudl/metadata/constants.py @@ -153,6 +153,83 @@ } """PUDL Contributors for attribution.""" +ETL_GROUPS: dict[str, dict[str, str]] = { + "eia860": { + "title": "EIA 860", + "description": """Tables derrived from the EIA Form 860. See our +:doc:`../data_sources/eia860` page for more information.""", + }, + "eia861": { + "title": "EIA 861", + "description": "Tables derrived from the EIA Form 861.", + }, + "eia923": { + "title": "EIA 923", + "description": """Tables derrived from the EIA Form 923. See our +:doc:`../data_sources/eia923` page for more information.""", + }, + "entity_eia": { + "title": "EIA Entity Tables", + "description": """EIA entity tables combine information reported in multiple EIA +tables into a single source of truth. 
"Entities" (boilers, generators, plants and +utilities) are referred to repeatedly throughout EIA resulting in data duplication and +human-error. For example, one year a plant's latitude is ``55.339722`` and +another year it's ``55.339725`` or in 860 a plant is called ``Barry`` but in 923 it's +once referred to as ``Bary``. We use a "harvesting" process to extract this static (not +changing on an annual basis) information, determine the true value, and rehome it in our +entity tables. This reduces the amount of data we have to store and provides users with +a master list of all entitiy types and their characteristics that are reported to a +given source. Read more about our harvesting process in :mod:`pudl.transform.eia`. +""", + }, + "epacems": { + "title": "EPA CEMS", + "description": """Tables derrived from the EPA CEMS data. See our +:doc:`../data_sources/epacems` page for more information""", + }, + "ferc1": { + "title": "FERC Form 1", + "description": """Tables derrived from the FERC Form 1 tables. See our +:doc:`../data_sources/ferc1` page for more information""", + }, + "ferc1_disabled": { + "title": "FERC Form 1 disabled", + "description": "BLAH", + }, + "ferc714": { + "title": "FERC Form 714", + "description": "Tables derrived from the FERC Form 714 tables", + }, + "glue": { + "title": "Glue Tables", + "description": "Tables connecting information from multiple sources", + }, + "outputs": { + "title": "Output Tables", + "description": "Blah", + }, + "static_ferc1": { + "title": "Static FERC 1 Tables", + "description": "Blah", + }, + "static_eia": { + "title": "Static EIA Tables", + "description": """Static EIA tables are like EIA entity tables in that they pull +from multiple EIA tables, but their purpose is to elucidate encoded language. 
This is +where acromyns are connected to their full spelling and descriptions.""", + }, + "static_eia_disabled": { + "title": "Static EIA Tables (disabled)", + "description": "blah", + }, + "eia_bulk_elec": { + "title": "EIA Bulk Electricity Tables", + "description": "Blah", + }, + "static_pudl": {"title": "Static PUDL Tables", "description": "Blah"}, +} +"""Table categorization by ETL group.""" + KEYWORDS: dict[str, list[str]] = { "electricity": [ "electricity", diff --git a/test/unit/harvest_test.py b/test/unit/harvest_test.py index 6b6a158b3e..23c17b66f9 100644 --- a/test/unit/harvest_test.py +++ b/test/unit/harvest_test.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from pudl.metadata.classes import Resource +from pudl.metadata.classes import ETLGroup, Resource from pudl.metadata.helpers import most_frequent # ---- Helpers ---- # @@ -35,6 +35,7 @@ def _assert_frame_equal(a: pd.DataFrame, b: pd.DataFrame, **kwargs: Any) -> None ], "primary_key": ["i", "j"], }, + "etl_group": ETLGroup.from_id("static_pudl"), } HARVEST: dict[str, Any] = {**STANDARD, "harvest": {"harvest": True}} @@ -264,6 +265,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["plant_id_eia", "state", "balancing_authority_code_eia"], "primary_key": ["plant_id_eia"], }, + "etl_group": "eia860", }, { "name": "generator_entity_eia860", @@ -277,6 +279,7 @@ def test_resource_with_only_key_fields_harvests() -> None: ], "primary_key": ["plant_id_eia", "generator_id"], }, + "etl_group": "eia860", }, { "name": "generators_eia860", @@ -285,6 +288,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["plant_id_eia", "generator_id", "report_year", "capacity_mw"], "primary_key": ["plant_id_eia", "generator_id", "report_year"], }, + "etl_group": "eia860", }, { "name": "utility_entity_eia", @@ -293,6 +297,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["utility_id_eia", "utility_name_eia"], "primary_key": ["utility_id_eia"], }, + 
"etl_group": "entity_eia", }, { "name": "utility_assn_eia", @@ -301,6 +306,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["utility_id_eia", "report_year", "state", "county"], "primary_key": ["utility_id_eia", "report_year", "state", "county"], }, + "etl_group": "static_eia", }, { "name": "generation_eia923", @@ -314,6 +320,7 @@ def test_resource_with_only_key_fields_harvests() -> None: ], "primary_key": ["plant_id_eia", "generator_id", "report_month"], }, + "etl_group": "eia923", }, { "name": "sales_eia861", @@ -322,6 +329,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["utility_id_eia", "report_year", "state", "county", "sales"], "primary_key": ["utility_id_eia", "report_year", "state", "county"], }, + "etl_group": "eia861", }, { "name": "boiler_generator_assn_eia860", @@ -330,6 +338,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "fields": ["plant_id_eia", "generator_id", "report_year", "boiler_id"], "primary_key": ["plant_id_eia", "generator_id", "report_year", "boiler_id"], }, + "etl_group": "eia860", }, ] @@ -338,6 +347,7 @@ def test_resource_with_only_key_fields_harvests() -> None: d["schema"]["fields"] = [ {"name": name, "type": FIELD_DTYPES[name]} for name in d["schema"]["fields"] ] + d["etl_group"] = ETLGroup.from_id(d["etl_group"]) RESOURCES[i] = Resource(**d) EXPECTED_DFS: dict[str, pd.DataFrame] = dict( From 868a67a9f78c41b291082ff70d86a46c7b974c0a Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Fri, 24 Feb 2023 19:37:20 -0700 Subject: [PATCH 4/6] Rename test.rst.jinja to etl_group.rst.jinja and remove add_fields parat for Package.to_rst() --- docs/conf.py | 10 ++--- docs/templates/package.rst.jinja | 12 ------ docs/templates/resource.rst.jinja | 1 + src/pudl/metadata/classes.py | 6 +-- src/pudl/metadata/constants.py | 62 +++++++++++++++---------------- 5 files changed, 40 insertions(+), 51 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 039a5ca877..ef73f61727 100644 
--- a/docs/conf.py +++ b/docs/conf.py @@ -138,13 +138,13 @@ def data_dictionary_metadata_to_rst(app): resource.schema.fields = sorted(resource.schema.fields, key=lambda x: x.name) package.to_rst( docs_dir=DOCS_DIR, - path=DOCS_DIR / "data_dictionaries/pudl_db_fields.rst", - add_fields=True, + path=DOCS_DIR / "data_dictionaries/pudl_db.rst", + template="etl_group.rst.jinja", ) package.to_rst( docs_dir=DOCS_DIR, - path=DOCS_DIR / "data_dictionaries/pudl_db.rst", - add_fields=False, + path=DOCS_DIR / "data_dictionaries/pudl_db_fields.rst", + template="package.rst.jinja", ) @@ -197,7 +197,7 @@ def static_dfs_to_rst(app): def cleanup_rsts(app, exception): """Remove generated RST files when the build is finished.""" (DOCS_DIR / "data_dictionaries/pudl_db.rst").unlink() - (DOCS_DIR / "data_dictionaries/pudl_db_fields.rst").unlink() + # (DOCS_DIR / "data_dictionaries/pudl_db_fields.rst").unlink() (DOCS_DIR / "data_dictionaries/codes_and_labels.rst").unlink() (DOCS_DIR / "data_sources/eia860.rst").unlink() (DOCS_DIR / "data_sources/eia923.rst").unlink() diff --git a/docs/templates/package.rst.jinja b/docs/templates/package.rst.jinja index 7bda7d56f1..d933a76072 100644 --- a/docs/templates/package.rst.jinja +++ b/docs/templates/package.rst.jinja @@ -7,30 +7,18 @@ PUDL Data Dictionary The following data tables have been cleaned and transformed by our ETL process. {% for group, resources in package.get_resource_by_etl_group().items() %} -{{ package.get_etl_group()[group].title }} -------------------------------------------------------------------------------- - -{{ package.get_etl_group()[group].description }} - {% for resource in resources %} -{% if add_fields %} .. _{{ resource.name }}: -{% endif %} {{ resource.name }} ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ {{ resource.description | wordwrap(78) if resource.description else 'No table description available.' 
}} -{% if not add_fields %} -:ref:`View Columns <{{ resource.name }}>` -{% endif %} `Browse or query this table in Datasette. `__ -{% if add_fields %} {% include 'resource.rst.jinja' %} -{% endif %} {% endfor %} {% endfor %} diff --git a/docs/templates/resource.rst.jinja b/docs/templates/resource.rst.jinja index a2dca14038..0c0f06f62a 100644 --- a/docs/templates/resource.rst.jinja +++ b/docs/templates/resource.rst.jinja @@ -9,4 +9,5 @@ * - {{ field.name }} - {{ field.type }} - {{ field.description if field.description else "N/A" }} + {%- endfor %} diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index c0192d511b..a20728d4c9 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -1826,10 +1826,10 @@ def get_resource_by_etl_group(self) -> dict[str, list[Resource]]: ] return resource_dict - def to_rst(self, docs_dir: DirectoryPath, path: str, add_fields: bool) -> None: + def to_rst(self, docs_dir: DirectoryPath, path: str, template: str) -> None: """Output to an RST file.""" - template = _get_jinja_environment(docs_dir).get_template("package.rst.jinja") - rendered = template.render(package=self, add_fields=add_fields) + template = _get_jinja_environment(docs_dir).get_template(template) + rendered = template.render(package=self) if path: Path(path).write_text(rendered) else: diff --git a/src/pudl/metadata/constants.py b/src/pudl/metadata/constants.py index a1996b8987..aa030e6dea 100644 --- a/src/pudl/metadata/constants.py +++ b/src/pudl/metadata/constants.py @@ -154,6 +154,30 @@ """PUDL Contributors for attribution.""" ETL_GROUPS: dict[str, dict[str, str]] = { + "entity_eia": { + "title": "EIA Entity Tables", + "description": """EIA entity tables combine information reported in multiple EIA +tables into a single source of truth. "Entities" (boilers, generators, plants and +utilities) are referred to repeatedly throughout EIA resulting in data duplication and +human-error. 
For example, one year a plant's latitude is ``55.339722`` and +another year it's ``55.339725`` or in 860 a plant is called ``Barry`` but in 923 it's +once referred to as ``Bary``. We use a "harvesting" process to extract this static (not +changing on an annual basis) information, determine the true value, and rehome it in our +entity tables. This reduces the amount of data we have to store and provides users with +a master list of all entitiy types and their characteristics that are reported to a +given source. Read more about our harvesting process in :mod:`pudl.transform.eia`. +""", + }, + "static_eia": { + "title": "EIA Static Tables", + "description": """Static EIA tables are like EIA entity tables in that they pull +from multiple EIA tables, but their purpose is to elucidate encoded language. This is +where acromyns are connected to their full spelling and descriptions.""", + }, + "static_eia_disabled": { + "title": "EIA Static Tables (disabled)", + "description": "blah", + }, "eia860": { "title": "EIA 860", "description": """Tables derrived from the EIA Form 860. See our @@ -168,25 +192,19 @@ "description": """Tables derrived from the EIA Form 923. See our :doc:`../data_sources/eia923` page for more information.""", }, - "entity_eia": { - "title": "EIA Entity Tables", - "description": """EIA entity tables combine information reported in multiple EIA -tables into a single source of truth. "Entities" (boilers, generators, plants and -utilities) are referred to repeatedly throughout EIA resulting in data duplication and -human-error. For example, one year a plant's latitude is ``55.339722`` and -another year it's ``55.339725`` or in 860 a plant is called ``Barry`` but in 923 it's -once referred to as ``Bary``. We use a "harvesting" process to extract this static (not -changing on an annual basis) information, determine the true value, and rehome it in our -entity tables. 
This reduces the amount of data we have to store and provides users with -a master list of all entitiy types and their characteristics that are reported to a -given source. Read more about our harvesting process in :mod:`pudl.transform.eia`. -""", + "eia_bulk_elec": { + "title": "EIA Bulk Electricity Tables", + "description": "Blah", }, "epacems": { "title": "EPA CEMS", "description": """Tables derrived from the EPA CEMS data. See our :doc:`../data_sources/epacems` page for more information""", }, + "static_ferc1": { + "title": "FERC Form 1 Static Tables", + "description": "Blah", + }, "ferc1": { "title": "FERC Form 1", "description": """Tables derrived from the FERC Form 1 tables. See our @@ -208,24 +226,6 @@ "title": "Output Tables", "description": "Blah", }, - "static_ferc1": { - "title": "Static FERC 1 Tables", - "description": "Blah", - }, - "static_eia": { - "title": "Static EIA Tables", - "description": """Static EIA tables are like EIA entity tables in that they pull -from multiple EIA tables, but their purpose is to elucidate encoded language. 
This is -where acromyns are connected to their full spelling and descriptions.""", - }, - "static_eia_disabled": { - "title": "Static EIA Tables (disabled)", - "description": "blah", - }, - "eia_bulk_elec": { - "title": "EIA Bulk Electricity Tables", - "description": "Blah", - }, "static_pudl": {"title": "Static PUDL Tables", "description": "Blah"}, } """Table categorization by ETL group.""" From 79e07b7bbe2dba03f9d74609d3d4cd6324109fdf Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Fri, 24 Feb 2023 19:45:54 -0700 Subject: [PATCH 5/6] Add more descriptions to etl_groups constants --- src/pudl/metadata/constants.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/pudl/metadata/constants.py b/src/pudl/metadata/constants.py index aa030e6dea..eaf6c2eb30 100644 --- a/src/pudl/metadata/constants.py +++ b/src/pudl/metadata/constants.py @@ -172,11 +172,13 @@ "title": "EIA Static Tables", "description": """Static EIA tables are like EIA entity tables in that they pull from multiple EIA tables, but their purpose is to elucidate encoded language. This is -where acromyns are connected to their full spelling and descriptions.""", +where acromyns are connected to their full spelling and descriptions. These tables did +not originate as raw EIA tables, rather they are created by us to link commonly used +codes to important descriptors.""", }, "static_eia_disabled": { "title": "EIA Static Tables (disabled)", - "description": "blah", + "description": "Disabled tables are no longer included in the PUDL DB.", }, "eia860": { "title": "EIA 860", @@ -203,7 +205,10 @@ }, "static_ferc1": { "title": "FERC Form 1 Static Tables", - "description": "Blah", + "description": """Static FERC Form 1 tables elucidate encoded language. This is +where acromyns are connected to their full spelling and descriptions. 
These tables did +not originate as raw FERC Form 1 tables, rather they are created by us to link commonly +used codes to important descriptors.""", }, "ferc1": { "title": "FERC Form 1", @@ -212,7 +217,7 @@ }, "ferc1_disabled": { "title": "FERC Form 1 disabled", - "description": "BLAH", + "description": "Disabled tables are no longer included in the PUDL DB.", }, "ferc714": { "title": "FERC Form 714", @@ -226,7 +231,12 @@ "title": "Output Tables", "description": "Blah", }, - "static_pudl": {"title": "Static PUDL Tables", "description": "Blah"}, + "static_pudl": { + "title": "Static PUDL Tables", + "description": """Static PUDL tables elucidate encoded language. This is +where acromyns are connected to their full spelling and descriptions. They're created +to link commonly used codes to important descriptors.""", + }, } """Table categorization by ETL group.""" From 295886adfd3e75d7cf25d1a09127cb88ab93fb57 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Mon, 27 Feb 2023 16:49:56 -0700 Subject: [PATCH 6/6] Add in etl_group.rst.jinja jinja template that I forgot. 
Also uncomment pudl_db_fields from cleanup_rsts so it gets removed --- docs/conf.py | 2 +- docs/templates/etl_group.rst.jinja | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 docs/templates/etl_group.rst.jinja diff --git a/docs/conf.py b/docs/conf.py index ef73f61727..f03a6be1b4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -197,7 +197,7 @@ def static_dfs_to_rst(app): def cleanup_rsts(app, exception): """Remove generated RST files when the build is finished.""" (DOCS_DIR / "data_dictionaries/pudl_db.rst").unlink() - # (DOCS_DIR / "data_dictionaries/pudl_db_fields.rst").unlink() + (DOCS_DIR / "data_dictionaries/pudl_db_fields.rst").unlink() (DOCS_DIR / "data_dictionaries/codes_and_labels.rst").unlink() (DOCS_DIR / "data_sources/eia860.rst").unlink() (DOCS_DIR / "data_sources/eia923.rst").unlink() diff --git a/docs/templates/etl_group.rst.jinja b/docs/templates/etl_group.rst.jinja new file mode 100644 index 0000000000..8da45bee8d --- /dev/null +++ b/docs/templates/etl_group.rst.jinja @@ -0,0 +1,19 @@ +=============================================================================== +PUDL Data Dictionary +=============================================================================== + +The following data tables have been cleaned and transformed by our ETL process. +They're separated into categories based on their original source and the type of +content they contain. + +{% for group, resources in package.get_resource_by_etl_group().items() %} +{{ package.get_etl_group()[group].title }} +------------------------------------------------------------------------------- + +{{ package.get_etl_group()[group].description }} + +{% for resource in resources %} +- :ref:`{{ resource.name }}` + +{% endfor %} +{% endfor %}