From 139b6218781cc5fad6316a21edb8b41893fb5818 Mon Sep 17 00:00:00 2001
From: Dazhong Xia
Date: Thu, 27 Jul 2023 17:22:51 -0400
Subject: [PATCH] Test has a bunch of kind of ok looking lost facts

---
Review notes (this free-form area is ignored by `git am`):
- This copy of the patch had its line breaks and indentation collapsed. They
  were reconstructed from the hunk line counts and the Python block structure.
  Leading whitespace on context lines is inferred -- verify against the target
  tree, or apply with `git apply --ignore-whitespace`.
- tests/integration/lost_facts_test.py still calls breakpoint(), which drops
  into the debugger, and passes os.getenv("PUDL_INPUT") straight to Path(),
  which raises TypeError when the variable is unset. Address both before
  running this test unattended.

 src/ferc_xbrl_extractor/cli.py         |  4 +-
 src/ferc_xbrl_extractor/datapackage.py |  4 +-
 src/ferc_xbrl_extractor/instance.py    |  3 ++
 src/ferc_xbrl_extractor/xbrl.py        | 19 ++++++--
 tests/integration/lost_facts_test.py   | 66 ++++++++++++++++++++++++++
 5 files changed, 88 insertions(+), 8 deletions(-)
 create mode 100644 tests/integration/lost_facts_test.py

diff --git a/src/ferc_xbrl_extractor/cli.py b/src/ferc_xbrl_extractor/cli.py
index 49a8574..e906fe3 100644
--- a/src/ferc_xbrl_extractor/cli.py
+++ b/src/ferc_xbrl_extractor/cli.py
@@ -102,7 +102,7 @@ def instances_from_zip(instance_path: Path) -> list[InstanceBuilder]:
     Args:
         instance_path: Path to zipfile containing XBRL filings.
     """
-    allowable_suffixes = ["xbrl", "xml"]
+    allowable_suffixes = [".xbrl"]  # , ".xml"]
 
     archive = zipfile.ZipFile(instance_path)
 
@@ -112,7 +112,7 @@ def instances_from_zip(instance_path: Path) -> list[InstanceBuilder]:
             io.BytesIO(archive.open(filename).read()), filename.split(".")[0]
         )
         for filename in archive.namelist()
-        if filename.split(".")[1] in allowable_suffixes
+        if Path(filename).suffix in allowable_suffixes
     ]
 
 
diff --git a/src/ferc_xbrl_extractor/datapackage.py b/src/ferc_xbrl_extractor/datapackage.py
index 81e80c1..c6e0faa 100644
--- a/src/ferc_xbrl_extractor/datapackage.py
+++ b/src/ferc_xbrl_extractor/datapackage.py
@@ -360,12 +360,14 @@ def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
 
         # Loop through contexts and get facts in each context
         # Each context corresponds to one unique row
+        fact_ids = set()
         for i, (context, facts) in enumerate(contexts.items()):
             if self.instant != context.period.instant:
                 continue
 
             # Construct dictionary to represent row which corresponds to current context
             row = {fact.name: fact.value for fact in facts if fact.name in df}
+            fact_ids.update({fact.f_id for fact in facts if fact.name in df})
 
             # If row is empty skip
             if row:
@@ -383,7 +385,7 @@ def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
                         )
 
         # Create dataframe and drop empty rows
-        return pd.DataFrame(df).dropna(how="all")
+        return {"df": pd.DataFrame(df).dropna(how="all"), "ids": fact_ids}
 
 
 class Datapackage(BaseModel):
diff --git a/src/ferc_xbrl_extractor/instance.py b/src/ferc_xbrl_extractor/instance.py
index 4540c18..9acf31e 100644
--- a/src/ferc_xbrl_extractor/instance.py
+++ b/src/ferc_xbrl_extractor/instance.py
@@ -193,6 +193,7 @@ class Fact(BaseModel):
 
     name: str
     c_id: str
+    f_id: str
     value: str | None = None
 
     @classmethod
@@ -203,6 +204,7 @@ def from_xml(cls, elem: Element) -> "Fact":
         return cls(
             name=stringcase.snakecase(elem.tag.replace(prefix, "")),  # Strip prefix
             c_id=elem.attrib["contextRef"],
+            f_id=elem.attrib["id"],
             value=elem.text,
         )
 
@@ -257,6 +259,7 @@ def __init__(
         # This is a nested dictionary of dictionaries to locate facts by context
         self.instant_facts: FactDict = {}
         self.duration_facts: FactDict = {}
+        self.contexts = contexts
 
         self.filing_name = filing_name
 
diff --git a/src/ferc_xbrl_extractor/xbrl.py b/src/ferc_xbrl_extractor/xbrl.py
index b181e11..a2b58b6 100644
--- a/src/ferc_xbrl_extractor/xbrl.py
+++ b/src/ferc_xbrl_extractor/xbrl.py
@@ -78,14 +78,17 @@ def extract(
         results = executor.map(process_batches, batched_instances)
 
         # Write extracted data to database
+        ids = {}
         for i, batch in enumerate(results):
             logger.info(f"Finished batch {i + 1}/{num_batches}")
 
             # Loop through tables and write to database
             with engine.begin() as conn:
-                for key, df in batch.items():
+                for key, df in batch["dfs"].items():
                     if not df.empty:
                         df.to_sql(key, conn, if_exists="append")
+            ids.update(batch["ids"])
+    return ids
 
 
 def process_batch(
@@ -106,14 +109,17 @@ def process_batch(
     logger = get_logger(__name__)
 
     dfs: dict[str, pd.DataFrame] = {}
+    ids: dict[str, set] = {}
     for instance in instances:
         # Parse XBRL instance. Log/skip if file is empty
         try:
-            instance_dfs = process_instance(instance, tables)
+            instance_info = process_instance(instance, tables)
         except XMLSyntaxError:
             logger.info(f"XBRL filing {instance.name} is empty. Skipping.")
             continue
 
+        instance_dfs = instance_info["dfs"]
+        ids[instance.name] = instance_info["ids"]
         for key, df in instance_dfs.items():
             if key not in dfs:
                 dfs[key] = []
@@ -122,7 +128,7 @@ def process_batch(
 
     dfs = {key: pd.concat(df_list) for key, df_list in dfs.items()}
 
-    return dfs
+    return {"dfs": dfs, "ids": ids}
 
 
 def process_instance(
@@ -145,10 +151,13 @@ def process_instance(
     logger.info(f"Extracting {instance.filing_name}")
 
     dfs = {}
+    ids = set()
     for key, table in tables.items():
-        dfs[key] = table.construct_dataframe(instance)
+        constructed = table.construct_dataframe(instance)
+        dfs[key] = constructed["df"]
+        ids.update(constructed["ids"])
 
-    return dfs
+    return {"dfs": dfs, "ids": ids}
 
 
 def get_fact_tables(
diff --git a/tests/integration/lost_facts_test.py b/tests/integration/lost_facts_test.py
new file mode 100644
index 0000000..3d134de
--- /dev/null
+++ b/tests/integration/lost_facts_test.py
@@ -0,0 +1,66 @@
+from collections import Counter
+import itertools
+import os
+from pathlib import Path
+from sqlalchemy import create_engine
+
+from ferc_xbrl_extractor.cli import get_instances, TAXONOMY_MAP
+from ferc_xbrl_extractor.xbrl import extract, process_instance
+
+
+def test_lost_fact_finder(tmp_path):
+    instances = get_instances(
+        Path(os.getenv("PUDL_INPUT"))
+        / "ferc1"
+        / "10.5281-zenodo.7314437"
+        / "ferc1-xbrl-2021.zip"
+    )
+
+    used_ids = extract(
+        instances=instances[:1],
+        engine=create_engine("sqlite:///:memory:"),
+        taxonomy=TAXONOMY_MAP[1],
+        form_number=1,
+        metadata_path=Path(tmp_path) / "metadata.json",
+    )
+
+    instance = instances[0].parse()
+    instant_facts = itertools.chain.from_iterable(
+        itertools.chain.from_iterable(
+            context.values() for context in instance.instant_facts.values()
+        )
+    )
+    duration_facts = itertools.chain.from_iterable(
+        itertools.chain.from_iterable(
+            context.values() for context in instance.duration_facts.values()
+        )
+    )
+    all_facts = list(itertools.chain(instant_facts, duration_facts))
+
+    def clean_fact(fact, contexts):
+        return {"name": fact.name, "context": contexts[fact.c_id], "value": fact.value}
+
+    lost_facts = [
+        clean_fact(f, instance.contexts)
+        for f in all_facts
+        if f.f_id not in used_ids[instances[0].name]
+    ]
+
+    lostest_names = Counter(f["name"] for f in lost_facts)
+    breakpoint()
+
+    assert len(lost_facts) / len(all_facts) < 0.1
+    # print(lost_facts_info)
+    """
+    date | entity | sorted_dims | fact_name | value
+    ----------------
+    2021-12-31 | AP | [("dim1", 1), ("dim2", 2)] | Why Don't I have a home? | 17
+    2021-12-31 | AP | [("dim1", 1)] | Why Don't I have a home? | 18
+    ....
+
+    """
+
+    # questions we could ask:
+
+    # most common fact names
+    # most common (fact name, {non-null dimensions})