From 139b6218781cc5fad6316a21edb8b41893fb5818 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Thu, 27 Jul 2023 17:22:51 -0400
Subject: [PATCH] Test has a bunch of kind of ok looking lost facts

---
 src/ferc_xbrl_extractor/cli.py         |  4 +-
 src/ferc_xbrl_extractor/datapackage.py |  4 +-
 src/ferc_xbrl_extractor/instance.py    |  3 ++
 src/ferc_xbrl_extractor/xbrl.py        | 19 ++++++--
 tests/integration/lost_facts_test.py   | 66 ++++++++++++++++++++++++++
 5 files changed, 88 insertions(+), 8 deletions(-)
 create mode 100644 tests/integration/lost_facts_test.py

diff --git a/src/ferc_xbrl_extractor/cli.py b/src/ferc_xbrl_extractor/cli.py
index 49a8574..e906fe3 100644
--- a/src/ferc_xbrl_extractor/cli.py
+++ b/src/ferc_xbrl_extractor/cli.py
@@ -102,7 +102,7 @@ def instances_from_zip(instance_path: Path) -> list[InstanceBuilder]:
     Args:
         instance_path: Path to zipfile containing XBRL filings.
     """
-    allowable_suffixes = ["xbrl", "xml"]
+    allowable_suffixes = [".xbrl"]  # , ".xml"]
 
     archive = zipfile.ZipFile(instance_path)
 
@@ -112,7 +112,7 @@ def instances_from_zip(instance_path: Path) -> list[InstanceBuilder]:
             io.BytesIO(archive.open(filename).read()), filename.split(".")[0]
         )
         for filename in archive.namelist()
-        if filename.split(".")[1] in allowable_suffixes
+        if Path(filename).suffix in allowable_suffixes
     ]
 
 
diff --git a/src/ferc_xbrl_extractor/datapackage.py b/src/ferc_xbrl_extractor/datapackage.py
index 81e80c1..c6e0faa 100644
--- a/src/ferc_xbrl_extractor/datapackage.py
+++ b/src/ferc_xbrl_extractor/datapackage.py
@@ -360,12 +360,14 @@ def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
 
         # Loop through contexts and get facts in each context
         # Each context corresponds to one unique row
+        fact_ids = set()
         for i, (context, facts) in enumerate(contexts.items()):
             if self.instant != context.period.instant:
                 continue
 
             # Construct dictionary to represent row which corresponds to current context
             row = {fact.name: fact.value for fact in facts if fact.name in df}
+            fact_ids.update({fact.f_id for fact in facts if fact.name in df})
 
             # If row is empty skip
             if row:
@@ -383,7 +385,7 @@ def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
                         )
 
         # Create dataframe and drop empty rows
-        return pd.DataFrame(df).dropna(how="all")
+        return {"df": pd.DataFrame(df).dropna(how="all"), "ids": fact_ids}
 
 
 class Datapackage(BaseModel):
diff --git a/src/ferc_xbrl_extractor/instance.py b/src/ferc_xbrl_extractor/instance.py
index 4540c18..9acf31e 100644
--- a/src/ferc_xbrl_extractor/instance.py
+++ b/src/ferc_xbrl_extractor/instance.py
@@ -193,6 +193,7 @@ class Fact(BaseModel):
 
     name: str
     c_id: str
+    f_id: str
     value: str | None = None
 
     @classmethod
@@ -203,6 +204,7 @@ def from_xml(cls, elem: Element) -> "Fact":
         return cls(
             name=stringcase.snakecase(elem.tag.replace(prefix, "")),  # Strip prefix
             c_id=elem.attrib["contextRef"],
+            f_id=elem.attrib["id"],
             value=elem.text,
         )
 
@@ -257,6 +259,7 @@ def __init__(
         # This is a nested dictionary of dictionaries to locate facts by context
         self.instant_facts: FactDict = {}
         self.duration_facts: FactDict = {}
+        self.contexts = contexts
 
         self.filing_name = filing_name
 
diff --git a/src/ferc_xbrl_extractor/xbrl.py b/src/ferc_xbrl_extractor/xbrl.py
index b181e11..a2b58b6 100644
--- a/src/ferc_xbrl_extractor/xbrl.py
+++ b/src/ferc_xbrl_extractor/xbrl.py
@@ -78,14 +78,17 @@ def extract(
         results = executor.map(process_batches, batched_instances)
 
         # Write extracted data to database
+        ids = {}
         for i, batch in enumerate(results):
             logger.info(f"Finished batch {i + 1}/{num_batches}")
 
             # Loop through tables and write to database
             with engine.begin() as conn:
-                for key, df in batch.items():
+                for key, df in batch["dfs"].items():
                     if not df.empty:
                         df.to_sql(key, conn, if_exists="append")
+                ids.update(batch["ids"])
+        return ids
 
 
 def process_batch(
@@ -106,14 +109,17 @@ def process_batch(
     logger = get_logger(__name__)
 
     dfs: dict[str, pd.DataFrame] = {}
+    ids: dict[str, set] = {}
     for instance in instances:
         # Parse XBRL instance. Log/skip if file is empty
         try:
-            instance_dfs = process_instance(instance, tables)
+            instance_info = process_instance(instance, tables)
         except XMLSyntaxError:
             logger.info(f"XBRL filing {instance.name} is empty. Skipping.")
             continue
 
+        instance_dfs = instance_info["dfs"]
+        ids[instance.name] = instance_info["ids"]
         for key, df in instance_dfs.items():
             if key not in dfs:
                 dfs[key] = []
@@ -122,7 +128,7 @@ def process_batch(
 
     dfs = {key: pd.concat(df_list) for key, df_list in dfs.items()}
 
-    return dfs
+    return {"dfs": dfs, "ids": ids}
 
 
 def process_instance(
@@ -145,10 +151,13 @@ def process_instance(
     logger.info(f"Extracting {instance.filing_name}")
 
     dfs = {}
+    ids = set()
     for key, table in tables.items():
-        dfs[key] = table.construct_dataframe(instance)
+        constructed = table.construct_dataframe(instance)
+        dfs[key] = constructed["df"]
+        ids.update(constructed["ids"])
 
-    return dfs
+    return {"dfs": dfs, "ids": ids}
 
 
 def get_fact_tables(
diff --git a/tests/integration/lost_facts_test.py b/tests/integration/lost_facts_test.py
new file mode 100644
index 0000000..3d134de
--- /dev/null
+++ b/tests/integration/lost_facts_test.py
@@ -0,0 +1,66 @@
+from collections import Counter
+import itertools
+import os
+from pathlib import Path
+from sqlalchemy import create_engine
+
+from ferc_xbrl_extractor.cli import get_instances, TAXONOMY_MAP
+from ferc_xbrl_extractor.xbrl import extract, process_instance
+
+
+def test_lost_fact_finder(tmp_path):
+    """Check that under 10% of the first filing's facts are lost during extraction.
+
+    A fact counts as lost when its ``f_id`` is not among the IDs that ``extract``
+    reports for the filing. Reads ``ferc1-xbrl-2021.zip`` from under ``$PUDL_INPUT``.
+    """
+    instances = get_instances(
+        Path(os.environ["PUDL_INPUT"])  # KeyError names the variable if unset
+        / "ferc1"
+        / "10.5281-zenodo.7314437"
+        / "ferc1-xbrl-2021.zip"
+    )
+
+    used_ids = extract(
+        instances=instances[:1],
+        engine=create_engine("sqlite:///:memory:"),
+        taxonomy=TAXONOMY_MAP[1],
+        form_number=1,
+        metadata_path=Path(tmp_path) / "metadata.json",
+    )
+
+    instance = instances[0].parse()
+    # Flatten both nested fact mappings (instant first, then duration) into one list.
+    flatten = itertools.chain.from_iterable
+    all_facts = [
+        fact
+        for fact_dict in (instance.instant_facts, instance.duration_facts)
+        for fact in flatten(flatten(ctx.values() for ctx in fact_dict.values()))
+    ]
+    assert all_facts, "parsed filing contains no facts"
+
+    def clean_fact(fact, contexts):
+        return {"name": fact.name, "context": contexts[fact.c_id], "value": fact.value}
+
+    filing_used_ids = used_ids[instances[0].name]
+    lost_facts = [
+        clean_fact(f, instance.contexts)
+        for f in all_facts
+        if f.f_id not in filing_used_ids
+    ]
+
+    # Surface the most frequently lost fact names in the failure message.
+    top_lost = Counter(f["name"] for f in lost_facts).most_common(10)
+    lost_ratio = len(lost_facts) / len(all_facts)
+    assert lost_ratio < 0.1, f"lost {lost_ratio:.1%} of facts; top names: {top_lost}"
+
+    # NOTE(review): a future print(lost_facts_info) report might look like:
+    # date | entity | sorted_dims | fact_name | value
+    # ----------------
+    # 2021-12-31 | AP | [("dim1", 1), ("dim2", 2)] | Why Don't I have a home? | 17
+    # 2021-12-31 | AP | [("dim1", 1)] | Why Don't I have a home? | 18
+    # ....
+
+    # questions we could ask:
+    # most common fact names
+    # most common (fact name, {non-null dimensions})