Skip to content

Commit

Permalink
Test has a bunch of kind of ok looking lost facts
Browse files Browse the repository at this point in the history
  • Loading branch information
jdangerx committed Jul 27, 2023
1 parent bb67741 commit 139b621
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 8 deletions.
4 changes: 2 additions & 2 deletions src/ferc_xbrl_extractor/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def instances_from_zip(instance_path: Path) -> list[InstanceBuilder]:
Args:
instance_path: Path to zipfile containing XBRL filings.
"""
allowable_suffixes = ["xbrl", "xml"]
allowable_suffixes = [".xbrl"] # , ".xml"]

archive = zipfile.ZipFile(instance_path)

Expand All @@ -112,7 +112,7 @@ def instances_from_zip(instance_path: Path) -> list[InstanceBuilder]:
io.BytesIO(archive.open(filename).read()), filename.split(".")[0]
)
for filename in archive.namelist()
if filename.split(".")[1] in allowable_suffixes
if Path(filename).suffix in allowable_suffixes
]


Expand Down
4 changes: 3 additions & 1 deletion src/ferc_xbrl_extractor/datapackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,12 +360,14 @@ def construct_dataframe(self, instance: Instance) -> pd.DataFrame:

# Loop through contexts and get facts in each context
# Each context corresponds to one unique row
fact_ids = set()
for i, (context, facts) in enumerate(contexts.items()):
if self.instant != context.period.instant:
continue

# Construct dictionary to represent row which corresponds to current context
row = {fact.name: fact.value for fact in facts if fact.name in df}
fact_ids.update({fact.f_id for fact in facts if fact.name in df})

# If row is empty skip
if row:
Expand All @@ -383,7 +385,7 @@ def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
)

# Create dataframe and drop empty rows
return pd.DataFrame(df).dropna(how="all")
return {"df": pd.DataFrame(df).dropna(how="all"), "ids": fact_ids}


class Datapackage(BaseModel):
Expand Down
3 changes: 3 additions & 0 deletions src/ferc_xbrl_extractor/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ class Fact(BaseModel):

name: str
c_id: str
f_id: str
value: str | None = None

@classmethod
Expand All @@ -203,6 +204,7 @@ def from_xml(cls, elem: Element) -> "Fact":
return cls(
name=stringcase.snakecase(elem.tag.replace(prefix, "")), # Strip prefix
c_id=elem.attrib["contextRef"],
f_id=elem.attrib["id"],
value=elem.text,
)

Expand Down Expand Up @@ -257,6 +259,7 @@ def __init__(
# This is a nested dictionary of dictionaries to locate facts by context
self.instant_facts: FactDict = {}
self.duration_facts: FactDict = {}
self.contexts = contexts

self.filing_name = filing_name

Expand Down
19 changes: 14 additions & 5 deletions src/ferc_xbrl_extractor/xbrl.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,17 @@ def extract(
results = executor.map(process_batches, batched_instances)

# Write extracted data to database
ids = {}
for i, batch in enumerate(results):
logger.info(f"Finished batch {i + 1}/{num_batches}")

# Loop through tables and write to database
with engine.begin() as conn:
for key, df in batch.items():
for key, df in batch["dfs"].items():
if not df.empty:
df.to_sql(key, conn, if_exists="append")
ids.update(batch["ids"])
return ids


def process_batch(
Expand All @@ -106,14 +109,17 @@ def process_batch(
logger = get_logger(__name__)

dfs: dict[str, pd.DataFrame] = {}
ids: dict[str, set] = {}
for instance in instances:
# Parse XBRL instance. Log/skip if file is empty
try:
instance_dfs = process_instance(instance, tables)
instance_info = process_instance(instance, tables)
except XMLSyntaxError:
logger.info(f"XBRL filing {instance.name} is empty. Skipping.")
continue

instance_dfs = instance_info["dfs"]
ids[instance.name] = instance_info["ids"]
for key, df in instance_dfs.items():
if key not in dfs:
dfs[key] = []
Expand All @@ -122,7 +128,7 @@ def process_batch(

dfs = {key: pd.concat(df_list) for key, df_list in dfs.items()}

return dfs
return {"dfs": dfs, "ids": ids}


def process_instance(
Expand All @@ -145,10 +151,13 @@ def process_instance(
logger.info(f"Extracting {instance.filing_name}")

dfs = {}
ids = set()
for key, table in tables.items():
dfs[key] = table.construct_dataframe(instance)
constructed = table.construct_dataframe(instance)
dfs[key] = constructed["df"]
ids.update(constructed["ids"])

return dfs
return {"dfs": dfs, "ids": ids}


def get_fact_tables(
Expand Down
66 changes: 66 additions & 0 deletions tests/integration/lost_facts_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from collections import Counter
import itertools
import os
from pathlib import Path
from sqlalchemy import create_engine

from ferc_xbrl_extractor.cli import get_instances, TAXONOMY_MAP
from ferc_xbrl_extractor.xbrl import extract, process_instance


def test_lost_fact_finder(tmp_path):
    """Check that extraction does not silently drop too many facts.

    Runs ``extract`` on the first filing of the FERC Form 1 2021 XBRL archive,
    collects the IDs of every fact that ``extract`` reports as used, and
    compares them with all facts found by parsing the same filing directly.
    The test fails when 10% or more of the parsed facts were not used.

    Args:
        tmp_path: Temporary directory in which the metadata JSON is written.

    Raises:
        RuntimeError: If the ``PUDL_INPUT`` environment variable is not set.
    """
    pudl_input = os.getenv("PUDL_INPUT")
    if pudl_input is None:
        # Path(None) would raise an opaque TypeError; fail with a clear reason.
        raise RuntimeError(
            "The PUDL_INPUT environment variable must be set to run this test."
        )

    instances = get_instances(
        Path(pudl_input) / "ferc1" / "10.5281-zenodo.7314437" / "ferc1-xbrl-2021.zip"
    )
    instance_builder = instances[0]

    # extract() returns a mapping of instance name -> set of used fact IDs.
    used_ids = extract(
        instances=instances[:1],
        engine=create_engine("sqlite:///:memory:"),
        taxonomy=TAXONOMY_MAP[1],
        form_number=1,
        metadata_path=Path(tmp_path) / "metadata.json",
    )
    used_fact_ids = used_ids[instance_builder.name]

    instance = instance_builder.parse()

    def flatten(fact_dict):
        """Yield every fact stored in a two-level dictionary of fact lists."""
        return itertools.chain.from_iterable(
            itertools.chain.from_iterable(
                context.values() for context in fact_dict.values()
            )
        )

    all_facts = list(
        itertools.chain(
            flatten(instance.instant_facts), flatten(instance.duration_facts)
        )
    )
    # Guard the ratio below against a division by zero.
    assert all_facts, "No facts were parsed from the filing."

    def clean_fact(fact, contexts):
        """Reduce a fact to its name, resolved context, and value."""
        return {"name": fact.name, "context": contexts[fact.c_id], "value": fact.value}

    lost_facts = [
        clean_fact(fact, instance.contexts)
        for fact in all_facts
        if fact.f_id not in used_fact_ids
    ]

    # Surface the most frequently lost fact names in the failure message
    # instead of relying on an interactive debugger session.
    lostest_names = Counter(fact["name"] for fact in lost_facts)

    assert len(lost_facts) / len(all_facts) < 0.1, (
        f"{len(lost_facts)} of {len(all_facts)} facts were lost; "
        f"most common lost fact names: {lostest_names.most_common(10)}"
    )

    # Sketch of a per-fact report of lost facts that could be printed here:
    #
    # date | entity | sorted_dims | fact_name | value
    # ----------------
    # 2021-12-31 | AP | [("dim1", 1), ("dim2", 2)] | Why Don't I have a home? | 17
    # 2021-12-31 | AP | [("dim1", 1)] | Why Don't I have a home? | 18
    # ....
    #
    # questions we could ask:
    #
    # most common fact names
    # most common (fact name, {non-null dimensions})

0 comments on commit 139b621

Please sign in to comment.