diff --git a/examples/lost_fact_exploration.ipynb b/examples/lost_fact_exploration.ipynb new file mode 100644 index 0000000..7a444b1 --- /dev/null +++ b/examples/lost_fact_exploration.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a6e96483-25c6-488c-a423-558a0f10cfe1", + "metadata": {}, + "outputs": [], + "source": [ + "from ferc_xbrl_extractor.cli import TAXONOMY_MAP\n", + "import pandas as pd\n", + "from ferc_xbrl_extractor.xbrl import get_fact_tables\n", + "from pathlib import Path\n", + "from stringcase import snakecase\n", + "from collections import Counter\n", + "import itertools" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e6d2c82a-457f-4d39-b749-e1e5b9f13bdc", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle(\"../lost_facts.pickle\")\n", + "\n", + "tmp_path=\"./\"\n", + "tables = get_fact_tables(\n", + " taxonomy_path=TAXONOMY_MAP[1],\n", + " form_number=1,\n", + " db_path=\"path\",\n", + " metadata_path=Path(tmp_path) / \"metadata.json\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2688defb-09c0-40cb-968c-7bd5c981ec4f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'C000537': 3990,\n", + " 'C000746': 3150,\n", + " 'C000532': 2574,\n", + " 'C007667': 2550,\n", + " 'C000530': 2460,\n", + " 'C000041': 2350,\n", + " 'C000536': 2160,\n", + " 'C001009': 2106,\n", + " 'C002196': 1908,\n", + " 'C001555': 1885,\n", + " 'C000620': 1692,\n", + " 'C004995': 1680,\n", + " 'C008999': 1664,\n", + " 'C000535': 1583,\n", + " 'C000533': 1545,\n", + " 'C001016': 1346,\n", + " 'C000538': 1304,\n", + " 'C003184': 1216,\n", + " 'C000135': 1135,\n", + " 'C001646': 1118,\n", + " 'C000136': 1098,\n", + " 'C000534': 1076,\n", + " 'C001789': 1064,\n", + " 'C000913': 1023,\n", + " 'C007565': 994,\n", + " 'C001436': 962,\n", + " 'C000744': 957,\n", + " 'C000317': 952,\n", + " 'C001132': 918,\n", + " 'C000379': 880,\n", + " 'C000825': 848,\n", + " 'C001218': 847,\n", + " 'C000851': 837,\n", + " 'C000318': 827,\n", + " 'C001257': 813,\n", + " 'C000312': 809,\n", + " 'C000905': 809,\n", + " 'C008998': 780,\n", + " 'C000316': 764,\n", + " 'C000522': 756,\n", + " 'C000314': 745,\n", + " 'C000315': 745,\n", + " 'C001465': 742,\n", + " 'C000313': 741,\n", + " 'C000685': 740,\n", + " 'C000292': 724,\n", + " 'C000319': 712,\n", + " 'C007679': 684,\n", + " 'C001306': 674,\n", + " 'C001554': 672,\n", + " 'C001188': 663,\n", + " 'C000852': 661,\n", + " 'C001017': 649,\n", + " 'C000291': 628,\n", + " 'C000524': 622,\n", + " 'C001111': 621,\n", + " 'C001025': 620,\n", + " 'C011285': 620,\n", + " 'C000196': 616,\n", + " 'C002101': 615,\n", + " 'C002525': 614,\n", + " 'C001030': 611,\n", + " 'C002446': 610,\n", + " 'C002012': 608,\n", + " 'C005475': 605,\n", + " 'C002045': 597,\n", + " 'C002089': 594,\n", + " 'C003138': 587,\n", + " 'C004936': 561,\n", + " 'C001464': 559,\n", + " 'C001330': 558,\n", + " 'C000862': 552,\n", + " 'C001673': 547,\n", + " 'C000199': 545,\n", + " 'C001552': 540,\n", + " 'C000911': 533,\n", + " 'C001466': 533,\n", + " 'C001702': 530,\n", + " 'C000527': 528,\n", + " 'C000465': 520,\n", + " 'C000134': 514,\n", + " 'C000824': 513,\n", + " 'C002827': 507,\n", + " 'C000906': 506,\n", + " 'C000290': 499,\n", + " 'C000523': 494,\n", + " 'C000191': 471,\n", + " 'C001553': 469,\n", + " 'C001143': 467,\n", + " 'C001655': 459,\n", + " 'C001316': 449,\n", + " 'C000525': 446,\n", + " 'C001559': 434,\n", + " 'C001130': 432,\n", + " 'C003849': 428,\n", + " 'C001194': 428,\n", + " 'C001252': 422,\n", + " 'C000823': 419,\n", + " 'C000822': 417,\n", + " 'C000500': 417,\n", + " 'C001181': 413,\n", + " 'C000415': 397,\n", + " 'C000241': 391,\n", + " 'C003483': 390,\n", + " 'C001182': 387,\n", + " 'C000388': 381,\n", + " 'C000615': 379,\n", + " 'C001230': 373,\n", + " 'C000772': 369,\n", + " 'C004872': 368,\n", + " 'C000447': 366,\n", + " 'C001775': 357,\n", + " 'C001745': 357,\n", + " 'C000507': 354,\n", + " 'C002308': 350,\n", + " 'C001298': 349,\n", + " 'C001184': 349,\n", + " 'C000171': 348,\n", + " 'C000201': 348,\n", + " 'C001610': 342,\n", + " 'C000618': 341,\n", + " 'C000617': 340,\n", + " 'C000553': 332,\n", + " 'C000691': 332,\n", + " 'C003194': 324,\n", + " 'C004044': 324,\n", + " 'C000555': 323,\n", + " 'C000542': 322,\n", + " 'C001607': 319,\n", + " 'C001609': 319,\n", + " 'C000289': 315,\n", + " 'C000692': 315,\n", + " 'C000602': 311,\n", + " 'C000120': 310,\n", + " 'C001288': 306,\n", + " 'C002498': 303,\n", + " 'C001486': 303,\n", + " 'C001421': 299,\n", + " 'C011163': 298,\n", + " 'C000526': 296,\n", + " 'C001221': 294,\n", + " 'C003646': 288,\n", + " 'C009068': 287,\n", + " 'C001187': 282,\n", + " 'C000116': 281,\n", + " 'C007581': 278,\n", + " 'C001315': 278,\n", + " 'C001322': 277,\n", + " 'C005443': 269,\n", + " 'C001696': 264,\n", + " 'C001308': 264,\n", + " 'C001309': 263,\n", + " 'C001305': 263,\n", + " 'C001222': 260,\n", + " 'C003836': 258,\n", + " 'C010474': 254,\n", + " 'C000045': 244,\n", + " 'C001153': 243,\n", + " 'C001346': 243,\n", + " 'C007582': 242,\n", + " 'C002115': 242,\n", + " 'C005067': 240,\n", + " 'C011302': 240,\n", + " 'C001656': 237,\n", + " 'C005059': 236,\n", + " 'C000030': 235,\n", + " 'C000509': 235,\n", + " 'C011150': 226,\n", + " 'C001454': 220,\n", + " 'C000622': 220,\n", + " 'C011301': 217,\n", + " 'C001183': 201,\n", + " 'C001654': 201,\n", + " 'C001307': 199,\n", + " 'C007584': 193,\n", + " 'C005519': 191,\n", + " 'C003713': 188,\n", + " 'C002116': 185,\n", + " 'C010464': 181,\n", + " 'C000616': 181,\n", + " 'C002335': 179,\n", + " 'C000501': 178,\n", + " 'C001344': 174,\n", + " 'C011423': 172,\n", + " 'C001731': 171,\n", + " 'C010432': 170,\n", + " 'C001446': 168,\n", + " 'C000502': 166,\n", + " 'C001675': 166,\n", + " 'C005423': 161,\n", + " 'C002083': 160,\n", + " 'C004881': 160,\n", + " 'C011100': 159,\n", + " 'C007624': 155,\n", + " 'C011304': 153,\n", + " 'C005424': 152,\n", + " 'C003435': 139,\n", + " 'C001245': 139,\n", + " 'C000200': 138,\n", + " 'C010845': 132,\n", + " 'C000945': 132,\n", + " 'C001444': 129,\n", + " 'C005444': 125,\n", + " 'C010523': 124,\n", + " 'C010388': 114,\n", + " 'C002336': 111,\n", + " 'C002073': 107,\n", + " 'C000367': 102,\n", + " 'C010151': 96,\n", + " 'C003988': 89,\n", + " 'C000771': 85,\n", + " 'C008947': 81,\n", + " 'C002854': 78,\n", + " 'C010473': 77,\n", + " 'C000029': 76,\n", + " 'C000038': 65,\n", + " 'C010446': 43})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Counter(df.entity)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d4d559ca-8af1-4ecc-b6ca-860961e97000", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Percent lost facts missing utl type axis: 0.8633052612211787\n" + ] + } + ], + "source": [ + "idx=df.table_candidates.apply(lambda x: any([\"utility_type_axis\" in tables[name].axes for name in x]))\n", + "missing_utl_type = df[idx]\n", + "\n", + "print(f\"Percent lost facts missing utl type axis: {len(missing_utl_type)/len(df)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a9b8e452-2e6c-41a0-a35a-c4b246e004a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "119573\n" + ] + } + ], + "source": [ + "print(len(df))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "271aef59-eb11-44a0-8653-baafa99a4888", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_30485/4286940243.py:10: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " missing_utl_type[\"refined_candidates\"] = refined_candidates\n" + ] + } + ], + "source": [ + "# Look for facts that are only missing the utility type axis and limit to those tables\n", + "refined_candidates = []\n", + "for idx, row in missing_utl_type.iterrows():\n", + " dims = set([snakecase(name) for name, _ in row.dimensions])\n", + " refined_candidates.append([\n", + " name for name in row.table_candidates\n", + " if (set(tables[name].axes) - dims) == set([\"utility_type_axis\"])\n", + " and set(tables[name].axes) > dims\n", + " ])\n", + "\n", + "missing_utl_type[\"refined_candidates\"] = refined_candidates" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9ca6bca8-45b2-40f3-9cc5-c9bab94328e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8029739155160446" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(missing_utl_type.refined_candidates.apply(lambda x: len(x) == 1)) / len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0e9de8ed-ddd8-46f7-9285-91528ca5fb01", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'taxes_accrued_prepaid_and_charged_during_year_262_duration': 28809,\n", + " 'taxes_accrued_prepaid_and_charged_during_year_262_instant': 22362,\n", + " 'statement_of_income_114_duration': 21819,\n", + " 'taxes_accrued_prepaid_and_charged_during_year_totals_262_duration': 10054,\n", + " 'taxes_accrued_prepaid_and_charged_during_year_totals_262_instant': 9520,\n", + " 'summary_of_utility_plant_and_accumulated_provisions_for_depreciation_amortization_and_depletion_200_instant': 2227,\n", + " 'accumulated_deferred_income_taxes_other_property_account_282_classified_by_business_activities_274_duration': 968,\n", + " 'accumulated_deferred_income_taxes_accelerated_amortization_property_account_281_classified_by_utility_types_272_duration': 155,\n", + " 'accumulated_deferred_investment_tax_credits_account_255_total_266_duration': 100})" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Counter(itertools.chain.from_iterable(missing_utl_type.refined_candidates))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "cce7a69a-bd67-4e85-8b95-6dcc526f356f", + "metadata": {}, + "outputs": [], + "source": [ + "plant_in_srvce = missing_utl_type[missing_utl_type.refined_candidates.apply(lambda x: 'summary_of_utility_plant_and_accumulated_provisions_for_depreciation_amortization_and_depletion_200_instant' in x)]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a69e56fc-84d0-4fb9-aa20-05236d28d732", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['utility_plant_in_service_classified',\n", + " 'utility_plant_in_service_property_under_capital_leases',\n", + " 'utility_plant_in_service_completed_construction_not_classified',\n", + " 'utility_plant_in_service_classified_and_unclassified',\n", + " 'utility_plant_held_for_future_use',\n", + " 'utility_plant_acquisition_adjustment',\n", + " 'depreciation_utility_plant_in_service',\n", + " 'amortization_of_other_utility_plant_utility_plant_in_service',\n", + " 'depreciation_amortization_and_depletion_utility_plant_in_service',\n", + " 'depreciation_utility_plant_held_for_future_use',\n", + " 'depreciation_and_amortization_utility_plant_held_for_future_use',\n", + " 'amortization_of_plant_acquisition_adjustment',\n", + " 'utility_plant_leased_to_others',\n", + " 'depreciation_utility_plant_leased_to_others',\n", + " 'depreciation_amortization_and_depletion_utility_plant_leased_to_others',\n", + " 'utility_plant_in_service_plant_purchased_or_sold',\n", + " 'utility_plant_in_service_experimental_plant_unclassified',\n", + " 'amortization_and_depletion_of_producing_natural_gas_land_and_land_rightsutility_plant_in_service',\n", + " 'amortization_of_underground_storage_land_and_land_rightsutility_plant_in_service',\n", + " 'amortization_and_depletion_utility_plant_leased_to_others',\n", + " 'amortization_utility_plant_held_for_future_use',\n", + " 'abandonment_of_leases'], dtype=object)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "plant_in_srvce.name.unique()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/ferc_xbrl_extractor/instance.py b/src/ferc_xbrl_extractor/instance.py index 9acf31e..9a67564 100644 --- a/src/ferc_xbrl_extractor/instance.py +++ b/src/ferc_xbrl_extractor/instance.py @@ -193,7 +193,7 @@ class Fact(BaseModel): name: str c_id: str - f_id: str + f_id: str | None = None value: str | None = None @classmethod @@ -204,7 +204,7 @@ def from_xml(cls, elem: Element) -> "Fact": return cls( name=stringcase.snakecase(elem.tag.replace(prefix, "")), # Strip prefix c_id=elem.attrib["contextRef"], - f_id=elem.attrib["id"], + f_id=elem.attrib.get("id"), value=elem.text, ) diff --git a/tests/integration/lost_facts_test.py b/tests/integration/lost_facts_test.py index 0e5640c..bbdfe38 100644 --- a/tests/integration/lost_facts_test.py +++ b/tests/integration/lost_facts_test.py @@ -3,10 +3,11 @@ from collections import Counter from pathlib import Path +import pandas as pd from sqlalchemy import create_engine from ferc_xbrl_extractor.cli import TAXONOMY_MAP, get_instances -from ferc_xbrl_extractor.xbrl import extract, process_instance +from ferc_xbrl_extractor.xbrl import extract, get_fact_tables, process_instance def test_lost_fact_finder(tmp_path): @@ -18,36 +19,83 @@ def test_lost_fact_finder(tmp_path): ) used_ids = extract( - instances=instances[:1], + instances=instances, engine=create_engine("sqlite:///:memory:"), taxonomy=TAXONOMY_MAP[1], form_number=1, + batch_size=50, metadata_path=Path(tmp_path) / "metadata.json", ) - - instance = instances[0].parse() - instant_facts = itertools.chain.from_iterable( - itertools.chain.from_iterable( - context.values() for context in instance.instant_facts.values() - ) - ) - duration_facts = itertools.chain.from_iterable( - itertools.chain.from_iterable( - context.values() for context in instance.duration_facts.values() - ) + tables = get_fact_tables( + taxonomy_path=TAXONOMY_MAP[1], + form_number=1, + db_path="path", + metadata_path=Path(tmp_path) / "metadata.json", ) - all_facts = list(itertools.chain(instant_facts, duration_facts)) + + lost_facts = [] def clean_fact(fact, contexts): - return {"name": fact.name, "context": contexts[fact.c_id], "value": fact.value} + return { + "name": fact.name, + "context": contexts[fact.c_id], + "value": fact.value, + "instant": contexts[fact.c_id].period.instant, + } - lost_facts = [ - clean_fact(f, instance.contexts) - for f in all_facts - if f.f_id not in used_ids[instances[0].name] - ] + num_all_facts = 0 + for instance_builder in instances: + if len(used_ids[instance_builder.name]) < 10: + print(f"Skipping: {instance_builder.name}") + continue + + instance = instance_builder.parse() + instant_facts = itertools.chain.from_iterable( + itertools.chain.from_iterable( + context.values() for context in instance.instant_facts.values() + ) + ) + duration_facts = itertools.chain.from_iterable( + itertools.chain.from_iterable( + context.values() for context in instance.duration_facts.values() + ) + ) + all_facts = list(itertools.chain(instant_facts, duration_facts)) + num_all_facts += len(all_facts) + + lost_facts += [ + clean_fact(f, instance.contexts) + for f in all_facts + if f.f_id not in used_ids[instance_builder.name] + ] lostest_names = Counter(f["name"] for f in lost_facts) + + rows = [ + { + "name": fact["name"], + "filing": instance_builder.name, + "entity": fact["context"].entity.identifier, + "start_date": fact["context"].period.start_date, + "end_date": fact["context"].period.end_date, + "dimensions": [ + (dim.name, dim.value) for dim in fact["context"].entity.dimensions + ], + "value": fact["value"], + "table_candidates": [ + key + for key, table in tables.items() + if (fact["name"] in table.columns) + and (fact["instant"] == table.instant) + ], + "period": "instant" if fact["instant"] else "duration", + } + for fact in lost_facts + ] + df = pd.DataFrame(rows) + print(f"Of {num_all_facts} facts, {len(df) / num_all_facts}% were lost") + df.drop(df[df["name"] == "OrderNumber"].index, inplace=True) + df.to_pickle("lost_facts.pickle") breakpoint() assert len(lost_facts) / len(all_facts) < 0.1