
Merge pull request #52 from PSLmodels/dataset-update-5
Dataset test additions
nikhilwoodruff authored Apr 23, 2024
2 parents cacf608 + 48d434a commit a81ea8a
Showing 5 changed files with 218 additions and 91 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -6,3 +6,5 @@
tax_microdata_benchmarking/calibration

*.ipynb
tmp/
tax_microdata_benchmarking/examination/taxcalculator/pe23-23*
2 changes: 1 addition & 1 deletion README.md
@@ -8,7 +8,7 @@ development will proceed in several phases.

To install, clone the repository and run `pip install -e .` from the
root directory. To check that the installation was successful, run
`make test` or `pytest .` from the root directory. This requires the POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN environment variable to be set; contact [Nikhil Woodruff](mailto:[email protected]) for the code if you have access to the IRS PUF.
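A minimal sketch of what that precondition amounts to, assuming only the environment variable named above; the script itself is illustrative and not part of the repository:

import os
import subprocess

# Fail fast if the token the README asks for is missing, then run the suite.
if "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN" not in os.environ:
    raise RuntimeError(
        "Set POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN before running the tests."
    )
subprocess.run(["pytest", "."], check=True)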

To assess, review the data examination results that compare federal
agency tax estimates with those generated using the microdata file
24 changes: 19 additions & 5 deletions tax_microdata_benchmarking/create_flat_file.py
@@ -720,6 +720,8 @@ def get_variable_uprating(
str: The uprating factor.
"""

population = system.parameters.calibration.gov.census.populations.total

calibration = system.parameters.calibration
if variable in calibration.gov.irs.soi.children:
parameter = calibration.gov.irs.soi.children[variable]
@@ -728,8 +730,12 @@
source_value = parameter(source_time_period)
target_value = parameter(target_time_period)

population_change = population(target_time_period) / population(
source_time_period
)

uprating_factor = target_value / source_value
return uprating_factor / population_change


def assert_no_duplicate_columns(df):
@@ -780,7 +786,7 @@ def create_stacked_flat_file(
)
stacked_file["PT_binc_w2_wages"] = (
qbi * 0.314  # Solved in 2021 using adjust_qbi.py
)
input_data = tc.Records(data=stacked_file, start_year=target_year)
policy = tc.Policy()
simulation = tc.Calculator(records=input_data, policy=policy)
@@ -797,6 +803,7 @@
try:
from tax_microdata_benchmarking.reweight import reweight

combined_file["s006_original"] = combined_file.s006
combined_file = reweight(
combined_file, time_period=target_year
)
@@ -834,6 +841,12 @@ def summary_analytics(df):
population = system.parameters.calibration.gov.census.populations.total


def get_population_growth(target_year: int, source_year: int):
return population(f"{target_year}-01-01") / population(
f"{source_year}-01-01"
)


def create_all_files():
PRIORITY_YEARS = [2021, 2023, 2026, 2015]
REMAINING_YEARS = [
@@ -845,11 +858,12 @@
if target_year == 2021:
latest_weights = stacked_file.s006
elif target_year > 2021:
population_uprating = get_population_growth(target_year, 2021)
stacked_file["s006_original"] = stacked_file.s006
stacked_file.s006 = latest_weights * population_uprating
print(f"Using 2021 solved weights for {target_year}")
else:
stacked_file["s006_original"] = stacked_file.s006
stacked_file.to_csv(
f"tax_microdata_{target_year}.csv.gz",
index=False,
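Taken together, the changes in this file split growth into a per-record component and a population component: get_variable_uprating now divides the aggregate SOI growth factor by population growth, and create_all_files reuses the 2021 solved weights scaled by get_population_growth for later years. A rough sketch of the arithmetic, using made-up numbers in place of the real calibration parameters:

# Hypothetical aggregates and populations; only the structure mirrors the code above.
def population_growth(pop_target, pop_source):
    return pop_target / pop_source

aggregate_2021 = 1_000.0  # assumed SOI aggregate in the source year
aggregate_2023 = 1_080.0  # assumed SOI aggregate in the target year
pop_2021, pop_2023 = 332e6, 335e6  # assumed total populations

aggregate_uprating = aggregate_2023 / aggregate_2021
per_record_uprating = aggregate_uprating / population_growth(pop_2023, pop_2021)
weight_uprating = population_growth(pop_2023, pop_2021)

# Scaling amounts by per_record_uprating and weights by weight_uprating
# reproduces the aggregate growth without double-counting population change.
assert abs(per_record_uprating * weight_uprating - aggregate_uprating) < 1e-9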
85 changes: 0 additions & 85 deletions tests/test_basic_flat_file.py

This file was deleted.

196 changes: 196 additions & 0 deletions tests/test_flat_file.py
@@ -0,0 +1,196 @@
import os
import pytest
import yaml
from pathlib import Path
import pandas as pd
import subprocess
import warnings

warnings.filterwarnings("ignore")

test_mode = os.environ.get("TEST_MODE", "lite")

FOLDER = Path(__file__).parent
with open(FOLDER / "tc_variable_totals.yaml") as f:
tc_variable_totals = yaml.safe_load(f)

with open(
FOLDER.parent
/ "tax_microdata_benchmarking"
/ "taxcalc_variable_metadata.yaml"
) as f:
taxcalc_variable_metadata = yaml.safe_load(f)

EXEMPTED_VARIABLES = [
"DSI", # Issue here but deprioritized.
"EIC", # PUF-PE file almost certainly more correct by including CPS data
"MIDR", # Issue here but deprioritized.
"RECID", # No reason to compare.
"a_lineno", # No reason to compare.
"agi_bin", # No reason to compare.
"blind_spouse", # Issue here but deprioritized.
"cmbtp", # No reason to compare.
"data_source", # No reason to compare.
"s006", # No reason to compare.
"h_seq", # No reason to compare.
"fips", # No reason to compare.
"ffpos", # No reason to compare.
"p23250", # PE-PUF likely closer to truth than taxdata (needs triple check).
"e01200", # Unknown but deprioritized for now.
"e17500", # Unknown but deprioritized for now.
"e18500", # Unknown but deprioritized for now.
"e02100", # Farm income, unsure who's closer.
]

# Exempt any variable split between filer and spouse for now.
EXEMPTED_VARIABLES += [
variable
for variable in taxcalc_variable_metadata["read"]
if variable.endswith("p") or variable.endswith("s")
]


def pytest_namespace():
    # Legacy hook; modern pytest ignores it, so the build test below shares
    # state by assigning directly to the pytest module instead.
    return {"flat_file": None}


@pytest.mark.dependency()
def test_2021_flat_file_builds():
from tax_microdata_benchmarking.create_flat_file import (
create_stacked_flat_file,
)

flat_file = create_stacked_flat_file(2021, reweight=test_mode == "full")

pytest.flat_file_2021 = flat_file


variables_to_test = [
variable
for variable in tc_variable_totals.keys()
if variable not in EXEMPTED_VARIABLES
]


@pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
@pytest.mark.parametrize("variable", variables_to_test)
def test_2021_tc_variable_totals(variable):
meta = taxcalc_variable_metadata["read"][variable]
name = meta.get("desc")
flat_file = pytest.flat_file_2021
weight = flat_file.s006
total = (flat_file[variable] * weight).sum()
if tc_variable_totals[variable] == 0:
# If the taxdata file has a zero total, we'll assume the PE file is still correct.
return
# Fail only if the total is both more than 45% and more than $30bn away from the taxdata total.
assert (
abs(total / tc_variable_totals[variable] - 1) < 0.45
or abs(total / 1e9 - tc_variable_totals[variable] / 1e9) < 30
), f"{variable} ({name}) differs to tax-data by {total / tc_variable_totals[variable] - 1:.1%} ({total/1e9:.1f}bn vs {tc_variable_totals[variable]/1e9:.1f}bn)"



RUN_TE_TESTS = False


@pytest.mark.skipif(not RUN_TE_TESTS, reason="TE tests are disabled.")
@pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
def test_2023_tax_expenditures():
flat_file_2021 = pytest.flat_file_2021

from tax_microdata_benchmarking.create_flat_file import (
create_stacked_flat_file,
get_population_growth,
)

flat_file_2023 = create_stacked_flat_file(
2023, reweight=test_mode == "full"
)

flat_file_2023.s006 = flat_file_2021.s006 * get_population_growth(
2023, 2021
)

tc_folder = (
FOLDER.parent
/ "tax_microdata_benchmarking"
/ "examination"
/ "taxcalculator"
)

flat_file_2023.to_csv(tc_folder / "pe23.csv.zip")

# Run `bash ./runs.sh pe23 23` inside the taxcalculator folder; it writes pe23-23.res-actual, which is parsed below.

subprocess.run(["./runs.sh", "pe23", "23"], cwd=tc_folder.resolve())

with open(tc_folder / "pe23-23.res-actual") as f:
data = f.read().splitlines()


df = pd.DataFrame(
columns=["Returns", "ExpInc", "IncTax", "PayTax", "LSTax", "AllTax"]
)
for line in data[2::3]:
line = line.split()[1:]
df = df.append(
pd.DataFrame(
[line],
columns=[
"Returns",
"ExpInc",
"IncTax",
"PayTax",
"LSTax",
"AllTax",
],
)
)

df.index = [
"Baseline",
"CGQD",
"CLP",
"CTC",
"EITC",
"NIIT",
"QBID",
"SALT",
"SSBEN",
]
df = df.astype(float)

taxdata_exp_results = [
3976.5,
274.5,
0.0,
125.6,
68.7,
-67.5,
59.5,
13.9,
76.6,
]

for i in range(len(taxdata_exp_results)):
name = df.index[i]
if name in ("QBID", "SALT"):
continue # QBID: PE far closer to truth. SALT: known issue.
rel_error = (
abs(df["AllTax"][i] - taxdata_exp_results[i])
/ taxdata_exp_results[i]
)
if taxdata_exp_results[i] == 0:
rel_error = 0
assert (
rel_error < 0.25
), f"Tax Expenditure for {name} is ${df['AllTax'][i]}bn compared to Tax-Data's ${taxdata_exp_results[i]}bn (relative error {rel_error:.1%})"
