Skip to content

Commit

Permalink
Merge pull request #264 from PSLmodels/weights-precision
Browse files Browse the repository at this point in the history
Write Tax-Calculator weights files with increased precision
  • Loading branch information
martinholmer authored Oct 28, 2024
2 parents 5f7e781 + 59c9f5c commit ba065bd
Show file tree
Hide file tree
Showing 16 changed files with 86 additions and 124 deletions.
14 changes: 7 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,29 @@ tmd/storage/output/tmd.csv.gz: \
tmd/create_taxcalc_input_variables.py
python tmd/create_taxcalc_input_variables.py

tmd/storage/output/tmd_growfactors.csv: \
tmd/storage/input/puf_growfactors.csv \
tmd/create_taxcalc_growth_factors.py
python tmd/create_taxcalc_growth_factors.py

tmd/storage/output/tmd_weights.csv.gz: \
tmd/storage/input/cbo_population_forecast.yaml \
tmd/storage/output/tmd.csv.gz \
tmd/create_taxcalc_sampling_weights.py
python tmd/create_taxcalc_sampling_weights.py

tmd/storage/output/tmd_growfactors.csv: \
tmd/storage/input/puf_growfactors.csv \
tmd/create_taxcalc_growth_factors.py
python tmd/create_taxcalc_growth_factors.py

tmd/storage/output/cached_files: \
tmd/storage/output/tmd.csv.gz \
tmd/storage/output/tmd_growfactors.csv \
tmd/storage/output/tmd_weights.csv.gz \
tmd/storage/output/tmd_growfactors.csv \
tmd/storage/__init__.py \
tmd/create_taxcalc_cached_files.py
python tmd/create_taxcalc_cached_files.py

.PHONY=tmd_files
tmd_files: tmd/storage/output/tmd.csv.gz \
tmd/storage/output/tmd_growfactors.csv \
tmd/storage/output/tmd_weights.csv.gz \
tmd/storage/output/tmd_growfactors.csv \
tmd/storage/output/cached_files

.PHONY=test
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

setup(
name="tmd",
version="0.4.0",
version="0.5.0",
packages=find_packages(),
python_requires=">=3.10,<3.13",
install_requires=[
"policyengine_us==1.55.0",
"tables", # required by policyengine_us
"taxcalc>=4.3.0",
"taxcalc>=4.3.1",
"scikit-learn",
"torch",
"tensorboard",
Expand Down
1 change: 0 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from pathlib import Path
import yaml
import pytest
import numpy as np
import pandas as pd
Expand Down
2 changes: 1 addition & 1 deletion tests/expected_tax_expenditures
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ YR,KIND,EST= 2023 cgqd_tax_preference 220.9
YR,KIND,EST= 2023 qbid 58.0
YR,KIND,EST= 2023 salt 21.2
YR,KIND,EST= 2026 paytax 1851.4
YR,KIND,EST= 2026 iitax 3161.2
YR,KIND,EST= 2026 iitax 3160.9
YR,KIND,EST= 2026 ctc 44.7
YR,KIND,EST= 2026 eitc 77.8
YR,KIND,EST= 2026 social_security_partial_taxability 47.7
Expand Down
50 changes: 22 additions & 28 deletions tests/test_area_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,46 +18,40 @@ def test_area_xx(tests_folder):
and compare actual Tax-Calculator results with expected results when
using area weights along with national input data and growfactors.
"""
rc = create_area_weights_file("xx", write_log=True, write_file=True)
rc = create_area_weights_file("xx", write_log=False, write_file=True)
assert rc == 0, "create_areas_weights_file has non-zero return code"
# compare actual vs expected results for faux area xx
# ... instantiate Tax-Calculator object for {area}
idpath = STORAGE_FOLDER / "output" / "tmd.csv.gz"
gfpath = STORAGE_FOLDER / "output" / "tmd_growfactors.csv"
wtpath = AREAS_FOLDER / "weights" / "xx_tmd_weights.csv.gz"
input_data = tc.Records(
data=pd.read_csv(idpath),
start_year=YEAR,
gfactors=tc.GrowFactors(growfactors_filename=str(gfpath)),
weights=str(wtpath),
adjust_ratios=None,
# ... instantiate Tax-Calculator object for area
rec = tc.Records.tmd_constructor(
data_path=(STORAGE_FOLDER / "output" / "tmd.csv.gz"),
weights_path=(AREAS_FOLDER / "weights" / "xx_tmd_weights.csv.gz"),
growfactors_path=(STORAGE_FOLDER / "output" / "tmd_growfactors.csv"),
exact_calculations=True,
)
sim = tc.Calculator(records=input_data, policy=tc.Policy())
sim = tc.Calculator(records=rec, policy=tc.Policy())
# ... calculate tax variables for YEAR
sim.advance_to_year(YEAR)
sim.calc_all()
vdf = sim.dataframe([], all_vars=True)
# ... calculate actual results and store in act dictionary
puf = vdf.data_source == 1
wght = vdf.s006 * puf
wght = vdf.s006 * (vdf.data_source == 1) # PUF weights
act = {
"popall": (vdf.s006 * vdf.XTOT).sum() * 1e-6,
"e00300": (wght * vdf.e00300[puf]).sum() * 1e-9,
"e00900": (wght * vdf.e00900[puf]).sum() * 1e-9,
"e00200": (wght * vdf.e00200[puf]).sum() * 1e-9,
"e02000": (wght * vdf.e02000[puf]).sum() * 1e-9,
"e02400": (wght * vdf.e02400[puf]).sum() * 1e-9,
"c00100": (wght * vdf.c00100[puf]).sum() * 1e-9,
"e00300": (wght * vdf.e00300).sum() * 1e-9,
"e00900": (wght * vdf.e00900).sum() * 1e-9,
"e00200": (wght * vdf.e00200).sum() * 1e-9,
"e02000": (wght * vdf.e02000).sum() * 1e-9,
"e02400": (wght * vdf.e02400).sum() * 1e-9,
"c00100": (wght * vdf.c00100).sum() * 1e-9,
"agihic": (wght * (vdf.c00100 >= 1e6)).sum() * 1e-3,
"e00400": (wght * vdf.e00400[puf]).sum() * 1e-9,
"e00600": (wght * vdf.e00600[puf]).sum() * 1e-9,
"e00650": (wght * vdf.e00650[puf]).sum() * 1e-9,
"e01700": (wght * vdf.e01700[puf]).sum() * 1e-9,
"e02300": (wght * vdf.e02300[puf]).sum() * 1e-9,
"e17500": (wght * vdf.e17500[puf]).sum() * 1e-9,
"e18400": (wght * vdf.e18400[puf]).sum() * 1e-9,
"e18500": (wght * vdf.e18500[puf]).sum() * 1e-9,
"e00400": (wght * vdf.e00400).sum() * 1e-9,
"e00600": (wght * vdf.e00600).sum() * 1e-9,
"e00650": (wght * vdf.e00650).sum() * 1e-9,
"e01700": (wght * vdf.e01700).sum() * 1e-9,
"e02300": (wght * vdf.e02300).sum() * 1e-9,
"e17500": (wght * vdf.e17500).sum() * 1e-9,
"e18400": (wght * vdf.e18400).sum() * 1e-9,
"e18500": (wght * vdf.e18500).sum() * 1e-9,
}
# ... read expected results into exp dictionary
exp_path = tests_folder / "test_area_weights_expect.yaml"
Expand Down
12 changes: 11 additions & 1 deletion tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def compare(name, act, exp, tol):
compare("wght_sum", wght.sum(), 184e6, 0.01)
hiagi = agi >= 1e6
compare("wght_sum_hiagi", (wght * hiagi).sum(), 0.875e6, 0.01)
compare("wght_itax_sum", (wght * itax).sum(), 1591e9, 0.01)
compare("wght_itax_sum", (wght * itax).sum(), 1595e9, 0.01)
compare("wght_itax_sum_hiagi", ((wght * itax) * hiagi).sum(), 902e9, 0.01)
# count weighted number of tax units with zero agi by filing status
agi0 = agi == 0
Expand All @@ -65,3 +65,13 @@ def compare(name, act, exp, tol):
compare("wght_sum_agi0_fs2", (wght * mars2 * agi0).sum(), 2.00e6, 0.01)
mars4 = mars == 4
compare("wght_sum_agi0_fs4", (wght * mars4 * agi0).sum(), 1.53e6, 0.01)
# count weighted number of PUF tax units with zero agi by filing status
puf = sim.array("data_source") == 1
pwght = puf * wght
compare("Pwght_sum_agi0_fs0", (pwght * agi0).sum(), 0.846e6, 0.01)
mars1 = mars == 1
compare("Pwght_sum_agi0_fs1", (pwght * mars1 * agi0).sum(), 0.616e6, 0.01)
mars2 = mars == 2
compare("Pwght_sum_agi0_fs2", (pwght * mars2 * agi0).sum(), 0.136e6, 0.01)
mars4 = mars == 4
compare("Pwght_sum_agi0_fs4", (pwght * mars4 * agi0).sum(), 0.0628e6, 0.01)
7 changes: 3 additions & 4 deletions tests/test_tax_expenditures.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
against expected tax expenditure values in the tests folder.
"""

import os
import difflib
import pytest
import numpy as np
Expand Down Expand Up @@ -51,15 +50,15 @@ def test_tax_exp_diffs(
# ... compare all other rows using a smaller relative diff tolerance
actval = actdf.iloc[:, 3].to_numpy(dtype=np.float64)
expval = expdf.iloc[:, 3].to_numpy(dtype=np.float64)
reltol = float(os.getenv("TMD_TAXEXP_DIFF_RTOL", default=0.011))
reltol = 0.011
if not np.allclose(actval, expval, atol=0.0, rtol=reltol):
same = False
if same:
return
# if same is False
with open(act_path, "r") as actfile:
with open(act_path, "r", encoding="utf-8") as actfile:
act = actfile.readlines()
with open(exp_path, "r") as expfile:
with open(exp_path, "r", encoding="utf-8") as expfile:
exp = expfile.readlines()
diffs = list(
difflib.context_diff(act, exp, fromfile="actual", tofile="expect", n=0)
Expand Down
17 changes: 12 additions & 5 deletions tests/test_tax_revenue.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
LAST_CYR = 2033

DEFAULT_RELTOL_ITAX = 0.10
RELTOL_ITAX = {}
RELTOL_ITAX = {
2022: 0.13,
}
DEFAULT_RELTOL_PTAX = 0.02
RELTOL_PTAX = {
2021: 0.05,
Expand All @@ -28,9 +30,11 @@ def test_tax_revenue(
tests_folder, tmd_variables, tmd_weights_path, tmd_growfactors_path
):
# read expected fiscal year revenues and convert to calendar year revenues
with open(tests_folder / "expected_itax_revenue.yaml") as f:
epath = tests_folder / "expected_itax_revenue.yaml"
with open(epath, "r", encoding="utf-8") as f:
fy_itax = yaml.safe_load(f)
with open(tests_folder / "expected_ptax_revenue.yaml") as f:
epath = tests_folder / "expected_ptax_revenue.yaml"
with open(epath, "r", encoding="utf-8") as f:
fy_ptax = yaml.safe_load(f)
exp_itax = {}
exp_ptax = {}
Expand All @@ -47,6 +51,7 @@ def test_tax_revenue(
gfactors=growf,
adjust_ratios=None,
exact_calculations=True,
weights_scale=1.0,
)
sim = tc.Calculator(records=input_data, policy=tc.Policy())
act_itax = {}
Expand Down Expand Up @@ -83,6 +88,8 @@ def test_tax_revenue(
assert False, f"test_tax_revenue DUMP output: {emsg}"
else:
if emsg:
emsg += f"\nRELTOL_ITAX= {RELTOL_ITAX:4.2f}"
emsg += f"\nRELTOL_PTAX= {RELTOL_PTAX:4.2f}"
reltol = RELTOL_ITAX.get(year, DEFAULT_RELTOL_ITAX)
emsg += f"\nRELTOL_ITAX= {reltol:4.2f}"
reltol = RELTOL_PTAX.get(year, DEFAULT_RELTOL_PTAX)
emsg += f"\nRELTOL_PTAX= {reltol:4.2f}"
raise ValueError(emsg)
6 changes: 4 additions & 2 deletions tests/test_variable_totals.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

@pytest.mark.vartotals
def test_variable_totals(tests_folder, tmd_variables):
with open(STORAGE_FOLDER / "input" / "tc_variable_metadata.yaml") as f:
vpath = STORAGE_FOLDER / "input" / "tc_variable_metadata.yaml"
with open(vpath, "r", encoding="utf-8") as f:
tc_variable_metadata = yaml.safe_load(f)
with open(tests_folder / "taxdata_variable_totals.yaml") as f:
vpath = tests_folder / "taxdata_variable_totals.yaml"
with open(vpath, "r", encoding="utf-8") as f:
td_variable_totals = yaml.safe_load(f)
test_exempted_variables = [
"DSI", # Issue here but deprioritized
Expand Down
8 changes: 3 additions & 5 deletions tmd/areas/create_area_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
FIRST_YEAR = 2021
LAST_YEAR = 2034
INFILE_PATH = STORAGE_FOLDER / "output" / "tmd.csv.gz"
WTFILE_PATH = STORAGE_FOLDER / "output" / "tmd_weights.csv.gz"
GFFILE_PATH = STORAGE_FOLDER / "output" / "tmd_growfactors.csv"
POPFILE_PATH = STORAGE_FOLDER / "input" / "cbo_population_forecast.yaml"

# Tax-Calculator calculated variable cache files:
Expand Down Expand Up @@ -641,7 +639,7 @@ def create_area_weights_file(
with open(POPFILE_PATH, "r", encoding="utf-8") as pfile:
pop = yaml.safe_load(pfile.read())
# ... set FIRST_YEAR weights
weights = wght_area * 100 # scale up weights by 100 for Tax-Calculator
weights = wght_area
# ... construct dictionary of weights by year
wdict = {f"WT{FIRST_YEAR}": weights}
cum_pop_growth = 1.0
Expand All @@ -650,9 +648,9 @@ def create_area_weights_file(
cum_pop_growth *= annual_pop_growth
wght = weights.copy() * cum_pop_growth
wdict[f"WT{year}"] = wght
# ... write rounded integer scaled-up weights to CSV-formatted file
# ... write weights to CSV-formatted file
wdf = pd.DataFrame.from_dict(wdict)
wdf.to_csv(awpath, index=False, float_format="%.0f", compression="gzip")
wdf.to_csv(awpath, index=False, float_format="%.5f", compression="gzip")

return 0

Expand Down
9 changes: 0 additions & 9 deletions tmd/areas/targets/zz_targets.csv

This file was deleted.

40 changes: 0 additions & 40 deletions tmd/create_all_datasets.py

This file was deleted.

3 changes: 2 additions & 1 deletion tmd/create_taxcalc_input_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def create_variable_file(write_file=True):
weights=None,
adjust_ratios=None,
exact_calculations=True,
weights_scale=1.0,
)
vdf.drop(columns=rec.IGNORED_VARS, inplace=True)
# round all float variables to nearest integer except for weights
Expand All @@ -59,7 +60,7 @@ def create_variable_file(write_file=True):
if write_file:
fname = STORAGE_FOLDER / "output" / "tmd.csv.gz"
print(f"Writing PUF+CPS file... [{fname}]")
vdf.to_csv(fname, index=False, float_format="%.2f")
vdf.to_csv(fname, index=False, float_format="%.5f")


if __name__ == "__main__":
Expand Down
8 changes: 4 additions & 4 deletions tmd/create_taxcalc_sampling_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ def create_weights_file(pop_file=POPFILE):

# get FIRST_YEAR weights from VARFILE
vdf = pd.read_csv(VARFILE)
weights = vdf.s006 * 100 # scale up weights by 100 for Tax-Calculator
weights = vdf.s006

# construct dictionary of scaled-up weights by year
# construct dictionary of weights by year
wdict = {f"WT{FIRST_YEAR}": weights}
cum_pop_growth = 1.0
for year in range(FIRST_YEAR + 1, LAST_YEAR + 1):
Expand All @@ -37,9 +37,9 @@ def create_weights_file(pop_file=POPFILE):
wght = weights.copy() * cum_pop_growth
wdict[f"WT{year}"] = wght

# write rounded integer scaled-up weights to CSV-formatted file
# write weights to CSV-formatted file
wdf = pd.DataFrame.from_dict(wdict)
wdf.to_csv(WGTFILE, index=False, float_format="%.0f", compression="gzip")
wdf.to_csv(WGTFILE, index=False, float_format="%.5f", compression="gzip")


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit ba065bd

Please sign in to comment.