Merge pull request #264 from PSLmodels/weights-precision

Write Tax-Calculator weights files with increased precision
PSLmodels · Oct 28, 2024 · ba065bd · ba065bd
2 parents 5f7e781 + 59c9f5c
commit ba065bd
Show file tree

Hide file tree

Showing 16 changed files with 86 additions and 124 deletions.
diff --git a/Makefile b/Makefile
@@ -25,29 +25,29 @@ tmd/storage/output/tmd.csv.gz: \
   tmd/create_taxcalc_input_variables.py
 	python tmd/create_taxcalc_input_variables.py
 
-tmd/storage/output/tmd_growfactors.csv: \
-  tmd/storage/input/puf_growfactors.csv \
-  tmd/create_taxcalc_growth_factors.py
-	python tmd/create_taxcalc_growth_factors.py
-
 tmd/storage/output/tmd_weights.csv.gz: \
   tmd/storage/input/cbo_population_forecast.yaml \
   tmd/storage/output/tmd.csv.gz \
   tmd/create_taxcalc_sampling_weights.py
 	python tmd/create_taxcalc_sampling_weights.py
 
+tmd/storage/output/tmd_growfactors.csv: \
+  tmd/storage/input/puf_growfactors.csv \
+  tmd/create_taxcalc_growth_factors.py
+	python tmd/create_taxcalc_growth_factors.py
+
 tmd/storage/output/cached_files: \
   tmd/storage/output/tmd.csv.gz \
-  tmd/storage/output/tmd_growfactors.csv \
   tmd/storage/output/tmd_weights.csv.gz \
+  tmd/storage/output/tmd_growfactors.csv \
   tmd/storage/__init__.py \
   tmd/create_taxcalc_cached_files.py
 	python tmd/create_taxcalc_cached_files.py
 
 .PHONY=tmd_files
 tmd_files: tmd/storage/output/tmd.csv.gz \
-  tmd/storage/output/tmd_growfactors.csv \
   tmd/storage/output/tmd_weights.csv.gz \
+  tmd/storage/output/tmd_growfactors.csv \
   tmd/storage/output/cached_files
 
 .PHONY=test

diff --git a/setup.py b/setup.py
@@ -2,13 +2,13 @@
 
 setup(
     name="tmd",
-    version="0.4.0",
+    version="0.5.0",
     packages=find_packages(),
     python_requires=">=3.10,<3.13",
     install_requires=[
         "policyengine_us==1.55.0",
         "tables",  # required by policyengine_us
-        "taxcalc>=4.3.0",
+        "taxcalc>=4.3.1",
         "scikit-learn",
         "torch",
         "tensorboard",

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-import yaml
 import pytest
 import numpy as np
 import pandas as pd

diff --git a/tests/expected_tax_expenditures b/tests/expected_tax_expenditures
@@ -8,7 +8,7 @@ YR,KIND,EST= 2023 cgqd_tax_preference 220.9
 YR,KIND,EST= 2023 qbid 58.0
 YR,KIND,EST= 2023 salt 21.2
 YR,KIND,EST= 2026 paytax 1851.4
-YR,KIND,EST= 2026 iitax 3161.2
+YR,KIND,EST= 2026 iitax 3160.9
 YR,KIND,EST= 2026 ctc 44.7
 YR,KIND,EST= 2026 eitc 77.8
 YR,KIND,EST= 2026 social_security_partial_taxability 47.7

diff --git a/tests/test_area_weights.py b/tests/test_area_weights.py
@@ -18,46 +18,40 @@ def test_area_xx(tests_folder):
     and compare actual Tax-Calculator results with expected results when
     using area weights along with national input data and growfactors.
     """
-    rc = create_area_weights_file("xx", write_log=True, write_file=True)
+    rc = create_area_weights_file("xx", write_log=False, write_file=True)
     assert rc == 0, "create_areas_weights_file has non-zero return code"
     # compare actual vs expected results for faux area xx
-    # ... instantiate Tax-Calculator object for {area}
-    idpath = STORAGE_FOLDER / "output" / "tmd.csv.gz"
-    gfpath = STORAGE_FOLDER / "output" / "tmd_growfactors.csv"
-    wtpath = AREAS_FOLDER / "weights" / "xx_tmd_weights.csv.gz"
-    input_data = tc.Records(
-        data=pd.read_csv(idpath),
-        start_year=YEAR,
-        gfactors=tc.GrowFactors(growfactors_filename=str(gfpath)),
-        weights=str(wtpath),
-        adjust_ratios=None,
+    # ... instantiate Tax-Calculator object for area
+    rec = tc.Records.tmd_constructor(
+        data_path=(STORAGE_FOLDER / "output" / "tmd.csv.gz"),
+        weights_path=(AREAS_FOLDER / "weights" / "xx_tmd_weights.csv.gz"),
+        growfactors_path=(STORAGE_FOLDER / "output" / "tmd_growfactors.csv"),
         exact_calculations=True,
     )
-    sim = tc.Calculator(records=input_data, policy=tc.Policy())
+    sim = tc.Calculator(records=rec, policy=tc.Policy())
     # ... calculate tax variables for YEAR
     sim.advance_to_year(YEAR)
     sim.calc_all()
     vdf = sim.dataframe([], all_vars=True)
     # ... calculate actual results and store in act dictionary
-    puf = vdf.data_source == 1
-    wght = vdf.s006 * puf
+    wght = vdf.s006 * (vdf.data_source == 1)  # PUF weights
     act = {
         "popall": (vdf.s006 * vdf.XTOT).sum() * 1e-6,
-        "e00300": (wght * vdf.e00300[puf]).sum() * 1e-9,
-        "e00900": (wght * vdf.e00900[puf]).sum() * 1e-9,
-        "e00200": (wght * vdf.e00200[puf]).sum() * 1e-9,
-        "e02000": (wght * vdf.e02000[puf]).sum() * 1e-9,
-        "e02400": (wght * vdf.e02400[puf]).sum() * 1e-9,
-        "c00100": (wght * vdf.c00100[puf]).sum() * 1e-9,
+        "e00300": (wght * vdf.e00300).sum() * 1e-9,
+        "e00900": (wght * vdf.e00900).sum() * 1e-9,
+        "e00200": (wght * vdf.e00200).sum() * 1e-9,
+        "e02000": (wght * vdf.e02000).sum() * 1e-9,
+        "e02400": (wght * vdf.e02400).sum() * 1e-9,
+        "c00100": (wght * vdf.c00100).sum() * 1e-9,
         "agihic": (wght * (vdf.c00100 >= 1e6)).sum() * 1e-3,
-        "e00400": (wght * vdf.e00400[puf]).sum() * 1e-9,
-        "e00600": (wght * vdf.e00600[puf]).sum() * 1e-9,
-        "e00650": (wght * vdf.e00650[puf]).sum() * 1e-9,
-        "e01700": (wght * vdf.e01700[puf]).sum() * 1e-9,
-        "e02300": (wght * vdf.e02300[puf]).sum() * 1e-9,
-        "e17500": (wght * vdf.e17500[puf]).sum() * 1e-9,
-        "e18400": (wght * vdf.e18400[puf]).sum() * 1e-9,
-        "e18500": (wght * vdf.e18500[puf]).sum() * 1e-9,
+        "e00400": (wght * vdf.e00400).sum() * 1e-9,
+        "e00600": (wght * vdf.e00600).sum() * 1e-9,
+        "e00650": (wght * vdf.e00650).sum() * 1e-9,
+        "e01700": (wght * vdf.e01700).sum() * 1e-9,
+        "e02300": (wght * vdf.e02300).sum() * 1e-9,
+        "e17500": (wght * vdf.e17500).sum() * 1e-9,
+        "e18400": (wght * vdf.e18400).sum() * 1e-9,
+        "e18500": (wght * vdf.e18500).sum() * 1e-9,
     }
     # ... read expected results into exp dictionary
     exp_path = tests_folder / "test_area_weights_expect.yaml"

diff --git a/tests/test_misc.py b/tests/test_misc.py
@@ -54,7 +54,7 @@ def compare(name, act, exp, tol):
     compare("wght_sum", wght.sum(), 184e6, 0.01)
     hiagi = agi >= 1e6
     compare("wght_sum_hiagi", (wght * hiagi).sum(), 0.875e6, 0.01)
-    compare("wght_itax_sum", (wght * itax).sum(), 1591e9, 0.01)
+    compare("wght_itax_sum", (wght * itax).sum(), 1595e9, 0.01)
     compare("wght_itax_sum_hiagi", ((wght * itax) * hiagi).sum(), 902e9, 0.01)
     # count weighted number of tax units with zero agi by filing status
     agi0 = agi == 0
@@ -65,3 +65,13 @@ def compare(name, act, exp, tol):
     compare("wght_sum_agi0_fs2", (wght * mars2 * agi0).sum(), 2.00e6, 0.01)
     mars4 = mars == 4
     compare("wght_sum_agi0_fs4", (wght * mars4 * agi0).sum(), 1.53e6, 0.01)
+    # count weighted number of PUF tax units with zero agi by filing status
+    puf = sim.array("data_source") == 1
+    pwght = puf * wght
+    compare("Pwght_sum_agi0_fs0", (pwght * agi0).sum(), 0.846e6, 0.01)
+    mars1 = mars == 1
+    compare("Pwght_sum_agi0_fs1", (pwght * mars1 * agi0).sum(), 0.616e6, 0.01)
+    mars2 = mars == 2
+    compare("Pwght_sum_agi0_fs2", (pwght * mars2 * agi0).sum(), 0.136e6, 0.01)
+    mars4 = mars == 4
+    compare("Pwght_sum_agi0_fs4", (pwght * mars4 * agi0).sum(), 0.0628e6, 0.01)
diff --git a/tests/test_tax_expenditures.py b/tests/test_tax_expenditures.py
@@ -3,7 +3,6 @@
 against expected tax expenditure values in the tests folder.
 """
 
-import os
 import difflib
 import pytest
 import numpy as np
@@ -51,15 +50,15 @@ def test_tax_exp_diffs(
     # ... compare all other rows using a smaller relative diff tolerance
     actval = actdf.iloc[:, 3].to_numpy(dtype=np.float64)
     expval = expdf.iloc[:, 3].to_numpy(dtype=np.float64)
-    reltol = float(os.getenv("TMD_TAXEXP_DIFF_RTOL", default=0.011))
+    reltol = 0.011
     if not np.allclose(actval, expval, atol=0.0, rtol=reltol):
         same = False
     if same:
         return
     # if same is False
-    with open(act_path, "r") as actfile:
+    with open(act_path, "r", encoding="utf-8") as actfile:
         act = actfile.readlines()
-    with open(exp_path, "r") as expfile:
+    with open(exp_path, "r", encoding="utf-8") as expfile:
         exp = expfile.readlines()
     diffs = list(
         difflib.context_diff(act, exp, fromfile="actual", tofile="expect", n=0)

diff --git a/tests/test_tax_revenue.py b/tests/test_tax_revenue.py
@@ -10,7 +10,9 @@
 LAST_CYR = 2033
 
 DEFAULT_RELTOL_ITAX = 0.10
-RELTOL_ITAX = {}
+RELTOL_ITAX = {
+    2022: 0.13,
+}
 DEFAULT_RELTOL_PTAX = 0.02
 RELTOL_PTAX = {
     2021: 0.05,
@@ -28,9 +30,11 @@ def test_tax_revenue(
     tests_folder, tmd_variables, tmd_weights_path, tmd_growfactors_path
 ):
     # read expected fiscal year revenues and convert to calendar year revenues
-    with open(tests_folder / "expected_itax_revenue.yaml") as f:
+    epath = tests_folder / "expected_itax_revenue.yaml"
+    with open(epath, "r", encoding="utf-8") as f:
         fy_itax = yaml.safe_load(f)
-    with open(tests_folder / "expected_ptax_revenue.yaml") as f:
+    epath = tests_folder / "expected_ptax_revenue.yaml"
+    with open(epath, "r", encoding="utf-8") as f:
         fy_ptax = yaml.safe_load(f)
     exp_itax = {}
     exp_ptax = {}
@@ -47,6 +51,7 @@ def test_tax_revenue(
         gfactors=growf,
         adjust_ratios=None,
         exact_calculations=True,
+        weights_scale=1.0,
     )
     sim = tc.Calculator(records=input_data, policy=tc.Policy())
     act_itax = {}
@@ -83,6 +88,8 @@ def test_tax_revenue(
         assert False, f"test_tax_revenue DUMP output: {emsg}"
     else:
         if emsg:
-            emsg += f"\nRELTOL_ITAX= {RELTOL_ITAX:4.2f}"
-            emsg += f"\nRELTOL_PTAX= {RELTOL_PTAX:4.2f}"
+            reltol = RELTOL_ITAX.get(year, DEFAULT_RELTOL_ITAX)
+            emsg += f"\nRELTOL_ITAX= {reltol:4.2f}"
+            reltol = RELTOL_PTAX.get(year, DEFAULT_RELTOL_PTAX)
+            emsg += f"\nRELTOL_PTAX= {reltol:4.2f}"
             raise ValueError(emsg)
diff --git a/tests/test_variable_totals.py b/tests/test_variable_totals.py
@@ -11,9 +11,11 @@
 
 @pytest.mark.vartotals
 def test_variable_totals(tests_folder, tmd_variables):
-    with open(STORAGE_FOLDER / "input" / "tc_variable_metadata.yaml") as f:
+    vpath = STORAGE_FOLDER / "input" / "tc_variable_metadata.yaml"
+    with open(vpath, "r", encoding="utf-8") as f:
         tc_variable_metadata = yaml.safe_load(f)
-    with open(tests_folder / "taxdata_variable_totals.yaml") as f:
+    vpath = tests_folder / "taxdata_variable_totals.yaml"
+    with open(vpath, "r", encoding="utf-8") as f:
         td_variable_totals = yaml.safe_load(f)
     test_exempted_variables = [
         "DSI",  # Issue here but deprioritized

diff --git a/tmd/areas/create_area_weights.py b/tmd/areas/create_area_weights.py
@@ -25,8 +25,6 @@
 FIRST_YEAR = 2021
 LAST_YEAR = 2034
 INFILE_PATH = STORAGE_FOLDER / "output" / "tmd.csv.gz"
-WTFILE_PATH = STORAGE_FOLDER / "output" / "tmd_weights.csv.gz"
-GFFILE_PATH = STORAGE_FOLDER / "output" / "tmd_growfactors.csv"
 POPFILE_PATH = STORAGE_FOLDER / "input" / "cbo_population_forecast.yaml"
 
 # Tax-Calculator calculated variable cache files:
@@ -641,7 +639,7 @@ def create_area_weights_file(
     with open(POPFILE_PATH, "r", encoding="utf-8") as pfile:
         pop = yaml.safe_load(pfile.read())
     # ... set FIRST_YEAR weights
-    weights = wght_area * 100  # scale up weights by 100 for Tax-Calculator
+    weights = wght_area
     # ... construct dictionary of weights by year
     wdict = {f"WT{FIRST_YEAR}": weights}
     cum_pop_growth = 1.0
@@ -650,9 +648,9 @@ def create_area_weights_file(
         cum_pop_growth *= annual_pop_growth
         wght = weights.copy() * cum_pop_growth
         wdict[f"WT{year}"] = wght
-    # ... write rounded integer scaled-up weights to CSV-formatted file
+    # ... write weights to CSV-formatted file
     wdf = pd.DataFrame.from_dict(wdict)
-    wdf.to_csv(awpath, index=False, float_format="%.0f", compression="gzip")
+    wdf.to_csv(awpath, index=False, float_format="%.5f", compression="gzip")
 
     return 0
 

diff --git a/tmd/areas/targets/zz_targets.csv b/tmd/areas/targets/zz_targets.csv
diff --git a/tmd/create_all_datasets.py b/tmd/create_all_datasets.py
diff --git a/tmd/create_taxcalc_input_variables.py b/tmd/create_taxcalc_input_variables.py
@@ -48,6 +48,7 @@ def create_variable_file(write_file=True):
         weights=None,
         adjust_ratios=None,
         exact_calculations=True,
+        weights_scale=1.0,
     )
     vdf.drop(columns=rec.IGNORED_VARS, inplace=True)
     # round all float variables to nearest integer except for weights
@@ -59,7 +60,7 @@ def create_variable_file(write_file=True):
     if write_file:
         fname = STORAGE_FOLDER / "output" / "tmd.csv.gz"
         print(f"Writing PUF+CPS file... [{fname}]")
-        vdf.to_csv(fname, index=False, float_format="%.2f")
+        vdf.to_csv(fname, index=False, float_format="%.5f")
 
 
 if __name__ == "__main__":

diff --git a/tmd/create_taxcalc_sampling_weights.py b/tmd/create_taxcalc_sampling_weights.py
@@ -26,9 +26,9 @@ def create_weights_file(pop_file=POPFILE):
 
     # get FIRST_YEAR weights from VARFILE
     vdf = pd.read_csv(VARFILE)
-    weights = vdf.s006 * 100  # scale up weights by 100 for Tax-Calculator
+    weights = vdf.s006
 
-    # construct dictionary of scaled-up weights by year
+    # construct dictionary of weights by year
     wdict = {f"WT{FIRST_YEAR}": weights}
     cum_pop_growth = 1.0
     for year in range(FIRST_YEAR + 1, LAST_YEAR + 1):
@@ -37,9 +37,9 @@ def create_weights_file(pop_file=POPFILE):
         wght = weights.copy() * cum_pop_growth
         wdict[f"WT{year}"] = wght
 
-    # write rounded integer scaled-up weights to CSV-formatted file
+    # write weights to CSV-formatted file
     wdf = pd.DataFrame.from_dict(wdict)
-    wdf.to_csv(WGTFILE, index=False, float_format="%.0f", compression="gzip")
+    wdf.to_csv(WGTFILE, index=False, float_format="%.5f", compression="gzip")
 
 
 if __name__ == "__main__":