Merge pull request #104 from PSLmodels/fix-103

Make changes necessary to fix issue 103
PSLmodels · Jun 24, 2024 · 13447c8
2 parents c3ca151 + 3234231
commit 13447c8
Show file tree

Hide file tree

Showing 12 changed files with 463 additions and 58 deletions.
diff --git a/setup.py b/setup.py
@@ -13,6 +13,6 @@
         "tensorboard",
         "jupyter-book",
         "furo",
-        "survey_enhance",
+        "scikit-learn",
     ],
 )
diff --git a/tax_microdata_benchmarking/create_taxcalc_input_variables.py b/tax_microdata_benchmarking/create_taxcalc_input_variables.py
@@ -2,39 +2,49 @@
 Construct tmd.csv, a Tax-Calculator-style input variable file for 2021.
 """
 
+import taxcalc as tc
+from tax_microdata_benchmarking.datasets.tmd import create_tmd_2021
+from tax_microdata_benchmarking.utils.qbi import (
+    add_pt_w2_wages,
+)
+from tax_microdata_benchmarking.imputation_assumptions import (
+    IMPUTATION_RF_RNG_SEED,
+    IMPUTATION_BETA_RNG_SEED,
+    W2_WAGES_SCALE,
+)
+from tax_microdata_benchmarking.storage import STORAGE_FOLDER
+
+
 TAXYEAR = 2021
+INITIAL_W2_WAGES_SCALE = W2_WAGES_SCALE
 DO_REWEIGHTING = True
-INITIAL_W2_WAGES_SCALE = 0.19980
 INCLUDE_ORIGINAL_WEIGHTS = True
 
 
 def create_variable_file(write_file=True):
     """
     Create Tax-Calculator-style input variable file for TAXYEAR.
     """
-    import taxcalc as tc
-    from tax_microdata_benchmarking.datasets.tmd import create_tmd_2021
-    from tax_microdata_benchmarking.utils.qbi import (
-        add_pt_w2_wages,
-    )
-    from tax_microdata_benchmarking.storage import STORAGE_FOLDER
-
     # construct dataframe containing input and output variables
-    print(f"Creating {TAXYEAR} PUF-ECPS file assuming:")
-    print(f"  DO_REWEIGHTING = {DO_REWEIGHTING}")
+    print(f"Creating {TAXYEAR} PUF+CPS file assuming:")
+    print(f"  IMPUTATION_RF_RNG_SEED = {IMPUTATION_RF_RNG_SEED}")
+    print(f"  IMPUTATION_BETA_RNG_SEED = {IMPUTATION_BETA_RNG_SEED}")
     print(f"  INITIAL_W2_WAGES_SCALE = {INITIAL_W2_WAGES_SCALE:.5f}")
+    print(f"  DO_REWEIGHTING = {DO_REWEIGHTING}")
     print(f"  INCLUDE_ORIGINAL_WEIGHTS = {INCLUDE_ORIGINAL_WEIGHTS}")
     vdf = create_tmd_2021()
     vdf.FLPDYR = TAXYEAR
     (vdf, pt_w2_wages_scale) = add_pt_w2_wages(vdf)
     abs_diff = abs(pt_w2_wages_scale - INITIAL_W2_WAGES_SCALE)
-    if abs_diff > 1e-6:
-        msg = (
-            f"\nFINAL vs INITIAL scale diff = {abs_diff:.6f}"
-            f"\n  INITIAL pt_w2_wages_scale = {INITIAL_W2_WAGES_SCALE:.6f}"
-            f"\n    FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}"
-        )
-        raise ValueError(msg)
+    msg = (
+        f"  FINAL vs INITIAL scale diff = {abs_diff:.6f}\n"
+        f"    INITIAL pt_w2_wages_scale = {INITIAL_W2_WAGES_SCALE:.6f}\n"
+        f"      FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}"
+    )
+    print(msg)
+    if abs_diff > 1e-3:
+        emsg = "INITIAL and FINAL scale values are substantially inconsistent"
+        raise ValueError(emsg)
     # streamline dataframe so that it includes only input variables
     rec = tc.Records(
         data=vdf,
@@ -59,6 +69,7 @@ def create_variable_file(write_file=True):
     # write streamlined variables dataframe to CSV-formatted file
     if write_file:
         tmd_csv_fname = STORAGE_FOLDER / "output" / "tmd.csv.gz"
+        print(f"Writing PUF+CPS file... [{tmd_csv_fname}]")
         vdf.to_csv(tmd_csv_fname, index=False, float_format="%.2f")
 
 

diff --git a/tax_microdata_benchmarking/datasets/cps.py b/tax_microdata_benchmarking/datasets/cps.py
@@ -1,6 +1,5 @@
 from io import BytesIO
 from zipfile import ZipFile
-from policyengine_core.data import Dataset
 import pandas as pd
 import requests
 from tqdm import tqdm
@@ -714,13 +713,10 @@ def add_household_variables(cps: h5py.File, household: DataFrame) -> None:
 
 def add_previous_year_income(self, cps: h5py.File) -> None:
     if self.previous_year_raw_cps is None:
-        print(
-            "No previous year data available for this dataset, skipping previous year income imputation."
-        )
+        msg = "Skipping CPS previous year income imputation given lack of data"
+        print(f"{msg}...")
         return
 
-    from survey_enhance.impute import Imputation
-
     cps_current_year_data = self.raw_cps(require=True).load()
     cps_previous_year_data = self.previous_year_raw_cps(require=True).load()
     cps_previous_year = cps_previous_year_data.person.set_index(

diff --git a/tax_microdata_benchmarking/datasets/puf.py b/tax_microdata_benchmarking/datasets/puf.py
@@ -1,14 +1,17 @@
 import pandas as pd
 import numpy as np
 import yaml
-from survey_enhance import Imputation
 from microdf import MicroDataFrame
 from tax_microdata_benchmarking.storage import STORAGE_FOLDER
 from tax_microdata_benchmarking.utils.pension_contributions import (
     impute_pension_contributions_to_puf,
 )
-
-DEFAULT_W2_WAGE_RATE = 0.19824  # Solved for JCT Tax Expenditures in 2021
+from tax_microdata_benchmarking.utils.imputation import Imputation
+from tax_microdata_benchmarking.imputation_assumptions import (
+    IMPUTATION_RF_RNG_SEED,
+    IMPUTATION_BETA_RNG_SEED,
+    W2_WAGES_SCALE,
+)
 
 
 def impute_missing_demographics(
@@ -37,6 +40,9 @@ def impute_missing_demographics(
     ]
 
     demographics_from_puf = Imputation()
+    demographics_from_puf.rf_rng_seed = IMPUTATION_RF_RNG_SEED
+    demographics_from_puf.beta_rng_seed = IMPUTATION_BETA_RNG_SEED
+
     demographics_from_puf.train(
         X=puf_with_demographics[NON_DEMOGRAPHIC_VARIABLES],
         Y=puf_with_demographics[DEMOGRAPHIC_VARIABLES],
@@ -46,7 +52,7 @@ def impute_missing_demographics(
         ~puf.RECID.isin(puf_with_demographics.RECID)
     ].reset_index()
     predicted_demographics = demographics_from_puf.predict(
-        puf_without_demographics
+        X=puf_without_demographics,
     )
     puf_with_imputed_demographics = pd.concat(
         [puf_without_demographics, predicted_demographics], axis=1
@@ -174,8 +180,8 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     # Ignore f2441 (AMT form attached)
     # Ignore cmbtp (estimate of AMT income not in AGI)
     # Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
-    qbi = puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200
-    puf["w2_wages_from_qualified_business"] = qbi * DEFAULT_W2_WAGE_RATE
+    qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
+    puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE
 
     puf["filing_status"] = puf.MARS.map(
         {
@@ -264,15 +270,14 @@ def generate(self, puf: pd.DataFrame, demographics: pd.DataFrame):
         from tax_microdata_benchmarking.datasets.uprate_puf import uprate_puf
 
         if self.time_period > 2015:
-            print("Uprating PUF...")
             puf = uprate_puf(puf, 2015, self.time_period)
 
-        print("Loading and pre-processing PUF...")
+        print("Pre-processing PUF...")
         original_recid = puf.RECID.values.copy()
         puf = preprocess_puf(puf)
-        print("Imputing missing demographics...")
+        print("Imputing missing PUF demographics...")
         puf = impute_missing_demographics(puf, demographics)
-        print("Imputing pension contributions...")
+        print("Imputing PUF pension contributions...")
         puf["pre_tax_contributions"] = impute_pension_contributions_to_puf(
             puf[["employment_income"]]
         )

diff --git a/tax_microdata_benchmarking/datasets/taxcalc_dataset.py b/tax_microdata_benchmarking/datasets/taxcalc_dataset.py
@@ -14,6 +14,8 @@ def create_tc_dataset(pe_dataset: Type, year: int = 2015) -> pd.DataFrame:
     pe_sim = Microsimulation(dataset=pe_dataset)
     df = pd.DataFrame()
 
+    print(f"Creating tc dataset for year {year}...")
+
     is_non_dep = ~pe_sim.calculate("is_tax_unit_dependent").values
     tax_unit = pe_sim.populations["tax_unit"]
 

diff --git a/tax_microdata_benchmarking/datasets/tmd.py b/tax_microdata_benchmarking/datasets/tmd.py
@@ -13,16 +13,18 @@
 
 
 def create_tmd_2021():
-    if not CPS_2021().exists:
-        # Don't recreate if already exists.
-        create_cps_2021()
-    if not PUF_2021().exists:
-        # Don't recreate if already exists.
-        create_pe_puf_2021()
+    # always create CPS_2021 and PUF_2021
+    # (because imputation assumptions may have changed)
+    create_cps_2021()
+    create_pe_puf_2021()
+
     tc_puf_21 = create_tc_dataset(PUF_2021)
     tc_cps_21 = create_tc_dataset(CPS_2021)
 
-    # Add nonfiler flag to tc_cps_21 with 2022 filing rules (2021 had large changes)
+    print("Combining PUF and CPS nonfilers...")
+
+    # Add nonfiler flag to tc_cps_21 with 2022 filing rules
+    # (2021 had large changes)
     from policyengine_us import Microsimulation
 
     sim = Microsimulation(dataset=CPS_2021)
@@ -31,17 +33,13 @@ def create_tmd_2021():
 
     combined = pd.concat([tc_puf_21, tc_cps_21], ignore_index=True)
 
-    print("Combined PUF and CPS nonfilers.")
-
     # Add Tax-Calculator outputs
     print("Adding Tax-Calculator outputs...")
     combined = add_taxcalc_outputs(combined, 2021)
     combined["s006_original"] = combined.s006.values
     print("Reweighting...")
     combined = reweight(combined, 2021, weight_deviation_penalty=0)
 
-    print("Completed.")
-
     return combined
 
 

diff --git a/tax_microdata_benchmarking/datasets/uprate_puf.py b/tax_microdata_benchmarking/datasets/uprate_puf.py
@@ -134,13 +134,13 @@ def get_growth(variable, from_year, to_year):
 
 
 def uprate_puf(puf, from_year, to_year):
+    print(f"Uprating PUF from {from_year} to {to_year}...")
     puf = puf.copy()
     for variable in SOI_TO_PUF_STRAIGHT_RENAMES:
         growth = get_growth(variable, from_year, to_year)
         puf[SOI_TO_PUF_STRAIGHT_RENAMES[variable]] *= growth
 
     # Positive and negative split variables
-
     for variable in SOI_TO_PUF_POS_ONLY_RENAMES:
         growth = get_growth(variable, from_year, to_year)
         puf_variable = SOI_TO_PUF_POS_ONLY_RENAMES[variable]
@@ -151,20 +151,18 @@ def uprate_puf(puf, from_year, to_year):
         puf_variable = SOI_TO_PUF_NEG_ONLY_RENAMES[variable]
         puf[puf_variable][puf[puf_variable] < 0] *= growth
 
-    # Remaining variables, uprate purely by AGI growth (for now, because I'm not sure how to handle the deductions, credits and incomes separately)
-
+    # Remaining variables, uprate purely by AGI growth
+    # (for now, because I'm not sure how to handle the deductions,
+    #  credits, and incomes separately)
     for variable in REMAINING_VARIABLES:
         growth = get_growth("adjusted_gross_income", from_year, to_year)
         puf[variable] *= growth
 
     # Uprate the weights
-
     returns_start = get_soi_aggregate("count", from_year, True)
     returns_end = get_soi_aggregate("count", to_year, True)
     puf.S006 *= returns_end / returns_start
 
-    print(f"Uprated PUF from {from_year} to {to_year}")
-
     return puf
 
 

diff --git a/tax_microdata_benchmarking/imputation_assumptions.py b/tax_microdata_benchmarking/imputation_assumptions.py
@@ -0,0 +1,9 @@
+"""
+Central location for data imputation assumptions.
+"""
+
+IMPUTATION_RF_RNG_SEED = 1928374  # random number seed used by RandomForest
+
+IMPUTATION_BETA_RNG_SEED = 37465  # random number seed used for Beta variates
+
+W2_WAGES_SCALE = 0.19979  # parameter used to impute pass-through W-2 wages