Skip to content

Commit

Permalink
Merge pull request #104 from PSLmodels/fix-103
Browse files Browse the repository at this point in the history
Make changes necessary to fix issue 103
  • Loading branch information
martinholmer authored Jun 24, 2024
2 parents c3ca151 + 3234231 commit 13447c8
Show file tree
Hide file tree
Showing 12 changed files with 463 additions and 58 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@
"tensorboard",
"jupyter-book",
"furo",
"survey_enhance",
"scikit-learn",
],
)
45 changes: 28 additions & 17 deletions tax_microdata_benchmarking/create_taxcalc_input_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,49 @@
Construct tmd.csv, a Tax-Calculator-style input variable file for 2021.
"""

import taxcalc as tc
from tax_microdata_benchmarking.datasets.tmd import create_tmd_2021
from tax_microdata_benchmarking.utils.qbi import (
add_pt_w2_wages,
)
from tax_microdata_benchmarking.imputation_assumptions import (
IMPUTATION_RF_RNG_SEED,
IMPUTATION_BETA_RNG_SEED,
W2_WAGES_SCALE,
)
from tax_microdata_benchmarking.storage import STORAGE_FOLDER


TAXYEAR = 2021
INITIAL_W2_WAGES_SCALE = W2_WAGES_SCALE
DO_REWEIGHTING = True
INITIAL_W2_WAGES_SCALE = 0.19980
INCLUDE_ORIGINAL_WEIGHTS = True


def create_variable_file(write_file=True):
"""
Create Tax-Calculator-style input variable file for TAXYEAR.
"""
import taxcalc as tc
from tax_microdata_benchmarking.datasets.tmd import create_tmd_2021
from tax_microdata_benchmarking.utils.qbi import (
add_pt_w2_wages,
)
from tax_microdata_benchmarking.storage import STORAGE_FOLDER

# construct dataframe containing input and output variables
print(f"Creating {TAXYEAR} PUF-ECPS file assuming:")
print(f" DO_REWEIGHTING = {DO_REWEIGHTING}")
print(f"Creating {TAXYEAR} PUF+CPS file assuming:")
print(f" IMPUTATION_RF_RNG_SEED = {IMPUTATION_RF_RNG_SEED}")
print(f" IMPUTATION_BETA_RNG_SEED = {IMPUTATION_BETA_RNG_SEED}")
print(f" INITIAL_W2_WAGES_SCALE = {INITIAL_W2_WAGES_SCALE:.5f}")
print(f" DO_REWEIGHTING = {DO_REWEIGHTING}")
print(f" INCLUDE_ORIGINAL_WEIGHTS = {INCLUDE_ORIGINAL_WEIGHTS}")
vdf = create_tmd_2021()
vdf.FLPDYR = TAXYEAR
(vdf, pt_w2_wages_scale) = add_pt_w2_wages(vdf)
abs_diff = abs(pt_w2_wages_scale - INITIAL_W2_WAGES_SCALE)
if abs_diff > 1e-6:
msg = (
f"\nFINAL vs INITIAL scale diff = {abs_diff:.6f}"
f"\n INITIAL pt_w2_wages_scale = {INITIAL_W2_WAGES_SCALE:.6f}"
f"\n FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}"
)
raise ValueError(msg)
msg = (
f" FINAL vs INITIAL scale diff = {abs_diff:.6f}\n"
f" INITIAL pt_w2_wages_scale = {INITIAL_W2_WAGES_SCALE:.6f}\n"
f" FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}"
)
print(msg)
if abs_diff > 1e-3:
emsg = "INITIAL and FINAL scale values are substantially inconsistent"
raise ValueError(emsg)
# streamline dataframe so that it includes only input variables
rec = tc.Records(
data=vdf,
Expand All @@ -59,6 +69,7 @@ def create_variable_file(write_file=True):
# write streamlined variables dataframe to CSV-formatted file
if write_file:
tmd_csv_fname = STORAGE_FOLDER / "output" / "tmd.csv.gz"
print(f"Writing PUF+CPS file... [{tmd_csv_fname}]")
vdf.to_csv(tmd_csv_fname, index=False, float_format="%.2f")


Expand Down
8 changes: 2 additions & 6 deletions tax_microdata_benchmarking/datasets/cps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from io import BytesIO
from zipfile import ZipFile
from policyengine_core.data import Dataset
import pandas as pd
import requests
from tqdm import tqdm
Expand Down Expand Up @@ -714,13 +713,10 @@ def add_household_variables(cps: h5py.File, household: DataFrame) -> None:

def add_previous_year_income(self, cps: h5py.File) -> None:
if self.previous_year_raw_cps is None:
print(
"No previous year data available for this dataset, skipping previous year income imputation."
)
msg = "Skipping CPS previous year income imputation given lack of data"
print(f"{msg}...")
return

from survey_enhance.impute import Imputation

cps_current_year_data = self.raw_cps(require=True).load()
cps_previous_year_data = self.previous_year_raw_cps(require=True).load()
cps_previous_year = cps_previous_year_data.person.set_index(
Expand Down
25 changes: 15 additions & 10 deletions tax_microdata_benchmarking/datasets/puf.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import pandas as pd
import numpy as np
import yaml
from survey_enhance import Imputation
from microdf import MicroDataFrame
from tax_microdata_benchmarking.storage import STORAGE_FOLDER
from tax_microdata_benchmarking.utils.pension_contributions import (
impute_pension_contributions_to_puf,
)

DEFAULT_W2_WAGE_RATE = 0.19824 # Solved for JCT Tax Expenditures in 2021
from tax_microdata_benchmarking.utils.imputation import Imputation
from tax_microdata_benchmarking.imputation_assumptions import (
IMPUTATION_RF_RNG_SEED,
IMPUTATION_BETA_RNG_SEED,
W2_WAGES_SCALE,
)


def impute_missing_demographics(
Expand Down Expand Up @@ -37,6 +40,9 @@ def impute_missing_demographics(
]

demographics_from_puf = Imputation()
demographics_from_puf.rf_rng_seed = IMPUTATION_RF_RNG_SEED
demographics_from_puf.beta_rng_seed = IMPUTATION_BETA_RNG_SEED

demographics_from_puf.train(
X=puf_with_demographics[NON_DEMOGRAPHIC_VARIABLES],
Y=puf_with_demographics[DEMOGRAPHIC_VARIABLES],
Expand All @@ -46,7 +52,7 @@ def impute_missing_demographics(
~puf.RECID.isin(puf_with_demographics.RECID)
].reset_index()
predicted_demographics = demographics_from_puf.predict(
puf_without_demographics
X=puf_without_demographics,
)
puf_with_imputed_demographics = pd.concat(
[puf_without_demographics, predicted_demographics], axis=1
Expand Down Expand Up @@ -174,8 +180,8 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
# Ignore f2441 (AMT form attached)
# Ignore cmbtp (estimate of AMT income not in AGI)
# Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
qbi = puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200
puf["w2_wages_from_qualified_business"] = qbi * DEFAULT_W2_WAGE_RATE
qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE

puf["filing_status"] = puf.MARS.map(
{
Expand Down Expand Up @@ -264,15 +270,14 @@ def generate(self, puf: pd.DataFrame, demographics: pd.DataFrame):
from tax_microdata_benchmarking.datasets.uprate_puf import uprate_puf

if self.time_period > 2015:
print("Uprating PUF...")
puf = uprate_puf(puf, 2015, self.time_period)

print("Loading and pre-processing PUF...")
print("Pre-processing PUF...")
original_recid = puf.RECID.values.copy()
puf = preprocess_puf(puf)
print("Imputing missing demographics...")
print("Imputing missing PUF demographics...")
puf = impute_missing_demographics(puf, demographics)
print("Imputing pension contributions...")
print("Imputing PUF pension contributions...")
puf["pre_tax_contributions"] = impute_pension_contributions_to_puf(
puf[["employment_income"]]
)
Expand Down
2 changes: 2 additions & 0 deletions tax_microdata_benchmarking/datasets/taxcalc_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ def create_tc_dataset(pe_dataset: Type, year: int = 2015) -> pd.DataFrame:
pe_sim = Microsimulation(dataset=pe_dataset)
df = pd.DataFrame()

print(f"Creating tc dataset for year {year}...")

is_non_dep = ~pe_sim.calculate("is_tax_unit_dependent").values
tax_unit = pe_sim.populations["tax_unit"]

Expand Down
20 changes: 9 additions & 11 deletions tax_microdata_benchmarking/datasets/tmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,18 @@


def create_tmd_2021():
if not CPS_2021().exists:
# Don't recreate if already exists.
create_cps_2021()
if not PUF_2021().exists:
# Don't recreate if already exists.
create_pe_puf_2021()
# always create CPS_2021 and PUF_2021
# (because imputation assumptions may have changed)
create_cps_2021()
create_pe_puf_2021()

tc_puf_21 = create_tc_dataset(PUF_2021)
tc_cps_21 = create_tc_dataset(CPS_2021)

# Add nonfiler flag to tc_cps_21 with 2022 filing rules (2021 had large changes)
print("Combining PUF and CPS nonfilers...")

# Add nonfiler flag to tc_cps_21 with 2022 filing rules
# (2021 had large changes)
from policyengine_us import Microsimulation

sim = Microsimulation(dataset=CPS_2021)
Expand All @@ -31,17 +33,13 @@ def create_tmd_2021():

combined = pd.concat([tc_puf_21, tc_cps_21], ignore_index=True)

print("Combined PUF and CPS nonfilers.")

# Add Tax-Calculator outputs
print("Adding Tax-Calculator outputs...")
combined = add_taxcalc_outputs(combined, 2021)
combined["s006_original"] = combined.s006.values
print("Reweighting...")
combined = reweight(combined, 2021, weight_deviation_penalty=0)

print("Completed.")

return combined


Expand Down
10 changes: 4 additions & 6 deletions tax_microdata_benchmarking/datasets/uprate_puf.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,13 @@ def get_growth(variable, from_year, to_year):


def uprate_puf(puf, from_year, to_year):
print(f"Uprating PUF from {from_year} to {to_year}...")
puf = puf.copy()
for variable in SOI_TO_PUF_STRAIGHT_RENAMES:
growth = get_growth(variable, from_year, to_year)
puf[SOI_TO_PUF_STRAIGHT_RENAMES[variable]] *= growth

# Positive and negative split variables

for variable in SOI_TO_PUF_POS_ONLY_RENAMES:
growth = get_growth(variable, from_year, to_year)
puf_variable = SOI_TO_PUF_POS_ONLY_RENAMES[variable]
Expand All @@ -151,20 +151,18 @@ def uprate_puf(puf, from_year, to_year):
puf_variable = SOI_TO_PUF_NEG_ONLY_RENAMES[variable]
puf[puf_variable][puf[puf_variable] < 0] *= growth

# Remaining variables, uprate purely by AGI growth (for now, because I'm not sure how to handle the deductions, credits and incomes separately)

# Remaining variables, uprate purely by AGI growth
# (for now, because I'm not sure how to handle the deductions,
# credits, and incomes separately)
for variable in REMAINING_VARIABLES:
growth = get_growth("adjusted_gross_income", from_year, to_year)
puf[variable] *= growth

# Uprate the weights

returns_start = get_soi_aggregate("count", from_year, True)
returns_end = get_soi_aggregate("count", to_year, True)
puf.S006 *= returns_end / returns_start

print(f"Uprated PUF from {from_year} to {to_year}")

return puf


Expand Down
9 changes: 9 additions & 0 deletions tax_microdata_benchmarking/imputation_assumptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Central location for data imputation assumptions.
"""

# Seed for the random-forest part of imputation.  datasets/puf.py assigns
# this value to the `rf_rng_seed` attribute of its Imputation object before
# training the demographics model; create_taxcalc_input_variables.py prints
# it so each run records the assumption it was built with.
IMPUTATION_RF_RNG_SEED = 1928374 # random number seed used by RandomForest

# Seed for drawing Beta variates during imputation.  datasets/puf.py assigns
# this value to the `beta_rng_seed` attribute of the same Imputation object.
# NOTE(review): how Imputation consumes the two seeds is not visible here --
# confirm in utils/imputation.py before relying on exact reproducibility.
IMPUTATION_BETA_RNG_SEED = 37465 # random number seed used for Beta variates

# Scale applied to qualified business income to impute pass-through W-2
# wages: datasets/puf.py sets w2_wages_from_qualified_business to
# qbi * W2_WAGES_SCALE.  create_taxcalc_input_variables.py also uses it as
# INITIAL_W2_WAGES_SCALE and compares it with the scale returned by
# add_pt_w2_wages, raising ValueError when the two differ by more than its
# tolerance -- so update this value if that check starts failing.
W2_WAGES_SCALE = 0.19979 # parameter used to impute pass-through W-2 wages
Loading

0 comments on commit 13447c8

Please sign in to comment.