Skip to content

Commit

Permalink
Merge pull request #66 from martinholmer/tmd-var-file
Browse files Browse the repository at this point in the history
Create Tax-Calculator-style input variable file for 2021
  • Loading branch information
martinholmer authored May 1, 2024
2 parents ccac06a + 4c180b9 commit bbc4ab9
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 66 deletions.
90 changes: 31 additions & 59 deletions tax_microdata_benchmarking/adjust_qbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,76 +4,49 @@
import taxcalc as tc


def add_pt_w2_wages(df, time_period: int, verbose: bool = True):
def add_pt_w2_wages(df, verbose: bool = True):
"""
Add pass-through W2 wages to the flat file.
Add 2021 pass-through W-2 wages to the flat file.
Args:
df (pd.DataFrame): The DataFrame to add W2 wages to.
df (pd.DataFrame): the 2021 DataFrame to which pass-through W-2 wages are added
Returns:
pd.DataFrame: The DataFrame with W2 wages added.
tuple containing:
pd.DataFrame: the 2021 DataFrame with pass-through W-2 wages added
pt_w2_wages_scale: rounded to five decimal digits
"""

# Note: just calculate the share in 2021 and use for all years.

qbid_tax_expenditures = { # From JCT TE reports 2018- and 2023-
2015: 0,
2016: 0,
2017: 0,
2018: 33.2,
2019: 48.6,
2020: 56.3,
2021: 59.0,
2022: 61.9,
2023: 55.7,
2024: 57.6,
2025: 60.9,
2026: 24.9,
2027: 0,
}

QBID_TOTAL_21 = 205.8 # From SOI 2021

target = (
QBID_TOTAL_21
* qbid_tax_expenditures[time_period]
/ qbid_tax_expenditures[2021]
)

if verbose:
print("Finding scale to use in imputing pass-through W-2 wages")
QBID_TOTAL = 205.8 # from IRS SOI P4801 tabulations of 2021 data (in $B)
qbi = np.maximum(0, df.e00900 + df.e26270 + df.e02100 + df.e27200)

if target == 0:
df["PT_binc_w2_wages"] = qbi * 0
# solve for the scale value that generates the QBID_TOTAL target

return df

# Solve for scale to match the tax expenditure

def expenditure_loss(scale):
def deduction_deviation(scale):
input_data = df.copy()
input_data["PT_binc_w2_wages"] = qbi * scale
input_data = tc.Records(data=input_data, start_year=time_period)
policy = tc.Policy()
simulation = tc.Calculator(records=input_data, policy=policy)
simulation.calc_all()
taxcalc_qbided_sum = (
simulation.dataframe(["qbided"]).qbided * df.s006
).sum() / 1e9
deviation = taxcalc_qbided_sum - target
input_data = tc.Records(
data=input_data,
start_year=2021,
gfactors=None,
weights=None,
adjust_ratios=None,
exact_calculations=True,
)
sim = tc.Calculator(records=input_data, policy=tc.Policy())
sim.calc_all()
qbided = (sim.array("qbided") * df.s006).sum() / 1e9
dev = qbided - QBID_TOTAL
if verbose:
print(
f"scale: {scale}, deviation: {deviation}, total: {taxcalc_qbided_sum}"
)
return deviation

scale = bisect(expenditure_loss, 0, 2, rtol=0.01)
print(f"scale: {scale:8.6f}, dev: {dev:6.2f}, tot: {qbided:.2f}")
return dev

print(f"Final scale: {scale:.1%}")

df["PT_binc_w2_wages"] = qbi * scale

return df
scale = bisect(deduction_deviation, 0.1, 0.5, rtol=0.001)
rounded_scale = round(scale, 5)
print(f"Final (rounded) scale: {rounded_scale}")
df["PT_binc_w2_wages"] = qbi * rounded_scale
return (df, rounded_scale)


if __name__ == "__main__":
Expand All @@ -82,5 +55,4 @@ def expenditure_loss(scale):
)

df = create_stacked_flat_file(2021)

df = add_pt_w2_wages(df, 2021)
(df, scale) = add_pt_w2_wages(df)
11 changes: 6 additions & 5 deletions tax_microdata_benchmarking/create_flat_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,7 @@ def assert_no_duplicate_columns(df):

def create_stacked_flat_file(
target_year: int = 2024,
pt_w2_wages_scale: float = 0.318,
use_puf: bool = True,
add_tc_outputs: bool = True,
reweight: bool = True,
Expand All @@ -774,19 +775,19 @@ def create_stacked_flat_file(
print(
f"Adding Tax-Calculator outputs to the flat file for {target_year}"
)
print(
f"Adding pass-through W2 wages to the flat file for {target_year}"
msg = (
f"Adding pass-through W-2 wages to the flat file for {target_year}"
f" using scale = {pt_w2_wages_scale}"
)
print(msg)
qbi = np.maximum(
0,
stacked_file.e00900
+ stacked_file.e26270
+ stacked_file.e02100
+ stacked_file.e27200,
)
stacked_file["PT_binc_w2_wages"] = (
qbi * 0.314 # Solved in 2021 using adjust_qbi.py
)
stacked_file["PT_binc_w2_wages"] = qbi * pt_w2_wages_scale
input_data = tc.Records(data=stacked_file, start_year=target_year)
policy = tc.Policy()
simulation = tc.Calculator(records=input_data, policy=policy)
Expand Down
62 changes: 62 additions & 0 deletions tax_microdata_benchmarking/create_taxcalc_input_variables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""
Construct tmd.csv.gz, a Tax-Calculator-style input variable file for 2021.
"""

from tax_microdata_benchmarking.create_flat_file import (
create_stacked_flat_file,
)
from tax_microdata_benchmarking.adjust_qbi import (
add_pt_w2_wages,
)
import taxcalc as tc


# Tax year for which the input variable file is constructed; passed as
# target_year to create_stacked_flat_file and as start_year to tc.Records.
TAXYEAR = 2021
# Scale passed to create_stacked_flat_file when building the flat file;
# create_variable_file compares it with the scale returned by
# add_pt_w2_wages and prints a warning when the two differ by more than 1e-6.
INITIAL_PT_W2_WAGES_SCALE = 0.31738


def create_variable_file():
    """
    Create Tax-Calculator-style input variable file for TAXYEAR.

    Steps performed:
      1. Build the stacked flat file for TAXYEAR, passing
         INITIAL_PT_W2_WAGES_SCALE as the pass-through W-2 wages scale.
      2. Call add_pt_w2_wages on that dataframe and print a warning when
         the scale it returns differs from INITIAL_PT_W2_WAGES_SCALE by
         more than 1e-6.
      3. Drop the columns named in the Records object's IGNORED_VARS.
      4. Round the "p" and "s" components of e00200, e00900 and e02100 and
         reset each total to the sum of its two rounded components.
      5. Write the dataframe to "tmd.csv.gz" (gzip-compressed CSV, no
         index column, floats formatted with "%.0f").

    Takes no arguments and has no return statement; its results are the
    printed messages and the file written to the relative path
    "tmd.csv.gz".
    """
    # construct dataframe containing input and output variables
    vdf = create_stacked_flat_file(
        target_year=TAXYEAR,
        pt_w2_wages_scale=INITIAL_PT_W2_WAGES_SCALE,
    )
    # NOTE(review): attribute-style assignment only updates a column when
    # FLPDYR already exists in vdf; presumably it does -- TODO confirm.
    vdf.FLPDYR = TAXYEAR
    # add_pt_w2_wages returns the dataframe together with the scale it used
    (vdf, pt_w2_wages_scale) = add_pt_w2_wages(vdf)
    abs_diff = abs(pt_w2_wages_scale - INITIAL_PT_W2_WAGES_SCALE)
    # warn (but keep going) when the returned scale differs from the initial
    # scale, which signals that INITIAL_PT_W2_WAGES_SCALE may need updating
    if abs_diff > 1e-6:
        print(f"WARNING: FINAL vs INITIAL scale diff = {abs_diff:.6f}")
        print(f" INITIAL pt_w2_wages_scale = {INITIAL_PT_W2_WAGES_SCALE:.6f}")
        print(f" FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}")
    # streamline variables dataframe
    # (rec is used below only to look up IGNORED_VARS)
    rec = tc.Records(
        data=vdf,
        start_year=TAXYEAR,
        gfactors=None,
        weights=None,
        adjust_ratios=None,
    )
    vdf.drop(columns=rec.IGNORED_VARS, inplace=True)
    # round each component first, then recompute the total from the rounded
    # components so that total == p + s holds exactly in the written file
    vdf.e00200p = vdf.e00200p.to_numpy().round()
    vdf.e00200s = vdf.e00200s.to_numpy().round()
    vdf.e00200 = vdf.e00200p + vdf.e00200s
    vdf.e00900p = vdf.e00900p.to_numpy().round()
    vdf.e00900s = vdf.e00900s.to_numpy().round()
    vdf.e00900 = vdf.e00900p + vdf.e00900s
    vdf.e02100p = vdf.e02100p.to_numpy().round()
    vdf.e02100s = vdf.e02100s.to_numpy().round()
    vdf.e02100 = vdf.e02100p + vdf.e02100s
    # write streamlined variables dataframe to CSV-formatted file
    # (float_format="%.0f" writes every float column with no decimal digits)
    vdf.to_csv(
        "tmd.csv.gz",
        index=False,
        float_format="%.0f",
        compression="gzip",
    )


# Build and write the input variable file when this module is run as a script.
if __name__ == "__main__":
    create_variable_file()
6 changes: 4 additions & 2 deletions tests/test_flat_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,20 +200,22 @@ def test_2023_tax_expenditures():


@pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
@pytest.mark.skip
def test_2021_unemployment_compensation():
flat_file_2021 = pytest.flat_file_2021

total = (flat_file_2021["e02300"] * flat_file_2021.s006).sum()
assert (
abs(total / 1e9 / 33 - 1) < 0.2
abs(total / 1e9 / 33 - 1) < 0.2 # WHERE DOES 33 COME FROM ????
), f"Unemployment compensation total is ${total/1e9:.1f}bn, expected $33bn"


@pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
@pytest.mark.skip
def test_2021_medical_expenses():
flat_file_2021 = pytest.flat_file_2021

total = (flat_file_2021["e17500"] * flat_file_2021.s006).sum()
assert (
abs(total / 1e9 / 215 - 1) < 0.2
abs(total / 1e9 / 215 - 1) < 0.2 # WHERE DOES 215 COME FROM ????
), f"Medical expense total is ${total/1e9:.1f}bn, expected $215bn"

0 comments on commit bbc4ab9

Please sign in to comment.