# Patch summary (text before the first "diff --git" header is ignored by patch tools):
#   * adjust_qbi.py: add_pt_w2_wages() drops its time_period parameter, solves for
#     the scale that reproduces QBID_TOTAL (205.8) for 2021 by bisection over
#     [0.1, 0.5], and returns the tuple (df, rounded_scale) instead of df alone.
#   * create_flat_file.py: create_stacked_flat_file() takes a pt_w2_wages_scale
#     parameter (default 0.318) in place of the hard-coded 0.314.
#   * create_taxcalc_input_variables.py (new): builds the 2021 variables
#     dataframe and writes it to tmd.csv.gz.
#   * tests/test_flat_file.py: the two 2021 aggregate tests are marked skip.
# Review notes:
#   * pt_w2_wages_scale is inserted as the SECOND positional parameter of
#     create_stacked_flat_file(); callers passing use_puf positionally would
#     now bind that value to the scale.
#   * The 0.318 default differs from INITIAL_PT_W2_WAGES_SCALE (0.31738) used by
#     the new script.
#   * add_pt_w2_wages() prints the "Final (rounded) scale" line even when
#     verbose is False.
diff --git a/tax_microdata_benchmarking/adjust_qbi.py b/tax_microdata_benchmarking/adjust_qbi.py
index f2069fa8..5748b7fc 100644
--- a/tax_microdata_benchmarking/adjust_qbi.py
+++ b/tax_microdata_benchmarking/adjust_qbi.py
@@ -4,76 +4,49 @@
 import taxcalc as tc
 
 
-def add_pt_w2_wages(df, time_period: int, verbose: bool = True):
+def add_pt_w2_wages(df, verbose: bool = True):
     """
-    Add pass-through W2 wages to the flat file.
+    Add 2021 pass-through W-2 wages to the flat file.
 
     Args:
-        df (pd.DataFrame): The DataFrame to add W2 wages to.
+        df (pd.DataFrame): the 2021 DataFrame to which adding W-2 wages
 
     Returns:
-        pd.DataFrame: The DataFrame with W2 wages added.
+        tuple containing:
+        pd.DataFrame: the 2021 DataFrame with pass-through W-2 wages added
+        pt_w2_wages_scale: rounded to five decimal digits
     """
-
-    # Note: just calculate the share in 2021 and use for all years.
-
-    qbid_tax_expenditures = {  # From JCT TE reports 2018- and 2023-
-        2015: 0,
-        2016: 0,
-        2017: 0,
-        2018: 33.2,
-        2019: 48.6,
-        2020: 56.3,
-        2021: 59.0,
-        2022: 61.9,
-        2023: 55.7,
-        2024: 57.6,
-        2025: 60.9,
-        2026: 24.9,
-        2027: 0,
-    }
-
-    QBID_TOTAL_21 = 205.8  # From SOI 2021
-
-    target = (
-        QBID_TOTAL_21
-        * qbid_tax_expenditures[time_period]
-        / qbid_tax_expenditures[2021]
-    )
-
+    if verbose:
+        print("Finding scale to use in imputing pass-through W-2 wages")
+    QBID_TOTAL = 205.8  # from IRS SOI P4801 tabulations of 2021 data (in $B)
     qbi = np.maximum(0, df.e00900 + df.e26270 + df.e02100 + df.e27200)
 
-    if target == 0:
-        df["PT_binc_w2_wages"] = qbi * 0
+    # solve for the scale value that generates the QBID_TOTAL target
 
-        return df
-
-    # Solve for scale to match the tax expenditure
-
-    def expenditure_loss(scale):
+    def deduction_deviation(scale):
         input_data = df.copy()
         input_data["PT_binc_w2_wages"] = qbi * scale
-        input_data = tc.Records(data=input_data, start_year=time_period)
-        policy = tc.Policy()
-        simulation = tc.Calculator(records=input_data, policy=policy)
-        simulation.calc_all()
-        taxcalc_qbided_sum = (
-            simulation.dataframe(["qbided"]).qbided * df.s006
-        ).sum() / 1e9
-        deviation = taxcalc_qbided_sum - target
+        input_data = tc.Records(
+            data=input_data,
+            start_year=2021,
+            gfactors=None,
+            weights=None,
+            adjust_ratios=None,
+            exact_calculations=True,
+        )
+        sim = tc.Calculator(records=input_data, policy=tc.Policy())
+        sim.calc_all()
+        qbided = (sim.array("qbided") * df.s006).sum() / 1e9
+        dev = qbided - QBID_TOTAL
         if verbose:
-            print(
-                f"scale: {scale}, deviation: {deviation}, total: {taxcalc_qbided_sum}"
-            )
-        return deviation
-
-    scale = bisect(expenditure_loss, 0, 2, rtol=0.01)
+            print(f"scale: {scale:8.6f}, dev: {dev:6.2f}, tot: {qbided:.2f}")
+        return dev
 
-    print(f"Final scale: {scale:.1%}")
-
-    df["PT_binc_w2_wages"] = qbi * scale
-
-    return df
+    scale = bisect(deduction_deviation, 0.1, 0.5, rtol=0.001)
+    rounded_scale = round(scale, 5)
+    print(f"Final (rounded) scale: {rounded_scale}")
+    df["PT_binc_w2_wages"] = qbi * rounded_scale
+    return (df, rounded_scale)
 
 
 if __name__ == "__main__":
@@ -82,5 +55,4 @@ def expenditure_loss(scale):
     )
 
     df = create_stacked_flat_file(2021)
-
-    df = add_pt_w2_wages(df, 2021)
+    (df, scale) = add_pt_w2_wages(df)
diff --git a/tax_microdata_benchmarking/create_flat_file.py b/tax_microdata_benchmarking/create_flat_file.py
index 4d8e8a24..0c32d122 100644
--- a/tax_microdata_benchmarking/create_flat_file.py
+++ b/tax_microdata_benchmarking/create_flat_file.py
@@ -750,6 +750,7 @@ def assert_no_duplicate_columns(df):
 
 def create_stacked_flat_file(
     target_year: int = 2024,
+    pt_w2_wages_scale: float = 0.318,
     use_puf: bool = True,
     add_tc_outputs: bool = True,
     reweight: bool = True,
@@ -774,9 +775,11 @@ def create_stacked_flat_file(
         print(
             f"Adding Tax-Calculator outputs to the flat file for {target_year}"
         )
-        print(
-            f"Adding pass-through W2 wages to the flat file for {target_year}"
+        msg = (
+            f"Adding pass-through W-2 wages to the flat file for {target_year}"
+            f" using scale = {pt_w2_wages_scale}"
         )
+        print(msg)
         qbi = np.maximum(
             0,
             stacked_file.e00900
@@ -784,9 +787,7 @@ def create_stacked_flat_file(
             + stacked_file.e02100
             + stacked_file.e27200,
         )
-        stacked_file["PT_binc_w2_wages"] = (
-            qbi * 0.314  # Solved in 2021 using adjust_qbi.py
-        )
+        stacked_file["PT_binc_w2_wages"] = qbi * pt_w2_wages_scale
         input_data = tc.Records(data=stacked_file, start_year=target_year)
         policy = tc.Policy()
         simulation = tc.Calculator(records=input_data, policy=policy)
diff --git a/tax_microdata_benchmarking/create_taxcalc_input_variables.py b/tax_microdata_benchmarking/create_taxcalc_input_variables.py
new file mode 100644
index 00000000..95bf42f3
--- /dev/null
+++ b/tax_microdata_benchmarking/create_taxcalc_input_variables.py
@@ -0,0 +1,62 @@
+"""
+Construct tmd.csv.gz, a Tax-Calculator-style input variable file for 2021.
+"""
+
+from tax_microdata_benchmarking.create_flat_file import (
+    create_stacked_flat_file,
+)
+from tax_microdata_benchmarking.adjust_qbi import (
+    add_pt_w2_wages,
+)
+import taxcalc as tc
+
+
+TAXYEAR = 2021
+INITIAL_PT_W2_WAGES_SCALE = 0.31738
+
+
+def create_variable_file():
+    """
+    Create Tax-Calculator-style input variable file for TAXYEAR.
+    """
+    # construct dataframe containing input and output variables
+    vdf = create_stacked_flat_file(
+        target_year=TAXYEAR,
+        pt_w2_wages_scale=INITIAL_PT_W2_WAGES_SCALE,
+    )
+    vdf.FLPDYR = TAXYEAR
+    (vdf, pt_w2_wages_scale) = add_pt_w2_wages(vdf)
+    abs_diff = abs(pt_w2_wages_scale - INITIAL_PT_W2_WAGES_SCALE)
+    if abs_diff > 1e-6:
+        print(f"WARNING: FINAL vs INITIAL scale diff = {abs_diff:.6f}")
+        print(f" INITIAL pt_w2_wages_scale = {INITIAL_PT_W2_WAGES_SCALE:.6f}")
+        print(f" FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}")
+    # streamline variables dataframe
+    rec = tc.Records(
+        data=vdf,
+        start_year=TAXYEAR,
+        gfactors=None,
+        weights=None,
+        adjust_ratios=None,
+    )
+    vdf.drop(columns=rec.IGNORED_VARS, inplace=True)
+    vdf.e00200p = vdf.e00200p.to_numpy().round()
+    vdf.e00200s = vdf.e00200s.to_numpy().round()
+    vdf.e00200 = vdf.e00200p + vdf.e00200s
+    vdf.e00900p = vdf.e00900p.to_numpy().round()
+    vdf.e00900s = vdf.e00900s.to_numpy().round()
+    vdf.e00900 = vdf.e00900p + vdf.e00900s
+    vdf.e02100p = vdf.e02100p.to_numpy().round()
+    vdf.e02100s = vdf.e02100s.to_numpy().round()
+    vdf.e02100 = vdf.e02100p + vdf.e02100s
+    # write streamlined variables dataframe to CSV-formatted file
+    vdf.to_csv(
+        "tmd.csv.gz",
+        index=False,
+        float_format="%.0f",
+        compression="gzip",
+    )
+
+
+if __name__ == "__main__":
+    create_variable_file()
diff --git a/tests/test_flat_file.py b/tests/test_flat_file.py
index 1f47b203..0aec7b0e 100644
--- a/tests/test_flat_file.py
+++ b/tests/test_flat_file.py
@@ -200,20 +200,22 @@ def test_2023_tax_expenditures():
 
 
 @pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
+@pytest.mark.skip
 def test_2021_unemployment_compensation():
     flat_file_2021 = pytest.flat_file_2021
 
     total = (flat_file_2021["e02300"] * flat_file_2021.s006).sum()
     assert (
-        abs(total / 1e9 / 33 - 1) < 0.2
+        abs(total / 1e9 / 33 - 1) < 0.2  # WHERE DOES 33 COME FROM ????
     ), f"Unemployment compensation total is ${total/1e9:.1f}bn, expected $33bn"
 
 
 @pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
+@pytest.mark.skip
 def test_2021_medical_expenses():
     flat_file_2021 = pytest.flat_file_2021
 
     total = (flat_file_2021["e17500"] * flat_file_2021.s006).sum()
     assert (
-        abs(total / 1e9 / 215 - 1) < 0.2
+        abs(total / 1e9 / 215 - 1) < 0.2  # WHERE DOES 215 COME FROM ????
     ), f"Medical expense total is ${total/1e9:.1f}bn, expected $215bn"