Merge pull request #66 from martinholmer/tmd-var-file

Create Tax-Calculator-style input variable file for 2021
PSLmodels · May 1, 2024 · bbc4ab9 · bbc4ab9
2 parents ccac06a + 4c180b9
commit bbc4ab9
Show file tree

Hide file tree

Showing 4 changed files with 103 additions and 66 deletions.
diff --git a/tax_microdata_benchmarking/adjust_qbi.py b/tax_microdata_benchmarking/adjust_qbi.py
@@ -4,76 +4,49 @@
 import taxcalc as tc
 
 
-def add_pt_w2_wages(df, time_period: int, verbose: bool = True):
+def add_pt_w2_wages(df, verbose: bool = True):
     """
-    Add pass-through W2 wages to the flat file.
+    Add 2021 pass-through W-2 wages to the flat file.
 
     Args:
-        df (pd.DataFrame): The DataFrame to add W2 wages to.
+        df (pd.DataFrame): the 2021 DataFrame to which W-2 wages are added
 
     Returns:
-        pd.DataFrame: The DataFrame with W2 wages added.
+        tuple containing:
+          pd.DataFrame: the 2021 DataFrame with pass-through W-2 wages added
+          float: pt_w2_wages_scale, rounded to five decimal places
     """
-
-    # Note: just calculate the share in 2021 and use for all years.
-
-    qbid_tax_expenditures = {  # From JCT TE reports 2018- and 2023-
-        2015: 0,
-        2016: 0,
-        2017: 0,
-        2018: 33.2,
-        2019: 48.6,
-        2020: 56.3,
-        2021: 59.0,
-        2022: 61.9,
-        2023: 55.7,
-        2024: 57.6,
-        2025: 60.9,
-        2026: 24.9,
-        2027: 0,
-    }
-
-    QBID_TOTAL_21 = 205.8  # From SOI 2021
-
-    target = (
-        QBID_TOTAL_21
-        * qbid_tax_expenditures[time_period]
-        / qbid_tax_expenditures[2021]
-    )
-
+    if verbose:
+        print("Finding scale to use in imputing pass-through W-2 wages")
+    QBID_TOTAL = 205.8  # from IRS SOI P4801 tabulations of 2021 data (in $B)
     qbi = np.maximum(0, df.e00900 + df.e26270 + df.e02100 + df.e27200)
 
-    if target == 0:
-        df["PT_binc_w2_wages"] = qbi * 0
+    # solve for the scale value that generates the QBID_TOTAL target
 
-        return df
-
-    # Solve for scale to match the tax expenditure
-
-    def expenditure_loss(scale):
+    def deduction_deviation(scale):
         input_data = df.copy()
         input_data["PT_binc_w2_wages"] = qbi * scale
-        input_data = tc.Records(data=input_data, start_year=time_period)
-        policy = tc.Policy()
-        simulation = tc.Calculator(records=input_data, policy=policy)
-        simulation.calc_all()
-        taxcalc_qbided_sum = (
-            simulation.dataframe(["qbided"]).qbided * df.s006
-        ).sum() / 1e9
-        deviation = taxcalc_qbided_sum - target
+        input_data = tc.Records(
+            data=input_data,
+            start_year=2021,
+            gfactors=None,
+            weights=None,
+            adjust_ratios=None,
+            exact_calculations=True,
+        )
+        sim = tc.Calculator(records=input_data, policy=tc.Policy())
+        sim.calc_all()
+        qbided = (sim.array("qbided") * df.s006).sum() / 1e9
+        dev = qbided - QBID_TOTAL
         if verbose:
-            print(
-                f"scale: {scale}, deviation: {deviation}, total: {taxcalc_qbided_sum}"
-            )
-        return deviation
-
-    scale = bisect(expenditure_loss, 0, 2, rtol=0.01)
+            print(f"scale: {scale:8.6f}, dev: {dev:6.2f}, tot: {qbided:.2f}")
+        return dev
 
-    print(f"Final scale: {scale:.1%}")
-
-    df["PT_binc_w2_wages"] = qbi * scale
-
-    return df
+    scale = bisect(deduction_deviation, 0.1, 0.5, rtol=0.001)
+    rounded_scale = round(scale, 5)
+    print(f"Final (rounded) scale: {rounded_scale}")
+    df["PT_binc_w2_wages"] = qbi * rounded_scale
+    return (df, rounded_scale)
 
 
 if __name__ == "__main__":
@@ -82,5 +55,4 @@ def expenditure_loss(scale):
     )
 
     df = create_stacked_flat_file(2021)
-
-    df = add_pt_w2_wages(df, 2021)
+    (df, scale) = add_pt_w2_wages(df)
diff --git a/tax_microdata_benchmarking/create_flat_file.py b/tax_microdata_benchmarking/create_flat_file.py
@@ -750,6 +750,7 @@ def assert_no_duplicate_columns(df):
 
 def create_stacked_flat_file(
     target_year: int = 2024,
+    pt_w2_wages_scale: float = 0.318,
     use_puf: bool = True,
     add_tc_outputs: bool = True,
     reweight: bool = True,
@@ -774,19 +775,19 @@ def create_stacked_flat_file(
         print(
             f"Adding Tax-Calculator outputs to the flat file for {target_year}"
         )
-        print(
-            f"Adding pass-through W2 wages to the flat file for {target_year}"
+        msg = (
+            f"Adding pass-through W-2 wages to the flat file for {target_year}"
+            f" using scale = {pt_w2_wages_scale}"
         )
+        print(msg)
         qbi = np.maximum(
             0,
             stacked_file.e00900
             + stacked_file.e26270
             + stacked_file.e02100
             + stacked_file.e27200,
         )
-        stacked_file["PT_binc_w2_wages"] = (
-            qbi * 0.314  # Solved in 2021 using adjust_qbi.py
-        )
+        stacked_file["PT_binc_w2_wages"] = qbi * pt_w2_wages_scale
         input_data = tc.Records(data=stacked_file, start_year=target_year)
         policy = tc.Policy()
         simulation = tc.Calculator(records=input_data, policy=policy)

diff --git a/tax_microdata_benchmarking/create_taxcalc_input_variables.py b/tax_microdata_benchmarking/create_taxcalc_input_variables.py
@@ -0,0 +1,62 @@
+"""
+Construct tmd.csv.gz, a Tax-Calculator-style input variable file for 2021.
+"""
+
+from tax_microdata_benchmarking.create_flat_file import (
+    create_stacked_flat_file,
+)
+from tax_microdata_benchmarking.adjust_qbi import (
+    add_pt_w2_wages,
+)
+import taxcalc as tc
+
+
+TAXYEAR = 2021
+INITIAL_PT_W2_WAGES_SCALE = 0.31738
+
+
+def create_variable_file():
+    """
+    Create Tax-Calculator-style input variable file for TAXYEAR.
+    """
+    # construct dataframe containing input and output variables
+    vdf = create_stacked_flat_file(
+        target_year=TAXYEAR,
+        pt_w2_wages_scale=INITIAL_PT_W2_WAGES_SCALE,
+    )
+    vdf.FLPDYR = TAXYEAR
+    (vdf, pt_w2_wages_scale) = add_pt_w2_wages(vdf)
+    abs_diff = abs(pt_w2_wages_scale - INITIAL_PT_W2_WAGES_SCALE)
+    if abs_diff > 1e-6:
+        print(f"WARNING: FINAL vs INITIAL scale diff = {abs_diff:.6f}")
+        print(f"  INITIAL pt_w2_wages_scale = {INITIAL_PT_W2_WAGES_SCALE:.6f}")
+        print(f"    FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}")
+    # streamline variables dataframe
+    rec = tc.Records(
+        data=vdf,
+        start_year=TAXYEAR,
+        gfactors=None,
+        weights=None,
+        adjust_ratios=None,
+    )
+    vdf.drop(columns=rec.IGNORED_VARS, inplace=True)
+    vdf.e00200p = vdf.e00200p.to_numpy().round()
+    vdf.e00200s = vdf.e00200s.to_numpy().round()
+    vdf.e00200 = vdf.e00200p + vdf.e00200s
+    vdf.e00900p = vdf.e00900p.to_numpy().round()
+    vdf.e00900s = vdf.e00900s.to_numpy().round()
+    vdf.e00900 = vdf.e00900p + vdf.e00900s
+    vdf.e02100p = vdf.e02100p.to_numpy().round()
+    vdf.e02100s = vdf.e02100s.to_numpy().round()
+    vdf.e02100 = vdf.e02100p + vdf.e02100s
+    # write streamlined variables dataframe to CSV-formatted file
+    vdf.to_csv(
+        "tmd.csv.gz",
+        index=False,
+        float_format="%.0f",
+        compression="gzip",
+    )
+
+
+if __name__ == "__main__":
+    create_variable_file()
diff --git a/tests/test_flat_file.py b/tests/test_flat_file.py
@@ -200,20 +200,22 @@ def test_2023_tax_expenditures():
 
 
 @pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
+@pytest.mark.skip
 def test_2021_unemployment_compensation():
     flat_file_2021 = pytest.flat_file_2021
 
     total = (flat_file_2021["e02300"] * flat_file_2021.s006).sum()
     assert (
-        abs(total / 1e9 / 33 - 1) < 0.2
+        abs(total / 1e9 / 33 - 1) < 0.2  # WHERE DOES 33 COME FROM ????
     ), f"Unemployment compensation total is ${total/1e9:.1f}bn, expected $33bn"
 
 
 @pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
+@pytest.mark.skip
 def test_2021_medical_expenses():
     flat_file_2021 = pytest.flat_file_2021
 
     total = (flat_file_2021["e17500"] * flat_file_2021.s006).sum()
     assert (
-        abs(total / 1e9 / 215 - 1) < 0.2
+        abs(total / 1e9 / 215 - 1) < 0.2  # WHERE DOES 215 COME FROM ????
     ), f"Medical expense total is ${total/1e9:.1f}bn, expected $215bn"