Merge pull request #148 from PSLmodels/fix-reweighting

Remove use of "taxable returns" statistics from reweighting calculations
PSLmodels · Jul 17, 2024 · eb57dbd
2 parents b02a413 + bdbd0dd
commit eb57dbd
Show file tree

Hide file tree

Showing 5 changed files with 48 additions and 84 deletions.
diff --git a/tax_microdata_benchmarking/datasets/puf.py b/tax_microdata_benchmarking/datasets/puf.py
@@ -23,8 +23,6 @@
 DEP_AGE_RNG = np.random.default_rng(seed=24354657)
 DEP_GENDER_RNG = np.random.default_rng(seed=74382916)
 EARN_SPLIT_RNG = np.random.default_rng(seed=18374659)
-N24_UNCAP_RNG = np.random.default_rng(seed=34659781)
-DEP_XAGE_RNG = np.random.default_rng(seed=46357918)
 
 
 def impute_missing_demographics(
@@ -85,17 +83,6 @@ def impute_missing_demographics(
         ]
     )
 
-    # change AGEDP? values for tax units with zero N24 in an
-    # attempt to generate more CTC-eligible children;
-    # also change AGEDP?==4 values to 5 to offset some of the
-    # rise in EITC-eligible children caused by the CTC changes
-    n24_is_zero = puf_combined.N24 == 0
-    for var in ["AGEDP1", "AGEDP2", "AGEDP3"]:
-        puf_combined[var] = np.where(n24_is_zero, 3, puf_combined[var])
-        puf_combined[var] = np.where(
-            puf_combined[var] == 4, 5, puf_combined[var]
-        )
-
     return puf_combined
 
 
@@ -215,19 +202,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     )
     puf["household_id"] = puf.RECID
     puf["household_weight"] = puf.S006
-
-    # uncap XTOT using IRS statistics on uncapped N24 variable:
-    # 2015 PUF N24 variable is capped at three
-    # 2015 PUF XTOT variable is capped at five for JOINT filers; four otherwise
-    # Approach is to impute xtotuc by adding extra uncapped N24 dependents
-    n24vals = [3, 4, 5, 6, 7, 8]
-    n24cnts = [4843, 1239, 288, 65, 34, 13]
-    n24prbs = n24cnts / np.sum(n24cnts, dtype=float)
-    uncapped_n24 = N24_UNCAP_RNG.choice(n24vals, size=len(puf.N24), p=n24prbs)
-    n24uc = np.where(puf.N24 < 3, puf.N24, uncapped_n24)
-    extra = n24uc - puf.N24
-    xtotuc = puf.XTOT + extra
-    puf["exemptions_count"] = xtotuc
+    puf["exemptions_count"] = puf.XTOT
 
     return puf
 
@@ -376,9 +351,8 @@ def generate(self, puf: pd.DataFrame, demographics: pd.DataFrame):
                 self.add_spouse(row, tax_unit_id)
                 exemptions -= 1
 
-            max_known_age = 0
-            for j in range(exemptions):
-                self.add_dependent(row, tax_unit_id, j, max_known_age)
+            for j in range(min(3, exemptions)):
+                self.add_dependent(row, tax_unit_id, j)
 
         groups_assumed_to_be_tax_unit_like = [
             "family",
@@ -470,7 +444,7 @@ def add_spouse(self, row, tax_unit_id):
             if self.variable_to_entity[key] == "person":
                 self.holder[key].append(row[key] * (1 - self.earn_splits[-1]))
 
-    def add_dependent(self, row, tax_unit_id, dependent_id, max_known_age):
+    def add_dependent(self, row, tax_unit_id, dependent_id):
         person_id = int(tax_unit_id * 1e2 + 3 + dependent_id)
         self.holder["person_id"].append(person_id)
         self.holder["person_tax_unit_id"].append(tax_unit_id)
@@ -480,14 +454,7 @@ def add_dependent(self, row, tax_unit_id, dependent_id, max_known_age):
         self.holder["is_tax_unit_spouse"].append(False)
         self.holder["is_tax_unit_dependent"].append(True)
 
-        if dependent_id < 3:
-            age = decode_age_dependent(round(row[f"AGEDP{dependent_id + 1}"]))
-            max_known_age = max(age, max_known_age)
-        else:  # AGEDP? not available so impute age uniformly over [A,16] range
-            if max_known_age < 16:
-                age = DEP_XAGE_RNG.choice(range(max_known_age + 1, 16 + 1))
-            else:
-                age = 30
+        age = decode_age_dependent(round(row[f"AGEDP{dependent_id + 1}"]))
         self.holder["age"].append(age)
 
         for key in FINANCIAL_SUBSET:

diff --git a/tax_microdata_benchmarking/examination/results3.md b/tax_microdata_benchmarking/examination/results3.md
@@ -1,7 +1,7 @@
 Phase 3 Data Examination Results
 ================================
 
-**PRELIMINARY RESULTS AS OF 2024-07-15 (after merge of PR 144)**
+**PRELIMINARY RESULTS AS OF 2024-07-16 (after merge of PR 148)**
 
 **FINAL RESULTS EXPECTED LATER IN JULY 2024**
 
@@ -30,7 +30,7 @@ merge of PR #2772 on 2024-07-11).
 | CY23 Amount | CY26 Amount | Estimate Source |
 | ---:   | ---:   | :---   |
 | 1580.0 | 1829.9 | CBO    |
-| 1519.9 | 1763.0 | Tax-Calculator + phase 3 dataset |
+| 1518.3 | 1761.0 | Tax-Calculator + phase 3 dataset |
 | 1489.3 | 1711.0 | Tax-Calculator + taxdata dataset |
 
 <br>
@@ -40,7 +40,7 @@ merge of PR #2772 on 2024-07-11).
 | CY23 Amount | CY26 Amount | Estimate Source |
 | ---:   | ---:   | :---   |
 | 2512.3 | 2849.4 | CBO    |
-| 2247.0 | 2820.2 | Tax-Calculator + phase 3 dataset |
+| 2215.0 | 2786.5 | Tax-Calculator + phase 3 dataset |
 | 2247.9 | 2742.2 | Tax-Calculator + taxdata dataset |
 
 <br>
@@ -51,7 +51,7 @@ merge of PR #2772 on 2024-07-11).
 | ---:   | ---:   | :---   |
 | 122.1  | 57.6   | JCT    |
 | 108.6  | 55.7   | TSY    |
-| 108.9  | 38.9   | Tax-Calculator + phase 3 dataset |
+| 132.2  | 48.6   | Tax-Calculator + phase 3 dataset |
 | 126.3  | 43.1   | Tax-Calculator + taxdata dataset |
 
 <br>
@@ -62,7 +62,7 @@ merge of PR #2772 on 2024-07-11).
 | ---:   | ---:   | :---   |
 | 71.9   | 78.0   | JCT    |
 | 63.6   | 71.2   | TSY    |
-| 82.1   | 92.4   | Tax-Calculator + phase 3 dataset |
+| 78.6   | 88.0   | Tax-Calculator + phase 3 dataset |
 | 73.5   | 82.0   | Tax-Calculator + taxdata dataset |
 
 <br>
@@ -73,7 +73,7 @@ merge of PR #2772 on 2024-07-11).
 | ---:   | ---:   | :---   |
 | 45.9   |  56.4  | JCT    |
 | 31.4   |  38.4  | TSY    |
-| 51.9   |  69.3  | Tax-Calculator + phase 3 dataset |
+| 52.2   |  70.9  | Tax-Calculator + phase 3 dataset |
 | 62.7   |  89.5  | Tax-Calculator + taxdata dataset |
 
 <br>
@@ -84,7 +84,7 @@ merge of PR #2772 on 2024-07-11).
 | ---:   | ---:   | :---   |
 | -56.5  | -53.8  | JCT    |
 |  ----  |  ----  | TSY    |
-| -53.9  | -50.7  | Tax-Calculator + phase 3 dataset |
+| -52.9  | -51.2  | Tax-Calculator + phase 3 dataset |
 | -55.8  | -52.1  | Tax-Calculator + taxdata dataset |
 
 <br>
@@ -95,7 +95,7 @@ merge of PR #2772 on 2024-07-11).
 | ---:   | ---:   | :---   |
 | 259.3  | 239.8  | JCT    |
 | 153.9  | 182.4  | TSY    |
-| 223.4  | 217.0  | Tax-Calculator + phase 3 dataset |
+| 221.7  | 221.9  | Tax-Calculator + phase 3 dataset |
 | 217.7  | 223.6  | Tax-Calculator + taxdata dataset |
 
 <br>
@@ -106,7 +106,7 @@ merge of PR #2772 on 2024-07-11).
 | ---:   | ---:   | :---   |
 | 56.2   |  0.0   | JCT    |
 | 50.4   |  0.0   | TSY    |
-| 55.9   |  0.0   | Tax-Calculator + phase 3 dataset |
+| 54.1   |  0.0   | Tax-Calculator + phase 3 dataset |
 | 19.3   |  0.0   | Tax-Calculator + taxdata dataset |
 
 <br>
@@ -117,5 +117,5 @@ merge of PR #2772 on 2024-07-11).
 | ---:   | ---:   | :---   |
 |  21.2  | 151.3  | JCT    |
 |  26.5  | 149.0  | TSY    |
-|  20.7  | 161.5  | Tax-Calculator + phase 3 dataset |
+|  20.4  | 160.5  | Tax-Calculator + phase 3 dataset |
 |  29.4  | 185.5  | Tax-Calculator + taxdata dataset |
diff --git a/tax_microdata_benchmarking/examination/taxcalculator/tmd-23.res-expect b/tax_microdata_benchmarking/examination/taxcalculator/tmd-23.res-expect
@@ -1,27 +1,27 @@
 Weighted Tax Reform Totals by Baseline Expanded-Income Decile
     Returns    ExpInc    IncTax    PayTax     LSTax    AllTax
- A   202.19   18596.5    2247.0    1519.9       0.0    3767.0
+ A   201.79   18540.2    2215.0    1518.3       0.0    3733.3
 
 ==> tmd-23-#-cgqd-#-tab.text <==
- A   202.19   18596.5     223.4       0.0       0.0     223.4
+ A   201.79   18540.2     221.7       0.0       0.0     221.7
 
 ==> tmd-23-#-clp-#-tab.text <==
- A   202.19   18596.5       0.0       0.0       0.0       0.0
+ A   201.79   18540.2       0.0       0.0       0.0       0.0
 
 ==> tmd-23-#-ctc-#-tab.text <==
- A   202.19   18596.5     108.9       0.0       0.0     108.9
+ A   201.79   18540.2     132.2       0.0       0.0     132.2
 
 ==> tmd-23-#-eitc-#-tab.text <==
- A   202.19   18596.5      82.1       0.0       0.0      82.1
+ A   201.79   18540.2      78.6       0.0       0.0      78.6
 
 ==> tmd-23-#-niit-#-tab.text <==
- A   202.19   18596.5     -53.9       0.0       0.0     -53.9
+ A   201.79   18540.2     -52.9       0.0       0.0     -52.9
 
 ==> tmd-23-#-qbid-#-tab.text <==
- A   202.19   18596.5      55.9       0.0       0.0      55.9
+ A   201.79   18540.2      54.1       0.0       0.0      54.1
 
 ==> tmd-23-#-salt-#-tab.text <==
- A   202.19   18596.5      20.7       0.0       0.0      20.7
+ A   201.79   18540.2      20.4       0.0       0.0      20.4
 
 ==> tmd-23-#-ssben-#-tab.text <==
- A   202.19   18596.5      51.9       0.0       0.0      51.9
+ A   201.79   18540.2      52.2       0.0       0.0      52.2
diff --git a/tax_microdata_benchmarking/examination/taxcalculator/tmd-26.res-expect b/tax_microdata_benchmarking/examination/taxcalculator/tmd-26.res-expect
@@ -1,27 +1,27 @@
 Weighted Tax Reform Totals by Baseline Expanded-Income Decile
     Returns    ExpInc    IncTax    PayTax     LSTax    AllTax
- A   208.79   21176.3    2820.2    1763.0       0.0    4583.1
+ A   208.38   21107.7    2786.5    1761.0       0.0    4547.4
 
 ==> tmd-26-#-cgqd-#-tab.text <==
- A   208.79   21176.3     217.0       0.0       0.0     217.0
+ A   208.38   21107.7     221.9       0.0       0.0     221.9
 
 ==> tmd-26-#-clp-#-tab.text <==
- A   208.79   21176.3       0.0       0.0       0.0       0.0
+ A   208.38   21107.7       0.0       0.0       0.0       0.0
 
 ==> tmd-26-#-ctc-#-tab.text <==
- A   208.79   21176.3      38.9       0.0       0.0      38.9
+ A   208.38   21107.7      48.6       0.0       0.0      48.6
 
 ==> tmd-26-#-eitc-#-tab.text <==
- A   208.79   21176.3      92.4       0.0       0.0      92.4
+ A   208.38   21107.7      88.0       0.0       0.0      88.0
 
 ==> tmd-26-#-niit-#-tab.text <==
- A   208.79   21176.3     -50.7       0.0       0.0     -50.7
+ A   208.38   21107.7     -51.2       0.0       0.0     -51.2
 
 ==> tmd-26-#-qbid-#-tab.text <==
- A   208.79   21176.3       0.0       0.0       0.0       0.0
+ A   208.38   21107.7       0.0       0.0       0.0       0.0
 
 ==> tmd-26-#-salt-#-tab.text <==
- A   208.79   21176.3     161.5       0.0       0.0     161.5
+ A   208.38   21107.7     160.5       0.0       0.0     160.5
 
 ==> tmd-26-#-ssben-#-tab.text <==
- A   208.79   21176.3      69.3       0.0       0.0      69.3
+ A   208.38   21107.7      70.9       0.0       0.0      70.9
diff --git a/tax_microdata_benchmarking/utils/reweight.py b/tax_microdata_benchmarking/utils/reweight.py
@@ -75,42 +75,30 @@ def build_loss_matrix(df):
         df = tc_to_soi(df, time_period)
         agi = df["adjusted_gross_income"].values
         filer = df["is_tax_filer"].values
-        taxable = df["is_taxable"].values
         targets_array = []
         soi_subset = targets
         soi_subset = soi_subset[soi_subset.Year == time_period]
         agi_level_targeted_variables = [
             "adjusted_gross_income",
             "count",
             "employment_income",
-            # "state_and_local_tax_deductions",
         ]
         aggregate_level_targeted_variables = [
             "business_net_losses",
             "business_net_profits",
             "capital_gains_distributions",
             "capital_gains_gross",
             "capital_gains_losses",
-            # "charitable_contributions_deduction",
-            # "count_of_exemptions",
             "estate_income",
             "estate_losses",
             "exempt_interest",
-            # "exemptions",
-            # "interest_paid_deductions",
             "ira_distributions",
-            # "itemized_real_estate_tax_deductions",
-            # "itemized_state_income_and_sales_tax_deductions",
-            # "medical_expense_deductions_capped",
-            # "medical_expense_deductions_uncapped",
             "ordinary_dividends",
             "partnership_and_s_corp_income",
             "partnership_and_s_corp_losses",
             "qualified_dividends",
-            # "qualified_business_income_deduction",
             "rent_and_royalty_net_income",
             "rent_and_royalty_net_losses",
-            # "standard_deduction",
             "taxable_interest_income",
             "taxable_pension_income",
             "taxable_social_security",
@@ -136,6 +124,9 @@ def build_loss_matrix(df):
             )
         ]
         for _, row in soi_subset.iterrows():
+            if row["Taxable only"]:
+                continue  # exclude "taxable returns" statistics
+
             mask = (
                 (agi >= row["AGI lower bound"])
                 * (agi < row["AGI upper bound"])
@@ -154,9 +145,6 @@ def build_loss_matrix(df):
             elif row["Filing status"] == "Married Filing Separately":
                 mask *= df["filing_status"].values == "SEPARATE"
 
-            if row["Taxable only"]:
-                mask *= taxable > 0
-
             values = df[row["Variable"]].values
 
             if row["Count"]:
@@ -173,11 +161,20 @@ def build_loss_matrix(df):
             variable_label = row["Variable"].replace("_", " ")
 
             if row["Count"] and not row["Variable"] == "count":
-                label = f"{variable_label}/count/AGI in {agi_range_label}/{taxable_label}/{filing_status_label}"
+                label = (
+                    f"{variable_label}/count/AGI in "
+                    f"{agi_range_label}/{taxable_label}/{filing_status_label}"
+                )
             elif row["Variable"] == "count":
-                label = f"{variable_label}/count/AGI in {agi_range_label}/{taxable_label}/{filing_status_label}"
+                label = (
+                    f"{variable_label}/count/AGI in "
+                    f"{agi_range_label}/{taxable_label}/{filing_status_label}"
+                )
             else:
-                label = f"{variable_label}/total/AGI in {agi_range_label}/{taxable_label}/{filing_status_label}"
+                label = (
+                    f"{variable_label}/total/AGI in "
+                    f"{agi_range_label}/{taxable_label}/{filing_status_label}"
+                )
 
             if label not in loss_matrix.columns:
                 loss_matrix[label] = mask * values