Skip to content

Commit

Permalink
Merge pull request #148 from PSLmodels/fix-reweighting
Browse files (browse the repository at this point in the history)
Remove use of "taxable returns" statistics from reweighting calculations
  • Loading branch information
nikhilwoodruff authored Jul 17, 2024
2 parents b02a413 + bdbd0dd commit eb57dbd
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 84 deletions.
43 changes: 5 additions & 38 deletions tax_microdata_benchmarking/datasets/puf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
DEP_AGE_RNG = np.random.default_rng(seed=24354657)
DEP_GENDER_RNG = np.random.default_rng(seed=74382916)
EARN_SPLIT_RNG = np.random.default_rng(seed=18374659)
N24_UNCAP_RNG = np.random.default_rng(seed=34659781)
DEP_XAGE_RNG = np.random.default_rng(seed=46357918)


def impute_missing_demographics(
Expand Down Expand Up @@ -85,17 +83,6 @@ def impute_missing_demographics(
]
)

# change AGEDP? values for tax units with zero N24 in an
# attempt to generate more CTC-eligible children;
# also change AGEDP?==4 values to 5 to offset some of the
# rise in EITC-eligible children caused by the CTC changes
n24_is_zero = puf_combined.N24 == 0
for var in ["AGEDP1", "AGEDP2", "AGEDP3"]:
puf_combined[var] = np.where(n24_is_zero, 3, puf_combined[var])
puf_combined[var] = np.where(
puf_combined[var] == 4, 5, puf_combined[var]
)

return puf_combined


Expand Down Expand Up @@ -215,19 +202,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
)
puf["household_id"] = puf.RECID
puf["household_weight"] = puf.S006

# uncap XTOT using IRS statistics on uncapped N24 variable:
# 2015 PUF N24 variable is capped at three
# 2015 PUF XTOT variable is capped at five for JOINT filers; four otherwise
# Approach is to impute xtotuc by adding extra uncapped N24 dependents
n24vals = [3, 4, 5, 6, 7, 8]
n24cnts = [4843, 1239, 288, 65, 34, 13]
n24prbs = n24cnts / np.sum(n24cnts, dtype=float)
uncapped_n24 = N24_UNCAP_RNG.choice(n24vals, size=len(puf.N24), p=n24prbs)
n24uc = np.where(puf.N24 < 3, puf.N24, uncapped_n24)
extra = n24uc - puf.N24
xtotuc = puf.XTOT + extra
puf["exemptions_count"] = xtotuc
puf["exemptions_count"] = puf.XTOT

return puf

Expand Down Expand Up @@ -376,9 +351,8 @@ def generate(self, puf: pd.DataFrame, demographics: pd.DataFrame):
self.add_spouse(row, tax_unit_id)
exemptions -= 1

max_known_age = 0
for j in range(exemptions):
self.add_dependent(row, tax_unit_id, j, max_known_age)
for j in range(min(3, exemptions)):
self.add_dependent(row, tax_unit_id, j)

groups_assumed_to_be_tax_unit_like = [
"family",
Expand Down Expand Up @@ -470,7 +444,7 @@ def add_spouse(self, row, tax_unit_id):
if self.variable_to_entity[key] == "person":
self.holder[key].append(row[key] * (1 - self.earn_splits[-1]))

def add_dependent(self, row, tax_unit_id, dependent_id, max_known_age):
def add_dependent(self, row, tax_unit_id, dependent_id):
person_id = int(tax_unit_id * 1e2 + 3 + dependent_id)
self.holder["person_id"].append(person_id)
self.holder["person_tax_unit_id"].append(tax_unit_id)
Expand All @@ -480,14 +454,7 @@ def add_dependent(self, row, tax_unit_id, dependent_id, max_known_age):
self.holder["is_tax_unit_spouse"].append(False)
self.holder["is_tax_unit_dependent"].append(True)

if dependent_id < 3:
age = decode_age_dependent(round(row[f"AGEDP{dependent_id + 1}"]))
max_known_age = max(age, max_known_age)
else: # AGEDP? not available so impute age uniformly over [A,16] range
if max_known_age < 16:
age = DEP_XAGE_RNG.choice(range(max_known_age + 1, 16 + 1))
else:
age = 30
age = decode_age_dependent(round(row[f"AGEDP{dependent_id + 1}"]))
self.holder["age"].append(age)

for key in FINANCIAL_SUBSET:
Expand Down
20 changes: 10 additions & 10 deletions tax_microdata_benchmarking/examination/results3.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Phase 3 Data Examination Results
================================

**PRELIMINARY RESULTS AS OF 2024-07-15 (after merge of PR 144)**
**PRELIMINARY RESULTS AS OF 2024-07-16 (after merge of PR 148)**

**FINAL RESULTS EXPECTED LATER IN JULY 2024**

Expand Down Expand Up @@ -30,7 +30,7 @@ merge of PR #2772 on 2024-07-11).
| CY23 Amount | CY26 Amount | Estimate Source |
| ---: | ---: | :--- |
| 1580.0 | 1829.9 | CBO |
| 1519.9 | 1763.0 | Tax-Calculator + phase 3 dataset |
| 1518.3 | 1761.0 | Tax-Calculator + phase 3 dataset |
| 1489.3 | 1711.0 | Tax-Calculator + taxdata dataset |

<br>
Expand All @@ -40,7 +40,7 @@ merge of PR #2772 on 2024-07-11).
| CY23 Amount | CY26 Amount | Estimate Source |
| ---: | ---: | :--- |
| 2512.3 | 2849.4 | CBO |
| 2247.0 | 2820.2 | Tax-Calculator + phase 3 dataset |
| 2215.0 | 2786.5 | Tax-Calculator + phase 3 dataset |
| 2247.9 | 2742.2 | Tax-Calculator + taxdata dataset |

<br>
Expand All @@ -51,7 +51,7 @@ merge of PR #2772 on 2024-07-11).
| ---: | ---: | :--- |
| 122.1 | 57.6 | JCT |
| 108.6 | 55.7 | TSY |
| 108.9 | 38.9 | Tax-Calculator + phase 3 dataset |
| 132.2 | 48.6 | Tax-Calculator + phase 3 dataset |
| 126.3 | 43.1 | Tax-Calculator + taxdata dataset |

<br>
Expand All @@ -62,7 +62,7 @@ merge of PR #2772 on 2024-07-11).
| ---: | ---: | :--- |
| 71.9 | 78.0 | JCT |
| 63.6 | 71.2 | TSY |
| 82.1 | 92.4 | Tax-Calculator + phase 3 dataset |
| 78.6 | 88.0 | Tax-Calculator + phase 3 dataset |
| 73.5 | 82.0 | Tax-Calculator + taxdata dataset |

<br>
Expand All @@ -73,7 +73,7 @@ merge of PR #2772 on 2024-07-11).
| ---: | ---: | :--- |
| 45.9 | 56.4 | JCT |
| 31.4 | 38.4 | TSY |
| 51.9 | 69.3 | Tax-Calculator + phase 3 dataset |
| 52.2 | 70.9 | Tax-Calculator + phase 3 dataset |
| 62.7 | 89.5 | Tax-Calculator + taxdata dataset |

<br>
Expand All @@ -84,7 +84,7 @@ merge of PR #2772 on 2024-07-11).
| ---: | ---: | :--- |
| -56.5 | -53.8 | JCT |
| ---- | ---- | TSY |
| -53.9 | -50.7 | Tax-Calculator + phase 3 dataset |
| -52.9 | -51.2 | Tax-Calculator + phase 3 dataset |
| -55.8 | -52.1 | Tax-Calculator + taxdata dataset |

<br>
Expand All @@ -95,7 +95,7 @@ merge of PR #2772 on 2024-07-11).
| ---: | ---: | :--- |
| 259.3 | 239.8 | JCT |
| 153.9 | 182.4 | TSY |
| 223.4 | 217.0 | Tax-Calculator + phase 3 dataset |
| 221.7 | 221.9 | Tax-Calculator + phase 3 dataset |
| 217.7 | 223.6 | Tax-Calculator + taxdata dataset |

<br>
Expand All @@ -106,7 +106,7 @@ merge of PR #2772 on 2024-07-11).
| ---: | ---: | :--- |
| 56.2 | 0.0 | JCT |
| 50.4 | 0.0 | TSY |
| 55.9 | 0.0 | Tax-Calculator + phase 3 dataset |
| 54.1 | 0.0 | Tax-Calculator + phase 3 dataset |
| 19.3 | 0.0 | Tax-Calculator + taxdata dataset |

<br>
Expand All @@ -117,5 +117,5 @@ merge of PR #2772 on 2024-07-11).
| ---: | ---: | :--- |
| 21.2 | 151.3 | JCT |
| 26.5 | 149.0 | TSY |
| 20.7 | 161.5 | Tax-Calculator + phase 3 dataset |
| 20.4 | 160.5 | Tax-Calculator + phase 3 dataset |
| 29.4 | 185.5 | Tax-Calculator + taxdata dataset |
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
Weighted Tax Reform Totals by Baseline Expanded-Income Decile
Returns ExpInc IncTax PayTax LSTax AllTax
A 202.19 18596.5 2247.0 1519.9 0.0 3767.0
A 201.79 18540.2 2215.0 1518.3 0.0 3733.3

==> tmd-23-#-cgqd-#-tab.text <==
A 202.19 18596.5 223.4 0.0 0.0 223.4
A 201.79 18540.2 221.7 0.0 0.0 221.7

==> tmd-23-#-clp-#-tab.text <==
A 202.19 18596.5 0.0 0.0 0.0 0.0
A 201.79 18540.2 0.0 0.0 0.0 0.0

==> tmd-23-#-ctc-#-tab.text <==
A 202.19 18596.5 108.9 0.0 0.0 108.9
A 201.79 18540.2 132.2 0.0 0.0 132.2

==> tmd-23-#-eitc-#-tab.text <==
A 202.19 18596.5 82.1 0.0 0.0 82.1
A 201.79 18540.2 78.6 0.0 0.0 78.6

==> tmd-23-#-niit-#-tab.text <==
A 202.19 18596.5 -53.9 0.0 0.0 -53.9
A 201.79 18540.2 -52.9 0.0 0.0 -52.9

==> tmd-23-#-qbid-#-tab.text <==
A 202.19 18596.5 55.9 0.0 0.0 55.9
A 201.79 18540.2 54.1 0.0 0.0 54.1

==> tmd-23-#-salt-#-tab.text <==
A 202.19 18596.5 20.7 0.0 0.0 20.7
A 201.79 18540.2 20.4 0.0 0.0 20.4

==> tmd-23-#-ssben-#-tab.text <==
A 202.19 18596.5 51.9 0.0 0.0 51.9
A 201.79 18540.2 52.2 0.0 0.0 52.2
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
Weighted Tax Reform Totals by Baseline Expanded-Income Decile
Returns ExpInc IncTax PayTax LSTax AllTax
A 208.79 21176.3 2820.2 1763.0 0.0 4583.1
A 208.38 21107.7 2786.5 1761.0 0.0 4547.4

==> tmd-26-#-cgqd-#-tab.text <==
A 208.79 21176.3 217.0 0.0 0.0 217.0
A 208.38 21107.7 221.9 0.0 0.0 221.9

==> tmd-26-#-clp-#-tab.text <==
A 208.79 21176.3 0.0 0.0 0.0 0.0
A 208.38 21107.7 0.0 0.0 0.0 0.0

==> tmd-26-#-ctc-#-tab.text <==
A 208.79 21176.3 38.9 0.0 0.0 38.9
A 208.38 21107.7 48.6 0.0 0.0 48.6

==> tmd-26-#-eitc-#-tab.text <==
A 208.79 21176.3 92.4 0.0 0.0 92.4
A 208.38 21107.7 88.0 0.0 0.0 88.0

==> tmd-26-#-niit-#-tab.text <==
A 208.79 21176.3 -50.7 0.0 0.0 -50.7
A 208.38 21107.7 -51.2 0.0 0.0 -51.2

==> tmd-26-#-qbid-#-tab.text <==
A 208.79 21176.3 0.0 0.0 0.0 0.0
A 208.38 21107.7 0.0 0.0 0.0 0.0

==> tmd-26-#-salt-#-tab.text <==
A 208.79 21176.3 161.5 0.0 0.0 161.5
A 208.38 21107.7 160.5 0.0 0.0 160.5

==> tmd-26-#-ssben-#-tab.text <==
A 208.79 21176.3 69.3 0.0 0.0 69.3
A 208.38 21107.7 70.9 0.0 0.0 70.9
33 changes: 15 additions & 18 deletions tax_microdata_benchmarking/utils/reweight.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,42 +75,30 @@ def build_loss_matrix(df):
df = tc_to_soi(df, time_period)
agi = df["adjusted_gross_income"].values
filer = df["is_tax_filer"].values
taxable = df["is_taxable"].values
targets_array = []
soi_subset = targets
soi_subset = soi_subset[soi_subset.Year == time_period]
agi_level_targeted_variables = [
"adjusted_gross_income",
"count",
"employment_income",
# "state_and_local_tax_deductions",
]
aggregate_level_targeted_variables = [
"business_net_losses",
"business_net_profits",
"capital_gains_distributions",
"capital_gains_gross",
"capital_gains_losses",
# "charitable_contributions_deduction",
# "count_of_exemptions",
"estate_income",
"estate_losses",
"exempt_interest",
# "exemptions",
# "interest_paid_deductions",
"ira_distributions",
# "itemized_real_estate_tax_deductions",
# "itemized_state_income_and_sales_tax_deductions",
# "medical_expense_deductions_capped",
# "medical_expense_deductions_uncapped",
"ordinary_dividends",
"partnership_and_s_corp_income",
"partnership_and_s_corp_losses",
"qualified_dividends",
# "qualified_business_income_deduction",
"rent_and_royalty_net_income",
"rent_and_royalty_net_losses",
# "standard_deduction",
"taxable_interest_income",
"taxable_pension_income",
"taxable_social_security",
Expand All @@ -136,6 +124,9 @@ def build_loss_matrix(df):
)
]
for _, row in soi_subset.iterrows():
if row["Taxable only"]:
continue # exclude "taxable returns" statistics

mask = (
(agi >= row["AGI lower bound"])
* (agi < row["AGI upper bound"])
Expand All @@ -154,9 +145,6 @@ def build_loss_matrix(df):
elif row["Filing status"] == "Married Filing Separately":
mask *= df["filing_status"].values == "SEPARATE"

if row["Taxable only"]:
mask *= taxable > 0

values = df[row["Variable"]].values

if row["Count"]:
Expand All @@ -173,11 +161,20 @@ def build_loss_matrix(df):
variable_label = row["Variable"].replace("_", " ")

if row["Count"] and not row["Variable"] == "count":
label = f"{variable_label}/count/AGI in {agi_range_label}/{taxable_label}/{filing_status_label}"
label = (
f"{variable_label}/count/AGI in "
f"{agi_range_label}/{taxable_label}/{filing_status_label}"
)
elif row["Variable"] == "count":
label = f"{variable_label}/count/AGI in {agi_range_label}/{taxable_label}/{filing_status_label}"
label = (
f"{variable_label}/count/AGI in "
f"{agi_range_label}/{taxable_label}/{filing_status_label}"
)
else:
label = f"{variable_label}/total/AGI in {agi_range_label}/{taxable_label}/{filing_status_label}"
label = (
f"{variable_label}/total/AGI in "
f"{agi_range_label}/{taxable_label}/{filing_status_label}"
)

if label not in loss_matrix.columns:
loss_matrix[label] = mask * values
Expand Down

0 comments on commit eb57dbd

Please sign in to comment.