Skip to content

Commit

Permalink
Merge pull request #105 from PSLmodels/fixes
Browse files Browse the repository at this point in the history
Remove negative weights and aggregate records
  • Loading branch information
martinholmer authored Jul 3, 2024
2 parents 64114c1 + f5f0b7d commit d2d39e6
Show file tree
Hide file tree
Showing 18 changed files with 6,519 additions and 5,410 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ tax_microdata_benchmarking/calibration
!tax_microdata_benchmarking/storage/input/*.csv
**demographics_2015.csv
**puf_2015.csv
*.DS_STORE
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ flat-file:
python tax_microdata_benchmarking/create_taxcalc_input_variables.py
python tax_microdata_benchmarking/create_taxcalc_growth_factors.py
python tax_microdata_benchmarking/create_taxcalc_sampling_weights.py
python tax_microdata_benchmarking/create_all_datasets.py

data: install flat-file test

Expand Down
66 changes: 66 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import streamlit as st

from tax_microdata_benchmarking.utils.soi_replication import *
from tax_microdata_benchmarking.storage import STORAGE_FOLDER
from tax_microdata_benchmarking.datasets import *
import pandas as pd

INPUTS = STORAGE_FOLDER / "input"
OUTPUTS = STORAGE_FOLDER / "output"


@st.cache_resource
def generate_comparsions():
    """Build the SOI comparison table for the TMD-2021 output file.

    Reads ``tmd_2021.csv`` from the ``OUTPUTS`` folder, converts it with
    ``tc_to_soi`` for 2021 and hands the result to
    ``compare_soi_replication_to_soi`` (also for 2021).

    Returns:
        Whatever ``compare_soi_replication_to_soi`` returns; the rest of
        this script treats it as a pandas DataFrame.
    """
    tmd_path = OUTPUTS / "tmd_2021.csv"
    soi_replication = tc_to_soi(pd.read_csv(tmd_path), 2021)
    return compare_soi_replication_to_soi(soi_replication, 2021)


def soi_statistic_passes_quality_test(
    df,
    *,
    relative_error_threshold=0.05,
    count_absolute_error_threshold=1e6,
    aggregate_absolute_error_threshold=1e9,
):
    """Flag which rows of an SOI comparison table are close enough.

    A row passes when EITHER its relative error OR its absolute error is
    below the applicable threshold.

    Args:
        df: Table with the columns ``"Absolute relative error"``,
            ``"Absolute error"`` and ``"Count"``. A truthy ``"Count"``
            selects the count threshold for that row, a falsy one the
            aggregate threshold.
        relative_error_threshold: Relative error lower than this => OK.
        count_absolute_error_threshold: Absolute error lower than this
            for filer counts => OK.
        aggregate_absolute_error_threshold: Absolute error lower than
            this for aggregates => OK.

    Returns:
        Boolean Series aligned with ``df``; True where the row passes.
    """
    relative_error_ok = (
        df["Absolute relative error"] < relative_error_threshold
    )

    # Pick the absolute-error threshold row by row: counts and aggregates
    # are judged against different limits.
    absolute_error_threshold = np.where(
        df["Count"],
        count_absolute_error_threshold,
        aggregate_absolute_error_threshold,
    )
    absolute_error_ok = df["Absolute error"] < absolute_error_threshold

    return relative_error_ok | absolute_error_ok


# 2021 datasets

comparisons = generate_comparsions()

st.title("SOI replication results")

st.write(
    """
This page shows the results of replicating the SOI dataset from the TMD-2021 output data file. It is sorted by absolute error."""
)

# Largest absolute errors first, as the page text promises.
st.dataframe(comparisons.sort_values("Absolute error", ascending=False))

import plotly.express as px

# The title names the column that is actually plotted ("Absolute error").
# NOTE(review): if the relative-error distribution was intended instead,
# switch x to "Absolute relative error" and restore the old title.
histogram = px.histogram(
    comparisons,
    x="Absolute error",
    marginal="rug",
    title="Histogram of absolute errors",
)

st.plotly_chart(histogram)
104 changes: 104 additions & 0 deletions docs/app.py
Original file line number Diff line number Diff line change
@@ -1 +1,105 @@
import streamlit as st

from tax_microdata_benchmarking.utils.soi_replication import *
from tax_microdata_benchmarking.storage import STORAGE_FOLDER
from tax_microdata_benchmarking.datasets import *
import pandas as pd

INPUTS = STORAGE_FOLDER / "input"
OUTPUTS = STORAGE_FOLDER / "output"


# st.cache_data (rather than st.cache_resource) hands each caller its own
# copy of the cached result, so callers that add columns to the returned
# table do not modify the cached object.
@st.cache_data
def generate_comparsions(use_original_weights: bool = False):
    """Build the SOI comparison table for the TMD-2021 output file.

    Reads ``tmd_2021.csv`` from the ``OUTPUTS`` folder, converts it with
    ``tc_to_soi`` for 2021 and hands the result to
    ``compare_soi_replication_to_soi`` (also for 2021).

    Args:
        use_original_weights: When True, the ``s006`` column is replaced
            by ``s006_original`` before the conversion.

    Returns:
        Whatever ``compare_soi_replication_to_soi`` returns; the rest of
        this script treats it as a pandas DataFrame.
    """
    tmd_2021 = pd.read_csv(OUTPUTS / "tmd_2021.csv")
    if use_original_weights:
        # Explicit column indexing: attribute assignment only updates a
        # column when it already exists.
        tmd_2021["s006"] = tmd_2021["s006_original"]
    soi_from_tmd_2021 = compare_soi_replication_to_soi(
        tc_to_soi(tmd_2021, 2021), 2021
    )
    return soi_from_tmd_2021


def soi_statistic_passes_quality_test(
    df,
    *,
    relative_error_threshold=0.05,
    count_absolute_error_threshold=1e6,
    aggregate_absolute_error_threshold=1e9,
):
    """Flag which rows of an SOI comparison table are close enough.

    A row passes when EITHER its relative error OR its absolute error is
    below the applicable threshold.

    Args:
        df: Table with the columns ``"Absolute relative error"``,
            ``"Absolute error"`` and ``"Count"``. A truthy ``"Count"``
            selects the count threshold for that row, a falsy one the
            aggregate threshold.
        relative_error_threshold: Relative error lower than this => OK.
        count_absolute_error_threshold: Absolute error lower than this
            for filer counts => OK.
        aggregate_absolute_error_threshold: Absolute error lower than
            this for aggregates => OK.

    Returns:
        Boolean Series aligned with ``df``; True where the row passes.
    """
    relative_error_ok = (
        df["Absolute relative error"] < relative_error_threshold
    )

    # Pick the absolute-error threshold row by row: counts and aggregates
    # are judged against different limits.
    absolute_error_threshold = np.where(
        df["Count"],
        count_absolute_error_threshold,
        aggregate_absolute_error_threshold,
    )
    absolute_error_ok = df["Absolute error"] < absolute_error_threshold

    return relative_error_ok | absolute_error_ok


# 2021 datasets

# Comparison under the current weights, plus the same comparison under the
# original weights so the two can be shown side by side.
comparisons = generate_comparsions()
comparisons_original_weights = generate_comparsions(use_original_weights=True)
comparisons["Original weight value"] = comparisons_original_weights["Value"]
comparisons["Original weight error"] = comparisons_original_weights["Error"]
comparisons["Improved under reweighting"] = (
    comparisons["Absolute error"] < comparisons["Original weight error"].abs()
)
soi_subset = comparisons
time_period = 2021

# Narrow the table down to the rows treated as targeted: all filing
# statuses combined, not restricted to taxable returns.
soi_subset = soi_subset[soi_subset["Filing status"] == "All"]
soi_subset = soi_subset[soi_subset["Taxable only"] == False]
agi_level_targeted_variables = [
    "adjusted_gross_income",
    "count",
]
aggregate_level_targeted_variables = [
    # "qualified_business_income_deduction",
]
# AGI-level variables count only in rows whose AGI range is not
# (-inf, inf); aggregate-level variables count only in the (-inf, inf) row.
# The .copy() makes soi_subset an independent frame, so adding a column to
# it below does not write into a slice of `comparisons`.
soi_subset = soi_subset[
    soi_subset.Variable.isin(agi_level_targeted_variables)
    & (
        (soi_subset["AGI lower bound"] != -np.inf)
        | (soi_subset["AGI upper bound"] != np.inf)
    )
    | (
        soi_subset.Variable.isin(aggregate_level_targeted_variables)
        & (soi_subset["AGI lower bound"] == -np.inf)
        & (soi_subset["AGI upper bound"] == np.inf)
    )
].copy()

comparisons["Targeted"] = False
# Single .loc assignment instead of chained indexing, which may write to a
# temporary object and leave `comparisons` unchanged.
comparisons.loc[soi_subset.index, "Targeted"] = True

soi_subset["Targeted"] = True

st.title("SOI replication results")

st.write(
    """
This page shows the results of replicating the SOI dataset from the TMD-2021 output data file. It is sorted by absolute error."""
)

# Largest absolute errors first, as the page text promises.
st.dataframe(comparisons.sort_values("Absolute error", ascending=False))

import plotly.express as px

# The title names the column that is actually plotted ("Absolute error").
# NOTE(review): if the relative-error distribution was intended instead,
# switch x to "Absolute relative error" and restore the old title.
histogram = px.histogram(
    comparisons,
    x="Absolute error",
    marginal="rug",
    title="Histogram of absolute errors",
)

st.plotly_chart(histogram)

st.subheader("Targets included in reweighting")
Loading

0 comments on commit d2d39e6

Please sign in to comment.