Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add chisquare_test.py to assess area weights generated with different targets #267

Merged
merged 20 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
33a2a96
Add areas/chisquare_test.py script
martinholmer Oct 21, 2024
47ebdbd
Add documentation on test in the chisquare_test.py docstring
martinholmer Oct 21, 2024
7bc0bbf
Add WGHT1 and WGHT2 weighted income tax liabilities to test output
martinholmer Oct 22, 2024
f93f543
Merge in changes on revise-caching branch
martinholmer Oct 22, 2024
df9aecb
Include recent changes on master branch
martinholmer Oct 22, 2024
5ec66cb
Use cached_iitax.npy as source of iitax in the chisquare_test.py script
martinholmer Oct 22, 2024
be463ef
Merge in recent changes on master branch
martinholmer Oct 24, 2024
4a49514
Revise valid_area logic to handle optional assumption characters
martinholmer Oct 25, 2024
22dbd8c
More informative chisquare_test.py output messages
martinholmer Oct 25, 2024
b7b7e6a
Fix merge conflict
martinholmer Oct 25, 2024
8bd0ce9
Merge in recent changes on master branch
martinholmer Oct 25, 2024
7c2bc56
Merge in recent changes on master branch
martinholmer Oct 28, 2024
e1c8a90
Make NUM_BINS an input to chisquare_test.py
martinholmer Oct 28, 2024
14dc680
Add numbins and dump as optional command-line arguments
martinholmer Oct 29, 2024
1b619ea
black format changes
martinholmer Oct 29, 2024
ee1fea9
Change DEFAULT_NUM_BINS from 2000 to 100
martinholmer Oct 29, 2024
e871b3d
Use national weights to define itax bins by default
martinholmer Oct 30, 2024
4bda956
Change DEFAULT_NUM_BINS from 100 to 200
martinholmer Oct 30, 2024
de9e1c6
black format change
martinholmer Oct 30, 2024
c411d1a
Add more detail to optional dump output
martinholmer Oct 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 228 additions & 0 deletions tmd/areas/chisquare_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
"""
Conduct chi-square test of the similarity of two distributions of weights by
income tax liability categories for the same area that are generated using
different targets. The two area weights files are assumed to be located in
the tmd/areas/weights folder.
All test output is written to stdout; error messages are written to stderr.

For background information on the chi-square test that compares two
categorical (that is, binned) data sets, see Section 14.3: Are Two
Distributions Different? in Press, et al., Numerical Recipies in C:
The Art of Scientific Computing, Second Edition (Cambridge University
Press, 1992). This script uses the chi2_contingency function in the
Python scipy package, which is documented at the following URL:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

Critical chi-square statistic values (say, at p=0.05) for large
degrees-of-freedom values (which equals numbins-1 in this test)
are available from the following online calculator:
https://www.danielsoper.com/statcalc/calculator.aspx?id=12
For example: Xsq(p=0.05,dof= 100)= 124.34
Xsq(p=0.05,dof= 200)= 233.99
Xsq(p=0.05,dof=1000)= 1074.68
Xsq(p=0.05,dof=2000)= 2105.15

USAGE: python chisquare_test.py WGHT1 WGHT2 [numbins] [dump]

EXAMPLE using default numbins (equal to 100) and with no details dump:
areas% python chisquare_test.py pa08 pa08A
"""

import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from tmd.areas import AREAS_FOLDER
from tmd.storage import STORAGE_FOLDER

USAGE = "USAGE: python chisquare_test.py WGHT1 WGHT2 [numbins] [dump]"
CACHED_ITAX_PATH = STORAGE_FOLDER / "output" / "cached_iitax.npy"
TAX_YEAR = 2021
MINIMUM_NUM_BINS = 50
DEFAULT_NUM_BINS = 200
ITXBINS_DEFINED_USING_AREA_WEIGHTS = False # default is using national weights


def check_arguments():
"""
Check validity of arguments and return the following tuple:
(wname1, wpath1, wname2, wpath2, numbins, dump)
containing name of and path to the WGHT1 and WGHT2 weights files,
plus requested number of income tax categories (or bins) and
whether or not detailed dump information is included in output.
"""
numargs = len(sys.argv) - 1
if not 2 <= numargs <= 4:
sys.stderr.write(
f"ERROR: number of arguments not in [2,4] range\n{USAGE}\n"
)
sys.exit(1)
all_ok = True
wname1 = sys.argv[1]
wpath1 = AREAS_FOLDER / "weights" / f"{wname1}_tmd_weights.csv.gz"
if not wpath1.exists():
sys.stderr.write(f"ERROR: WGHT1 {str(wpath1)} file does not exist\n")
all_ok = False
wname2 = sys.argv[2]
wpath2 = AREAS_FOLDER / "weights" / f"{wname2}_tmd_weights.csv.gz"
if not wpath2.exists():
sys.stderr.write(f"ERROR: WGHT2 {str(wpath2)} file does not exist\n")
all_ok = False
if not CACHED_ITAX_PATH.exists():
sys.stderr.write(
f"ERROR: {str(CACHED_ITAX_PATH)} file does not exist\n"
)
all_ok = False
numbins = DEFAULT_NUM_BINS
if numargs >= 3:
numbins = int(sys.argv[3])
if numbins < MINIMUM_NUM_BINS:
sys.stderr.write(
f"ERROR: numbins must be no less than {MINIMUM_NUM_BINS}\n"
)
all_ok = False
dump = False
if numargs >= 4:
if sys.argv[4] == "dump":
dump = True
else:
sys.stderr.write("ERROR: optional fourth argument must be dump\n")
all_ok = False
if not all_ok:
sys.stderr.write(f"{USAGE}\n")
sys.exit(1)
return (wname1, wpath1, wname2, wpath2, numbins, dump)


def weights_array(wghts_path: Path):
"""
Return array containing TAX_YEAR weights in area weights file with wpath.
"""
wdf = pd.read_csv(wghts_path)
warray = wdf[f"WT{TAX_YEAR}"]
return warray


def weighted_qcut_variable(values, weights, numbins):
"""
Return weighted quantile bin variable for specified values.
"""
quantiles = np.linspace(0, 1, numbins + 1)
order = weights.iloc[values.argsort()].cumsum()
bins = pd.cut(order / order.iloc[-1], quantiles, right=True, labels=False)
return bins.sort_index()


def sorted_vdf_with_itxbin(vdf: pd.DataFrame, numbins: int):
"""
Expects vdf to include two area weights arrays and an itax array.
Returns itax-sorted vdf with added itxbin given NUM_BINS environ variable.
"""
# check contents of vdf
assert "wght1" in vdf, "wght1 variable not in vdf"
assert "wght2" in vdf, "wght2 variable not in vdf"
assert "itax" in vdf, "itax variable not in vdf"
# add itxbin variable to vdf
if ITXBINS_DEFINED_USING_AREA_WEIGHTS:
wght = 0.5 * (vdf.wght1 + vdf.wght2)
else: # using national weights
wght = weights_array(STORAGE_FOLDER / "output" / "tmd_weights.csv.gz")
vdf["itxbin"] = weighted_qcut_variable(vdf.itax, wght, numbins)
# return sorted vdf
vdf.sort_values("itax", inplace=True)
return vdf


# -- High-level logic of the script:


def main(
wname1: str,
wpath1: Path,
wname2: str,
wpath2: Path,
numbins: int,
dump: bool,
):
"""
Conduct chi-square two-variable test using the WGHT1 weights (wpath1)
and the WGHT2 weights (wpath2), which are weights for the same area
generated using different targets, using specified income tax numbins.
"""
# read cached iitax amounts, which are the same for each area weights file
itax = np.load(CACHED_ITAX_PATH)

# construct vdf DataFrame and sort it and add itxbin variable to it
wght1 = weights_array(wpath1)
wght2 = weights_array(wpath2)
assert len(wght1) == len(wght2) == len(itax)
vdf = pd.DataFrame({"wght1": wght1, "wght2": wght2, "itax": itax})
vdf = sorted_vdf_with_itxbin(vdf, numbins)
if dump:
print("***** vdf=\n", vdf)
pct = np.linspace(0.1, 0.9, num=9)
print("***** vdf.describe=\n", vdf.describe(percentiles=pct))

# prepare frequency data for chi-square test
unweighted_count = len(vdf)
wght1_sum = wght1.sum()
wght2_sum = wght2.sum()
if dump:
print(f"***** {wname1:>5}_weight_total= {wght1_sum:.5f}")
print(f"***** {wname2:>5}_weight_total= {wght2_sum:.5f}")
ratio = wght2_sum / wght1_sum
print(f"***** ==> ratio= {ratio:.8f}")
gvdf = vdf.groupby(by=["itxbin"], sort=False, observed=True)
if dump:
itxtop = gvdf["itax"].max()
print("***** itax_at_bin_top=\n", itxtop)
freq1 = (unweighted_count / wght1_sum) * gvdf["wght1"].sum()
freq2 = (unweighted_count / wght2_sum) * gvdf["wght2"].sum()
if dump:
fdf = pd.DataFrame({"freq1": freq1, "freq2": freq2})
pct = np.linspace(0.1, 0.9, num=9)
print("***** freq.describe=\n", fdf.describe(percentiles=pct))
assert len(freq1) == len(freq2)
assert np.allclose([freq1.sum()], [freq2.sum()])
witax1 = (wght1 * itax).sum() * 1e-9
witax2 = (wght2 * itax).sum() * 1e-9
witax_ratio = witax2 / witax1
print(
f"{wname1},{wname2}_weighted_iitax($B)= {witax1:.3f} {witax2:.3f}"
f" ===> ratio= {witax_ratio:.4f}"
)
min_bin_cnt = min(np.min(freq1), np.min(freq2))
print(f"numbins,minimum_bin_freqency_count= {numbins} {min_bin_cnt:.1f}")
if min_bin_cnt < 5:
if numbins > MINIMUM_NUM_BINS:
print("WARNING: reduce value of numbins command-line argument")
print(" to get minimum_bin_fredquency_count above five")
else:
print("WARNING: minimum_bin_fredquency_count is below five")
print(" so use TMD_AREA_ITXBINS=1 environment variable")
print(USAGE)
return 1

# conduct chi-square test
print("Conducting chi-square two-variable independence test:")
ctable = pd.DataFrame({"1": freq1, "2": freq2})
res = chi2_contingency(ctable)
print(f"Xsq_statistic= {res.statistic:.3f}")
print(f"Xsq_test_pval= {res.pvalue:.3f} where dof= {res.dof}")
if dump:
print("***** Xsq(pval=0.05,dof= 100)= 124.34")
print("***** Xsq(pval=0.05,dof= 200)= 233.99")
print("***** Xsq(pval=0.05,dof=1000)= 1074.68")
print("***** Xsq(pval=0.05,dof=2000)= 2105.15")

return 0


if __name__ == "__main__":
if "TMD_AREA_ITXBINS" in os.environ:
ITXBINS_DEFINED_USING_AREA_WEIGHTS = True
awname1, awpath1, awname2, awpath2, num_bins, dump_ = check_arguments()
RCODE = main(awname1, awpath1, awname2, awpath2, num_bins, dump_)
sys.exit(RCODE)
8 changes: 7 additions & 1 deletion tmd/areas/create_area_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
AREA prefix for state areas are the two lower-case character postal codes.
AREA prefix for congressional districts are the state prefix followed by
two digits (with a leading zero) identifying the district. States with
only one congressional district have 00 as the two digits.
only one congressional district have 00 as the two digits to align names
with IRS data.
"""

import sys
Expand Down Expand Up @@ -47,6 +48,11 @@
OPTIMIZE_IPRINT = 0 # 20 is a good diagnostic value; set to 0 for production
OPTIMIZE_RESULTS = False # set to True to see complete optimization results

# assumption about which Congress the congressional districts are defined for:
# ... 2010 implies districts for the 117th Congress
# ... 2020 implies districts for the 118th Congress
CD_CENSUS_YEAR = 2010


def valid_area(area: str):
"""
Expand Down
Loading