Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip Stage 2 years #390

Merged
merged 7 commits into from
Oct 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ __pycache__/

# IRS-SOI PUF and related CPS matching data files
puf*.csv
*puf.csv
puf.csv*
cps-matched-puf.csv
StatMatch/Matching/puf2011.csv
cpsmar2016.csv
Expand Down
2 changes: 0 additions & 2 deletions cps_stage2/dataprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,6 @@ def target(target_val, pop, factor, value):
vstack_vars.append(lhs_vars[var])
t = rhs_vars[var]
b.append(t)
# print(f'{var:14} {t:0.2f}') uncomment when moving to 3.6
print("{:14} {:0.2f}".format(var, t))

vstack_vars = tuple(vstack_vars)
one_half_lhs = np.vstack(vstack_vars)
Expand Down
15 changes: 9 additions & 6 deletions cps_stage2/solver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@ using LinearAlgebra

function Solve_func(year, tol)

println("\nSolving weights for $year ...\n\n")

array = npzread(string(year, "_input.npz"))

# ddir = "/home/donboyd/Documents/python_projects/taxdata/puf_stage2/"
# array = npzread(string(ddir, year, "_input.npz"))
println("Solving weights for $year ...\n\n")
# we only solve the weights for years where the targets have changed. If the
# targets have not changed, we don't write the _input.npz file
if isfile(string(year, "_input.npz"))
array = npzread(string(year, "_input.npz"))
else
println("Skipping solver for $year \n")
return nothing
end

A1 = array["A1"]
A2 = array["A2"]
Expand Down
45 changes: 44 additions & 1 deletion cps_stage2/stage2.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import glob
import json
import hashlib
import numpy as np
import pandas as pd
from pathlib import Path
Expand All @@ -11,6 +13,37 @@
START_YEAR = 2014
END_YEAR = 2031

# Read hashes used to see which years can be skipped
with open(Path(CUR_PATH, "..", "datahashes.json")) as f:
HASHES = json.load(f)["cps"]

# Check every stage 2 input (the data file and the three scripts) against the
# SHA-256 digests loaded from datahashes.json into HASHES
file_paths = [
    Path(CUR_PATH, "..", "data", "cps.csv.gz"),
    Path(CUR_PATH, "solver.jl"),
    Path(CUR_PATH, "dataprep.py"),
    Path(CUR_PATH, "stage2.py"),
]
key_names = ["data", "solver", "dataprep", "stage2"]
files_match = True
for key, file_path in zip(key_names, file_paths):
    file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest()
    if HASHES[key] != file_hash:
        # a single changed file is enough; no need to hash the rest
        files_match = False
        break

# Read current factors and targets
CUR_FACTORS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_I_factors.csv",
index_col=0,
).transpose()
CUR_TARGETS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/cps_stage1/stage_2_targets.csv",
index_col=0,
)
CUR_WEIGHTS = pd.read_csv(Path(CUR_PATH, "cps_weights.csv.gz"))


def main():
"""
Expand All @@ -21,12 +54,20 @@ def main():
)
cps = cps.fillna(0.0)
stage_1_factors = pd.read_csv(STAGE_1_PATH, index_col=0)
_factors = stage_1_factors.transpose()
stage_2_targets = pd.read_csv(STAGE_2_PATH, index_col=0)
# DataFrame for holding each year's weights
weights = pd.DataFrame()

# write .npz input files for solver
skipped_years = []
for year in range(START_YEAR, END_YEAR + 1):
factor_match = _factors[year].equals(CUR_FACTORS[year])
target_match = stage_2_targets[f"{year}"].equals(CUR_TARGETS[f"{year}"])
if files_match and factor_match and target_match:
print(f"Skipping {year}")
skipped_years.append(year)
continue
dataprep(cps, stage_1_factors, stage_2_targets, year)

# Solver (in Julia)
Expand All @@ -35,7 +76,9 @@ def main():

# write output files to dataframe columns
for year in range(START_YEAR, END_YEAR + 1):

if year in skipped_years:
weights[f"WT{year}"] = CUR_WEIGHTS[f"WT{year}"]
continue
s006 = np.where(
cps.e02400 > 0,
cps.s006 * stage_1_factors["APOPSNR"][year],
Expand Down
14 changes: 14 additions & 0 deletions datahashes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"puf": {
"data": "b246b86245544f0c4fb362f331dd9c9f0f3805991117a3bb91a2a46c4a505fe8",
"solver": "435ab7f39d4b7dd8b12c19f43978c026c39a5fc5af9dd4a88b925287feecc6e9",
"dataprep": "85a9e87a9c978f1e4d558ddc4ecd19a7fe7ffc82883b7306f6857a48a8b6eb00",
"stage2": "4e7dff40ab434ae30a6349d769d33695007b08eb120ea784661c646a62a6bfaa"
},
"cps": {
"data": "492ead49db94fc4bb4109c33a6c9679aa32c41042e715333cc84df1fe49e578d",
"solver": "0d36a53fbec8850c29b109c309e41648d0737a80643def944efc59d9c804034b",
"dataprep": "a95922179111b9a78f91dc7bc2aeb28a3312900f518d63b47be28910a9b8b2b6",
"stage2": "6de103898e065b0847e6fdd2012196bbf194132d62d0ed05f9e5d64fcb4f26a4"
}
}
34 changes: 34 additions & 0 deletions inithash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
This script creates the initial hashes for each stage 2 file
"""
import json
import hashlib
from pathlib import Path

CUR_PATH = Path(__file__).resolve().parent


def _sha256_of(path, chunk_size=1 << 20):
    """
    Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in ``chunk_size``-byte pieces so that large data files
    are never held in memory all at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def create_hashes(_file):
    """
    Create the hash values for each of the files in the stage 2 process

    Parameters
    ----------
    _file: name of the data file inside the ``data`` directory next to this
        script. "cps-matched-puf.csv" selects the scripts in ``puf_stage2``;
        any other name selects the scripts in ``cps_stage2``.

    Returns
    -------
    dict mapping "data", "solver", "dataprep" and "stage2" (in that order) to
    the hex SHA-256 digest of the data file, solver.jl, dataprep.py and
    stage2.py respectively.

    Raises
    ------
    FileNotFoundError if the data file or any of the scripts is missing.
    """
    # hash the data file first so a missing data file is reported before
    # any of the scripts are touched
    hashes = {"data": _sha256_of(Path(CUR_PATH, "data", _file))}
    if _file == "cps-matched-puf.csv":
        basepath = Path(CUR_PATH, "puf_stage2")
    else:
        basepath = Path(CUR_PATH, "cps_stage2")
    scripts = (
        ("solver", "solver.jl"),
        ("dataprep", "dataprep.py"),
        ("stage2", "stage2.py"),
    )
    for key, filename in scripts:
        hashes[key] = _sha256_of(Path(basepath, filename))
    return hashes


# Hash the inputs of both stage 2 pipelines and store the digests, keyed by
# dataset, in datahashes.json next to this script
finalhashes = {
    "puf": create_hashes("cps-matched-puf.csv"),
    "cps": create_hashes("cps.csv.gz"),
}
with open(Path(CUR_PATH, "datahashes.json"), "w") as f:
    json.dump(finalhashes, f, indent=4)
9 changes: 8 additions & 1 deletion puf_stage2/solver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,14 @@ function Solve_func(year, tol)

println("\nSolving weights for $year ...\n\n")

array = npzread(string(year, "_input.npz"))
# we only solve the weights for years where the targets have changed. If the
# targets have not changed, we don't write the _input.npz file
if isfile(string(year, "_input.npz"))
array = npzread(string(year, "_input.npz"))
else
println("Skipping solver for $year \n")
return nothing
end

A1 = array["A1"]
A2 = array["A2"]
Expand Down
53 changes: 48 additions & 5 deletions puf_stage2/stage2.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,70 @@
import os
import glob
import json
import hashlib
import numpy as np
import pandas as pd
from pathlib import Path
from dataprep import dataprep


CUR_PATH = os.path.abspath(os.path.dirname(__file__))
CUR_PATH = Path(__file__).resolve().parent

# Read hashes used to see which years can be skipped
with open(Path(CUR_PATH, "..", "datahashes.json")) as f:
HASHES = json.load(f)["puf"]

# Check every stage 2 input (the data file and the three scripts) against the
# SHA-256 digests loaded from datahashes.json into HASHES
file_paths = [
    Path(CUR_PATH, "..", "data", "cps-matched-puf.csv"),
    Path(CUR_PATH, "solver.jl"),
    Path(CUR_PATH, "dataprep.py"),
    Path(CUR_PATH, "stage2.py"),
]
key_names = ["data", "solver", "dataprep", "stage2"]
files_match = True
for key, file_path in zip(key_names, file_paths):
    file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest()
    if HASHES[key] != file_hash:
        # report the first changed input and stop; one mismatch is enough
        files_match = False
        print(f"{key} has changed")
        break

# Read current factors and targets
CUR_FACTORS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_I_factors.csv",
index_col=0,
).transpose()
CUR_TARGETS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_II_targets.csv",
index_col=0,
)
CUR_WEIGHTS = pd.read_csv(Path(CUR_PATH, "puf_weights.csv.gz"))
# Read private CPS-matched-PUF file into a Pandas DataFrame
puf = pd.read_csv(os.path.join(CUR_PATH, "../data/cps-matched-puf.csv"))
puf = pd.read_csv(Path(CUR_PATH, "..", "data", "cps-matched-puf.csv"))

# Read stage1 factors and stage2 targets written by stage1.py script
factors = pd.read_csv(
os.path.join(CUR_PATH, "../puf_stage1/Stage_I_factors.csv"), index_col=0
Path(CUR_PATH, "..", "puf_stage1", "Stage_I_factors.csv"), index_col=0
)
Stage_I_factors = factors.transpose()
stage2_path = os.path.join(CUR_PATH, "../puf_stage1/Stage_II_targets.csv")
stage2_path = Path(CUR_PATH, "..", "puf_stage1", "Stage_II_targets.csv")
Stage_II_targets = pd.read_csv(stage2_path, index_col=0)

# Use the matched_weight variable in CPS as the final weight
puf.s006 = puf.matched_weight * 100


# Dataprep
year_list = [x for x in range(2012, 2031 + 1)]
skipped_years = []
for i in year_list:
factor_match = Stage_I_factors[i].equals(CUR_FACTORS[i])
target_match = Stage_II_targets[f"{i}"].equals(CUR_TARGETS[f"{i}"])
if files_match and factor_match and target_match:
print(f"Skipping {i}")
skipped_years.append(i)
continue
dataprep(puf, Stage_I_factors, Stage_II_targets, year=i)

# Solver (in Julia)
Expand All @@ -38,6 +78,9 @@

# write solution to dataframe
for i in year_list:
if i in skipped_years:
z[f"WT{i}"] = CUR_WEIGHTS[f"WT{i}"]
continue
s006 = np.where(
puf.e02400 > 0,
puf.s006 * Stage_I_factors[i]["APOPSNR"] / 100,
Expand Down