Skip to content

Commit

Permalink
Merge pull request #390 from andersonfrailey/skipyears
Browse files Browse the repository at this point in the history
Skip Stage 2 years
  • Loading branch information
andersonfrailey authored Oct 27, 2021
2 parents bebc05d + b55bafa commit c949e40
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 15 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ __pycache__/

# IRS-SOI PUF and related CPS matching data files
puf*.csv
*puf.csv
puf.csv*
cps-matched-puf.csv
StatMatch/Matching/puf2011.csv
cpsmar2016.csv
Expand Down
2 changes: 0 additions & 2 deletions cps_stage2/dataprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,6 @@ def target(target_val, pop, factor, value):
vstack_vars.append(lhs_vars[var])
t = rhs_vars[var]
b.append(t)
# print(f'{var:14} {t:0.2f}') uncomment when moving to 3.6
print("{:14} {:0.2f}".format(var, t))

vstack_vars = tuple(vstack_vars)
one_half_lhs = np.vstack(vstack_vars)
Expand Down
15 changes: 9 additions & 6 deletions cps_stage2/solver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@ using LinearAlgebra

function Solve_func(year, tol)

println("\nSolving weights for $year ...\n\n")

array = npzread(string(year, "_input.npz"))

# ddir = "/home/donboyd/Documents/python_projects/taxdata/puf_stage2/"
# array = npzread(string(ddir, year, "_input.npz"))
println("Solving weights for $year ...\n\n")
# we only solve the weights for years where the targets have changed. If the
# targets have not changed, we don't write the _input.npz file
if isfile(string(year, "_input.npz"))
array = npzread(string(year, "_input.npz"))
else
println("Skipping solver for $year \n")
return nothing
end

A1 = array["A1"]
A2 = array["A2"]
Expand Down
45 changes: 44 additions & 1 deletion cps_stage2/stage2.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import glob
import json
import hashlib
import numpy as np
import pandas as pd
from pathlib import Path
Expand All @@ -11,6 +13,37 @@
START_YEAR = 2014
END_YEAR = 2031

# Read hashes used to see which years can be skipped
with open(Path(CUR_PATH, "..", "datahashes.json")) as f:
HASHES = json.load(f)["cps"]

# compare hashes of all files used in stage 2 to ensure they didn't change
file_paths = [
    Path(CUR_PATH, "..", "data", "cps.csv.gz"),
    Path(CUR_PATH, "solver.jl"),
    Path(CUR_PATH, "dataprep.py"),
    Path(CUR_PATH, "stage2.py"),
]
key_names = ["data", "solver", "dataprep", "stage2"]
files_match = True
for key, file_path in zip(key_names, file_paths):
    with open(file_path, "rb") as f:
        file_hash = hashlib.sha256(f.read()).hexdigest()
    files_match = HASHES[key] == file_hash
    if not files_match:
        # report which input invalidated the cache, matching the
        # equivalent check in puf_stage2/stage2.py
        print(f"{key} has changed")
        break

# Read current factors and targets
CUR_FACTORS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_I_factors.csv",
index_col=0,
).transpose()
CUR_TARGETS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/cps_stage1/stage_2_targets.csv",
index_col=0,
)
CUR_WEIGHTS = pd.read_csv(Path(CUR_PATH, "cps_weights.csv.gz"))


def main():
"""
Expand All @@ -21,12 +54,20 @@ def main():
)
cps = cps.fillna(0.0)
stage_1_factors = pd.read_csv(STAGE_1_PATH, index_col=0)
_factors = stage_1_factors.transpose()
stage_2_targets = pd.read_csv(STAGE_2_PATH, index_col=0)
# DataFrame for holding each year's weights
weights = pd.DataFrame()

# write .npz input files for solver
skipped_years = []
for year in range(START_YEAR, END_YEAR + 1):
factor_match = _factors[year].equals(CUR_FACTORS[year])
target_match = stage_2_targets[f"{year}"].equals(CUR_TARGETS[f"{year}"])
if files_match and factor_match and target_match:
print(f"Skipping {year}")
skipped_years.append(year)
continue
dataprep(cps, stage_1_factors, stage_2_targets, year)

# Solver (in Julia)
Expand All @@ -35,7 +76,9 @@ def main():

# write output files to dataframe columns
for year in range(START_YEAR, END_YEAR + 1):

if year in skipped_years:
weights[f"WT{year}"] = CUR_WEIGHTS[f"WT{year}"]
continue
s006 = np.where(
cps.e02400 > 0,
cps.s006 * stage_1_factors["APOPSNR"][year],
Expand Down
14 changes: 14 additions & 0 deletions datahashes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"puf": {
"data": "b246b86245544f0c4fb362f331dd9c9f0f3805991117a3bb91a2a46c4a505fe8",
"solver": "435ab7f39d4b7dd8b12c19f43978c026c39a5fc5af9dd4a88b925287feecc6e9",
"dataprep": "85a9e87a9c978f1e4d558ddc4ecd19a7fe7ffc82883b7306f6857a48a8b6eb00",
"stage2": "4e7dff40ab434ae30a6349d769d33695007b08eb120ea784661c646a62a6bfaa"
},
"cps": {
"data": "492ead49db94fc4bb4109c33a6c9679aa32c41042e715333cc84df1fe49e578d",
"solver": "0d36a53fbec8850c29b109c309e41648d0737a80643def944efc59d9c804034b",
"dataprep": "a95922179111b9a78f91dc7bc2aeb28a3312900f518d63b47be28910a9b8b2b6",
"stage2": "6de103898e065b0847e6fdd2012196bbf194132d62d0ed05f9e5d64fcb4f26a4"
}
}
34 changes: 34 additions & 0 deletions inithash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
This script creates the initial hashes for each stage 2 file
"""
import json
import hashlib
from pathlib import Path

CUR_PATH = Path(__file__).resolve().parent


def create_hashes(_file):
    """
    Compute the SHA-256 digest of every file involved in the stage 2
    process for the given data file, returned as a dict keyed by role
    ("data", "solver", "dataprep", "stage2").
    """
    # the PUF-matched data file is processed by puf_stage2; everything
    # else goes through cps_stage2
    stage_dir = "puf_stage2" if _file == "cps-matched-puf.csv" else "cps_stage2"
    basepath = Path(CUR_PATH, stage_dir)
    with open(Path(CUR_PATH, "data", _file), "rb") as f:
        hashes = {"data": hashlib.sha256(f.read()).hexdigest()}
    for key, filename in [
        ("solver", "solver.jl"),
        ("dataprep", "dataprep.py"),
        ("stage2", "stage2.py"),
    ]:
        with open(Path(basepath, filename), "rb") as f:
            hashes[key] = hashlib.sha256(f.read()).hexdigest()
    return hashes


# build the hash manifest for both the PUF- and CPS-based stage 2
# pipelines and persist it for later change detection
finalhashes = {
    "puf": create_hashes("cps-matched-puf.csv"),
    "cps": create_hashes("cps.csv.gz"),
}
with open(Path(CUR_PATH, "datahashes.json"), "w") as f:
    json.dump(finalhashes, f, indent=4)
9 changes: 8 additions & 1 deletion puf_stage2/solver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,14 @@ function Solve_func(year, tol)

println("\nSolving weights for $year ...\n\n")

array = npzread(string(year, "_input.npz"))
# we only solve the weights for years where the targets have changed. If the
# targets have not changed, we don't write the _input.npz file
if isfile(string(year, "_input.npz"))
array = npzread(string(year, "_input.npz"))
else
println("Skipping solver for $year \n")
return nothing
end

A1 = array["A1"]
A2 = array["A2"]
Expand Down
53 changes: 48 additions & 5 deletions puf_stage2/stage2.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,70 @@
import os
import glob
import json
import hashlib
import numpy as np
import pandas as pd
from pathlib import Path
from dataprep import dataprep


CUR_PATH = os.path.abspath(os.path.dirname(__file__))
CUR_PATH = Path(__file__).resolve().parent

# Read hashes used to see which years can be skipped
with open(Path(CUR_PATH, "..", "datahashes.json")) as f:
HASHES = json.load(f)["puf"]

# compare hashes of all files used in stage 2 to ensure they didn't change
file_paths = [
Path(CUR_PATH, "..", "data", "cps-matched-puf.csv"),
Path(CUR_PATH, "solver.jl"),
Path(CUR_PATH, "dataprep.py"),
Path(CUR_PATH, "stage2.py"),
]
key_names = ["data", "solver", "dataprep", "stage2"]
files_match = True
for key, file_path in zip(key_names, file_paths):
with open(file_path, "rb") as f:
file_hash = hashlib.sha256(f.read()).hexdigest()
files_match = HASHES[key] == file_hash
if not files_match:
print(f"{key} has changed")
break

# Read current factors and targets
CUR_FACTORS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_I_factors.csv",
index_col=0,
).transpose()
CUR_TARGETS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_II_targets.csv",
index_col=0,
)
CUR_WEIGHTS = pd.read_csv(Path(CUR_PATH, "puf_weights.csv.gz"))
# Read private CPS-matched-PUF file into a Pandas DataFrame
puf = pd.read_csv(os.path.join(CUR_PATH, "../data/cps-matched-puf.csv"))
puf = pd.read_csv(Path(CUR_PATH, "..", "data", "cps-matched-puf.csv"))

# Read stage1 factors and stage2 targets written by stage1.py script
factors = pd.read_csv(
os.path.join(CUR_PATH, "../puf_stage1/Stage_I_factors.csv"), index_col=0
Path(CUR_PATH, "..", "puf_stage1", "Stage_I_factors.csv"), index_col=0
)
Stage_I_factors = factors.transpose()
stage2_path = os.path.join(CUR_PATH, "../puf_stage1/Stage_II_targets.csv")
stage2_path = Path(CUR_PATH, "..", "puf_stage1", "Stage_II_targets.csv")
Stage_II_targets = pd.read_csv(stage2_path, index_col=0)

# Use the matched_weight variable in CPS as the final weight
puf.s006 = puf.matched_weight * 100


# Dataprep
year_list = [x for x in range(2012, 2031 + 1)]
skipped_years = []
for i in year_list:
factor_match = Stage_I_factors[i].equals(CUR_FACTORS[i])
target_match = Stage_II_targets[f"{i}"].equals(CUR_TARGETS[f"{i}"])
if files_match and factor_match and target_match:
print(f"Skipping {i}")
skipped_years.append(i)
continue
dataprep(puf, Stage_I_factors, Stage_II_targets, year=i)

# Solver (in Julia)
Expand All @@ -38,6 +78,9 @@

# write solution to dataframe
for i in year_list:
if i in skipped_years:
z[f"WT{i}"] = CUR_WEIGHTS[f"WT{i}"]
continue
s006 = np.where(
puf.e02400 > 0,
puf.s006 * Stage_I_factors[i]["APOPSNR"] / 100,
Expand Down

0 comments on commit c949e40

Please sign in to comment.