Skip to content

Commit

Permalink
Merge pull request #390 from andersonfrailey/skipyears
Browse files Browse the repository at this point in the history
Skip Stage 2 years
  • Loading branch information
andersonfrailey authored Oct 27, 2021
2 parents bebc05d + b55bafa commit c949e40
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 15 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ __pycache__/

# IRS-SOI PUF and related CPS matching data files
puf*.csv
*puf.csv
puf.csv*
cps-matched-puf.csv
StatMatch/Matching/puf2011.csv
cpsmar2016.csv
Expand Down
2 changes: 0 additions & 2 deletions cps_stage2/dataprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,6 @@ def target(target_val, pop, factor, value):
vstack_vars.append(lhs_vars[var])
t = rhs_vars[var]
b.append(t)
# print(f'{var:14} {t:0.2f}') uncomment when moving to 3.6
print("{:14} {:0.2f}".format(var, t))

vstack_vars = tuple(vstack_vars)
one_half_lhs = np.vstack(vstack_vars)
Expand Down
15 changes: 9 additions & 6 deletions cps_stage2/solver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@ using LinearAlgebra

function Solve_func(year, tol)

println("\nSolving weights for $year ...\n\n")

array = npzread(string(year, "_input.npz"))

# ddir = "/home/donboyd/Documents/python_projects/taxdata/puf_stage2/"
# array = npzread(string(ddir, year, "_input.npz"))
println("Solving weights for $year ...\n\n")
# we only solve the weights for years where the targets have changed. If the
# targets have not changed, we don't write the _input.npz file
if isfile(string(year, "_input.npz"))
array = npzread(string(year, "_input.npz"))
else
println("Skipping solver for $year \n")
return nothing
end

A1 = array["A1"]
A2 = array["A2"]
Expand Down
45 changes: 44 additions & 1 deletion cps_stage2/stage2.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import glob
import json
import hashlib
import numpy as np
import pandas as pd
from pathlib import Path
Expand All @@ -11,6 +13,37 @@
START_YEAR = 2014
END_YEAR = 2031

# Read hashes used to see which years can be skipped
with open(Path(CUR_PATH, "..", "datahashes.json")) as f:
HASHES = json.load(f)["cps"]

# compare hashes of all files used in stage 2 to ensure they didn't change
file_paths = [
    Path(CUR_PATH, "..", "data", "cps.csv.gz"),
    Path(CUR_PATH, "solver.jl"),
    Path(CUR_PATH, "dataprep.py"),
    Path(CUR_PATH, "stage2.py"),
]
key_names = ["data", "solver", "dataprep", "stage2"]
files_match = True
for key, file_path in zip(key_names, file_paths):
    with open(file_path, "rb") as f:
        file_hash = hashlib.sha256(f.read()).hexdigest()
    files_match = HASHES[key] == file_hash
    if not files_match:
        # report which input invalidated the cache, matching the
        # equivalent check in puf_stage2/stage2.py
        print(f"{key} has changed")
        break

# Read current factors and targets
CUR_FACTORS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_I_factors.csv",
index_col=0,
).transpose()
CUR_TARGETS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/cps_stage1/stage_2_targets.csv",
index_col=0,
)
CUR_WEIGHTS = pd.read_csv(Path(CUR_PATH, "cps_weights.csv.gz"))


def main():
"""
Expand All @@ -21,12 +54,20 @@ def main():
)
cps = cps.fillna(0.0)
stage_1_factors = pd.read_csv(STAGE_1_PATH, index_col=0)
_factors = stage_1_factors.transpose()
stage_2_targets = pd.read_csv(STAGE_2_PATH, index_col=0)
# DataFrame for holding each year's weights
weights = pd.DataFrame()

# write .npz input files for solver
skipped_years = []
for year in range(START_YEAR, END_YEAR + 1):
factor_match = _factors[year].equals(CUR_FACTORS[year])
target_match = stage_2_targets[f"{year}"].equals(CUR_TARGETS[f"{year}"])
if files_match and factor_match and target_match:
print(f"Skipping {year}")
skipped_years.append(year)
continue
dataprep(cps, stage_1_factors, stage_2_targets, year)

# Solver (in Julia)
Expand All @@ -35,7 +76,9 @@ def main():

# write output files to dataframe columns
for year in range(START_YEAR, END_YEAR + 1):

if year in skipped_years:
weights[f"WT{year}"] = CUR_WEIGHTS[f"WT{year}"]
continue
s006 = np.where(
cps.e02400 > 0,
cps.s006 * stage_1_factors["APOPSNR"][year],
Expand Down
14 changes: 14 additions & 0 deletions datahashes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"puf": {
"data": "b246b86245544f0c4fb362f331dd9c9f0f3805991117a3bb91a2a46c4a505fe8",
"solver": "435ab7f39d4b7dd8b12c19f43978c026c39a5fc5af9dd4a88b925287feecc6e9",
"dataprep": "85a9e87a9c978f1e4d558ddc4ecd19a7fe7ffc82883b7306f6857a48a8b6eb00",
"stage2": "4e7dff40ab434ae30a6349d769d33695007b08eb120ea784661c646a62a6bfaa"
},
"cps": {
"data": "492ead49db94fc4bb4109c33a6c9679aa32c41042e715333cc84df1fe49e578d",
"solver": "0d36a53fbec8850c29b109c309e41648d0737a80643def944efc59d9c804034b",
"dataprep": "a95922179111b9a78f91dc7bc2aeb28a3312900f518d63b47be28910a9b8b2b6",
"stage2": "6de103898e065b0847e6fdd2012196bbf194132d62d0ed05f9e5d64fcb4f26a4"
}
}
34 changes: 34 additions & 0 deletions inithash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
This script creates the initial hashes for each stage 2 file
"""
import json
import hashlib
from pathlib import Path

CUR_PATH = Path(__file__).resolve().parent


def create_hashes(_file):
    """
    Compute the SHA-256 digest of every file involved in the stage 2
    process for the given data file, returned as a dict keyed by role
    ("data", "solver", "dataprep", "stage2").
    """
    # the PUF-matched data file is processed by puf_stage2; everything
    # else goes through cps_stage2
    stage_dir = "puf_stage2" if _file == "cps-matched-puf.csv" else "cps_stage2"
    basepath = Path(CUR_PATH, stage_dir)
    with open(Path(CUR_PATH, "data", _file), "rb") as f:
        hashes = {"data": hashlib.sha256(f.read()).hexdigest()}
    for key, filename in [
        ("solver", "solver.jl"),
        ("dataprep", "dataprep.py"),
        ("stage2", "stage2.py"),
    ]:
        with open(Path(basepath, filename), "rb") as f:
            hashes[key] = hashlib.sha256(f.read()).hexdigest()
    return hashes


# build the hash manifest for both the PUF- and CPS-based stage 2
# pipelines and persist it for later change detection
finalhashes = {
    "puf": create_hashes("cps-matched-puf.csv"),
    "cps": create_hashes("cps.csv.gz"),
}
with open(Path(CUR_PATH, "datahashes.json"), "w") as f:
    json.dump(finalhashes, f, indent=4)
9 changes: 8 additions & 1 deletion puf_stage2/solver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,14 @@ function Solve_func(year, tol)

println("\nSolving weights for $year ...\n\n")

array = npzread(string(year, "_input.npz"))
# we only solve the weights for years where the targets have changed. If the
# targets have not changed, we don't write the _input.npz file
if isfile(string(year, "_input.npz"))
array = npzread(string(year, "_input.npz"))
else
println("Skipping solver for $year \n")
return nothing
end

A1 = array["A1"]
A2 = array["A2"]
Expand Down
53 changes: 48 additions & 5 deletions puf_stage2/stage2.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,70 @@
import os
import glob
import json
import hashlib
import numpy as np
import pandas as pd
from pathlib import Path
from dataprep import dataprep


CUR_PATH = os.path.abspath(os.path.dirname(__file__))
CUR_PATH = Path(__file__).resolve().parent

# Read hashes used to see which years can be skipped
with open(Path(CUR_PATH, "..", "datahashes.json")) as f:
HASHES = json.load(f)["puf"]

# compare hashes of all files used in stage 2 to ensure they didn't change
file_paths = [
Path(CUR_PATH, "..", "data", "cps-matched-puf.csv"),
Path(CUR_PATH, "solver.jl"),
Path(CUR_PATH, "dataprep.py"),
Path(CUR_PATH, "stage2.py"),
]
key_names = ["data", "solver", "dataprep", "stage2"]
files_match = True
for key, file_path in zip(key_names, file_paths):
with open(file_path, "rb") as f:
file_hash = hashlib.sha256(f.read()).hexdigest()
files_match = HASHES[key] == file_hash
if not files_match:
print(f"{key} has changed")
break

# Read current factors and targets
CUR_FACTORS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_I_factors.csv",
index_col=0,
).transpose()
CUR_TARGETS = pd.read_csv(
"https://raw.githubusercontent.com/PSLmodels/taxdata/master/puf_stage1/Stage_II_targets.csv",
index_col=0,
)
CUR_WEIGHTS = pd.read_csv(Path(CUR_PATH, "puf_weights.csv.gz"))
# Read private CPS-matched-PUF file into a Pandas DataFrame
puf = pd.read_csv(os.path.join(CUR_PATH, "../data/cps-matched-puf.csv"))
puf = pd.read_csv(Path(CUR_PATH, "..", "data", "cps-matched-puf.csv"))

# Read stage1 factors and stage2 targets written by stage1.py script
factors = pd.read_csv(
os.path.join(CUR_PATH, "../puf_stage1/Stage_I_factors.csv"), index_col=0
Path(CUR_PATH, "..", "puf_stage1", "Stage_I_factors.csv"), index_col=0
)
Stage_I_factors = factors.transpose()
stage2_path = os.path.join(CUR_PATH, "../puf_stage1/Stage_II_targets.csv")
stage2_path = Path(CUR_PATH, "..", "puf_stage1", "Stage_II_targets.csv")
Stage_II_targets = pd.read_csv(stage2_path, index_col=0)

# Use the matched_weight variable in CPS as the final weight
puf.s006 = puf.matched_weight * 100


# Dataprep
year_list = [x for x in range(2012, 2031 + 1)]
skipped_years = []
for i in year_list:
factor_match = Stage_I_factors[i].equals(CUR_FACTORS[i])
target_match = Stage_II_targets[f"{i}"].equals(CUR_TARGETS[f"{i}"])
if files_match and factor_match and target_match:
print(f"Skipping {i}")
skipped_years.append(i)
continue
dataprep(puf, Stage_I_factors, Stage_II_targets, year=i)

# Solver (in Julia)
Expand All @@ -38,6 +78,9 @@

# write solution to dataframe
for i in year_list:
if i in skipped_years:
z[f"WT{i}"] = CUR_WEIGHTS[f"WT{i}"]
continue
s006 = np.where(
puf.e02400 > 0,
puf.s006 * Stage_I_factors[i]["APOPSNR"] / 100,
Expand Down

0 comments on commit c949e40

Please sign in to comment.