diff --git a/setup.py b/setup.py
index a7330d3f..933d0f84 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,6 @@
         "tensorboard",
         "jupyter-book",
         "furo",
-        "survey_enhance",
+        "scikit-learn",
     ],
 )
diff --git a/tax_microdata_benchmarking/create_taxcalc_input_variables.py b/tax_microdata_benchmarking/create_taxcalc_input_variables.py
index e35a0432..95ef00fb 100644
--- a/tax_microdata_benchmarking/create_taxcalc_input_variables.py
+++ b/tax_microdata_benchmarking/create_taxcalc_input_variables.py
@@ -2,9 +2,22 @@
 Construct tmd.csv, a Tax-Calculator-style input variable file for 2021.
 """
 
+import taxcalc as tc
+from tax_microdata_benchmarking.datasets.tmd import create_tmd_2021
+from tax_microdata_benchmarking.utils.qbi import (
+    add_pt_w2_wages,
+)
+from tax_microdata_benchmarking.imputation_assumptions import (
+    IMPUTATION_RF_RNG_SEED,
+    IMPUTATION_BETA_RNG_SEED,
+    W2_WAGES_SCALE,
+)
+from tax_microdata_benchmarking.storage import STORAGE_FOLDER
+
+
 TAXYEAR = 2021
+INITIAL_W2_WAGES_SCALE = W2_WAGES_SCALE
 DO_REWEIGHTING = True
-INITIAL_W2_WAGES_SCALE = 0.19980
 INCLUDE_ORIGINAL_WEIGHTS = True
 
 
@@ -12,29 +25,26 @@ def create_variable_file(write_file=True):
     """
     Create Tax-Calculator-style input variable file for TAXYEAR.
     """
-    import taxcalc as tc
-    from tax_microdata_benchmarking.datasets.tmd import create_tmd_2021
-    from tax_microdata_benchmarking.utils.qbi import (
-        add_pt_w2_wages,
-    )
-    from tax_microdata_benchmarking.storage import STORAGE_FOLDER
-
     # construct dataframe containing input and output variables
-    print(f"Creating {TAXYEAR} PUF-ECPS file assuming:")
-    print(f"  DO_REWEIGHTING = {DO_REWEIGHTING}")
+    print(f"Creating {TAXYEAR} PUF+CPS file assuming:")
+    print(f"  IMPUTATION_RF_RNG_SEED = {IMPUTATION_RF_RNG_SEED}")
+    print(f"  IMPUTATION_BETA_RNG_SEED = {IMPUTATION_BETA_RNG_SEED}")
     print(f"  INITIAL_W2_WAGES_SCALE = {INITIAL_W2_WAGES_SCALE:.5f}")
+    print(f"  DO_REWEIGHTING = {DO_REWEIGHTING}")
     print(f"  INCLUDE_ORIGINAL_WEIGHTS = {INCLUDE_ORIGINAL_WEIGHTS}")
     vdf = create_tmd_2021()
     vdf.FLPDYR = TAXYEAR
     (vdf, pt_w2_wages_scale) = add_pt_w2_wages(vdf)
     abs_diff = abs(pt_w2_wages_scale - INITIAL_W2_WAGES_SCALE)
-    if abs_diff > 1e-6:
-        msg = (
-            f"\nFINAL vs INITIAL scale diff = {abs_diff:.6f}"
-            f"\n  INITIAL pt_w2_wages_scale = {INITIAL_W2_WAGES_SCALE:.6f}"
-            f"\n    FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}"
-        )
-        raise ValueError(msg)
+    msg = (
+        f"  FINAL vs INITIAL scale diff = {abs_diff:.6f}\n"
+        f"  INITIAL pt_w2_wages_scale = {INITIAL_W2_WAGES_SCALE:.6f}\n"
+        f"    FINAL pt_w2_wages_scale = {pt_w2_wages_scale:.6f}"
+    )
+    print(msg)
+    if abs_diff > 1e-3:
+        emsg = "INITIAL and FINAL scale values are substantially inconsistent"
+        raise ValueError(emsg)
     # streamline dataframe so that it includes only input variables
     rec = tc.Records(
         data=vdf,
@@ -59,6 +69,7 @@ def create_variable_file(write_file=True):
     # write streamlined variables dataframe to CSV-formatted file
     if write_file:
         tmd_csv_fname = STORAGE_FOLDER / "output" / "tmd.csv.gz"
+        print(f"Writing PUF+CPS file... [{tmd_csv_fname}]")
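A compressed sketch of the relaxed consistency check introduced above, with hypothetical values (the 0.19981 stand-in for the re-solved scale is invented; the 1e-3 tolerance and messages are from this diff):

```python
# Hypothetical values illustrating the new report-always, fail-on-drift check.
INITIAL_W2_WAGES_SCALE = 0.19979  # W2_WAGES_SCALE from imputation_assumptions
pt_w2_wages_scale = 0.19981  # pretend value re-solved by add_pt_w2_wages
abs_diff = abs(pt_w2_wages_scale - INITIAL_W2_WAGES_SCALE)
print(f"  FINAL vs INITIAL scale diff = {abs_diff:.6f}")  # always printed now
if abs_diff > 1e-3:  # raise only on substantial inconsistency
    raise ValueError(
        "INITIAL and FINAL scale values are substantially inconsistent"
    )
```

The old check raised on any deviation above 1e-6; the new one always reports the comparison and fails only when the centrally assumed scale has drifted materially from the re-solved value.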
        vdf.to_csv(tmd_csv_fname, index=False, float_format="%.2f")
diff --git a/tax_microdata_benchmarking/datasets/cps.py b/tax_microdata_benchmarking/datasets/cps.py
index d7b99f21..1d8e4ca6 100644
--- a/tax_microdata_benchmarking/datasets/cps.py
+++ b/tax_microdata_benchmarking/datasets/cps.py
@@ -1,6 +1,5 @@
 from io import BytesIO
 from zipfile import ZipFile
-from policyengine_core.data import Dataset
 import pandas as pd
 import requests
 from tqdm import tqdm
@@ -714,13 +713,10 @@ def add_household_variables(cps: h5py.File, household: DataFrame) -> None:
 
 def add_previous_year_income(self, cps: h5py.File) -> None:
     if self.previous_year_raw_cps is None:
-        print(
-            "No previous year data available for this dataset, skipping previous year income imputation."
-        )
+        msg = "Skipping CPS previous year income imputation given lack of data"
+        print(f"{msg}...")
         return
 
-    from survey_enhance.impute import Imputation
-
     cps_current_year_data = self.raw_cps(require=True).load()
     cps_previous_year_data = self.previous_year_raw_cps(require=True).load()
     cps_previous_year = cps_previous_year_data.person.set_index(
diff --git a/tax_microdata_benchmarking/datasets/puf.py b/tax_microdata_benchmarking/datasets/puf.py
index cbf889bb..076de76b 100644
--- a/tax_microdata_benchmarking/datasets/puf.py
+++ b/tax_microdata_benchmarking/datasets/puf.py
@@ -1,14 +1,17 @@
 import pandas as pd
 import numpy as np
 import yaml
-from survey_enhance import Imputation
 from microdf import MicroDataFrame
 from tax_microdata_benchmarking.storage import STORAGE_FOLDER
 from tax_microdata_benchmarking.utils.pension_contributions import (
     impute_pension_contributions_to_puf,
 )
-
-DEFAULT_W2_WAGE_RATE = 0.19824  # Solved for JCT Tax Expenditures in 2021
+from tax_microdata_benchmarking.utils.imputation import Imputation
+from tax_microdata_benchmarking.imputation_assumptions import (
+    IMPUTATION_RF_RNG_SEED,
+    IMPUTATION_BETA_RNG_SEED,
+    W2_WAGES_SCALE,
+)
 
 
 def impute_missing_demographics(
@@ -37,6 +40,9 @@ def impute_missing_demographics(
     ]
 
     demographics_from_puf = Imputation()
+    demographics_from_puf.rf_rng_seed = IMPUTATION_RF_RNG_SEED
+    demographics_from_puf.beta_rng_seed = IMPUTATION_BETA_RNG_SEED
+
     demographics_from_puf.train(
         X=puf_with_demographics[NON_DEMOGRAPHIC_VARIABLES],
         Y=puf_with_demographics[DEMOGRAPHIC_VARIABLES],
@@ -46,7 +52,7 @@
         ~puf.RECID.isin(puf_with_demographics.RECID)
     ].reset_index()
     predicted_demographics = demographics_from_puf.predict(
-        puf_without_demographics
+        X=puf_without_demographics,
     )
     puf_with_imputed_demographics = pd.concat(
         [puf_without_demographics, predicted_demographics], axis=1
@@ -174,8 +180,8 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     # Ignore f2441 (AMT form attached)
     # Ignore cmbtp (estimate of AMT income not in AGI)
     # Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
-    qbi = puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200
-    puf["w2_wages_from_qualified_business"] = qbi * DEFAULT_W2_WAGE_RATE
+    qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
+    puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE
 
     puf["filing_status"] = puf.MARS.map(
         {
@@ -264,15 +270,14 @@ def generate(self, puf: pd.DataFrame, demographics: pd.DataFrame):
         from tax_microdata_benchmarking.datasets.uprate_puf import uprate_puf
 
         if self.time_period > 2015:
-            print("Uprating PUF...")
             puf = uprate_puf(puf, 2015, self.time_period)
 
-        print("Loading and pre-processing PUF...")
+        print("Pre-processing PUF...")
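The PUF demographics imputation above now runs through the vendored `Imputation` class (added in `utils/imputation.py` below) with both seeds pinned. A self-contained sketch of that chained, multi-output pattern on synthetic data — the `income`/`age`/`dependents` frame is invented; the class, attributes, and seed names are from this diff (note the class uses the legacy `interpolation=` argument of `np.percentile`, so it requires NumPy < 2.0):

```python
import numpy as np
import pandas as pd
from tax_microdata_benchmarking.utils.imputation import Imputation
from tax_microdata_benchmarking.imputation_assumptions import (
    IMPUTATION_RF_RNG_SEED,
    IMPUTATION_BETA_RNG_SEED,
)

rng = np.random.default_rng(0)
X = pd.DataFrame({"income": rng.uniform(0, 1e5, 300)})  # invented data
Y = pd.DataFrame({"age": rng.integers(18, 80, 300).astype(float)})
Y["dependents"] = np.round(rng.uniform(0, 3, 300))

imp = Imputation()
imp.rf_rng_seed = IMPUTATION_RF_RNG_SEED  # pins the random forests
imp.beta_rng_seed = IMPUTATION_BETA_RNG_SEED  # pins the Beta quantile draws
imp.train(X=X, Y=Y)  # one model per Y column; later models see earlier Ys
imputed = imp.predict(X=pd.DataFrame({"income": [40e3]}))
print(imputed)  # columns: age, dependents; stable across runs
```

Because both seeds come from `imputation_assumptions.py`, repeated builds of the flat file impute identical values.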
print("Pre-processing PUF...") original_recid = puf.RECID.values.copy() puf = preprocess_puf(puf) - print("Imputing missing demographics...") + print("Imputing missing PUF demographics...") puf = impute_missing_demographics(puf, demographics) - print("Imputing pension contributions...") + print("Imputing PUF pension contributions...") puf["pre_tax_contributions"] = impute_pension_contributions_to_puf( puf[["employment_income"]] ) diff --git a/tax_microdata_benchmarking/datasets/taxcalc_dataset.py b/tax_microdata_benchmarking/datasets/taxcalc_dataset.py index db02cdb2..8aabe1df 100644 --- a/tax_microdata_benchmarking/datasets/taxcalc_dataset.py +++ b/tax_microdata_benchmarking/datasets/taxcalc_dataset.py @@ -14,6 +14,8 @@ def create_tc_dataset(pe_dataset: Type, year: int = 2015) -> pd.DataFrame: pe_sim = Microsimulation(dataset=pe_dataset) df = pd.DataFrame() + print(f"Creating tc dataset for year {year}...") + is_non_dep = ~pe_sim.calculate("is_tax_unit_dependent").values tax_unit = pe_sim.populations["tax_unit"] diff --git a/tax_microdata_benchmarking/datasets/tmd.py b/tax_microdata_benchmarking/datasets/tmd.py index f419a226..9aad3a99 100644 --- a/tax_microdata_benchmarking/datasets/tmd.py +++ b/tax_microdata_benchmarking/datasets/tmd.py @@ -13,16 +13,18 @@ def create_tmd_2021(): - if not CPS_2021().exists: - # Don't recreate if already exists. - create_cps_2021() - if not PUF_2021().exists: - # Don't recreate if already exists. - create_pe_puf_2021() + # always create CPS_2021 and PUF_2021 + # (because imputation assumptions may have changed) + create_cps_2021() + create_pe_puf_2021() + tc_puf_21 = create_tc_dataset(PUF_2021) tc_cps_21 = create_tc_dataset(CPS_2021) - # Add nonfiler flag to tc_cps_21 with 2022 filing rules (2021 had large changes) + print("Combining PUF and CPS nonfilers...") + + # Add nonfiler flag to tc_cps_21 with 2022 filing rules + # (2021 had large changes) from policyengine_us import Microsimulation sim = Microsimulation(dataset=CPS_2021) @@ -31,8 +33,6 @@ def create_tmd_2021(): combined = pd.concat([tc_puf_21, tc_cps_21], ignore_index=True) - print("Combined PUF and CPS nonfilers.") - # Add Tax-Calculator outputs print("Adding Tax-Calculator outputs...") combined = add_taxcalc_outputs(combined, 2021) @@ -40,8 +40,6 @@ def create_tmd_2021(): print("Reweighting...") combined = reweight(combined, 2021, weight_deviation_penalty=0) - print("Completed.") - return combined diff --git a/tax_microdata_benchmarking/datasets/uprate_puf.py b/tax_microdata_benchmarking/datasets/uprate_puf.py index f824a53e..b1519a35 100644 --- a/tax_microdata_benchmarking/datasets/uprate_puf.py +++ b/tax_microdata_benchmarking/datasets/uprate_puf.py @@ -134,13 +134,13 @@ def get_growth(variable, from_year, to_year): def uprate_puf(puf, from_year, to_year): + print(f"Uprating PUF from {from_year} to {to_year}...") puf = puf.copy() for variable in SOI_TO_PUF_STRAIGHT_RENAMES: growth = get_growth(variable, from_year, to_year) puf[SOI_TO_PUF_STRAIGHT_RENAMES[variable]] *= growth # Positive and negative split variables - for variable in SOI_TO_PUF_POS_ONLY_RENAMES: growth = get_growth(variable, from_year, to_year) puf_variable = SOI_TO_PUF_POS_ONLY_RENAMES[variable] @@ -151,20 +151,18 @@ def uprate_puf(puf, from_year, to_year): puf_variable = SOI_TO_PUF_NEG_ONLY_RENAMES[variable] puf[puf_variable][puf[puf_variable] < 0] *= growth - # Remaining variables, uprate purely by AGI growth (for now, because I'm not sure how to handle the deductions, credits and incomes separately) - + # Remaining 
     for variable in REMAINING_VARIABLES:
         growth = get_growth("adjusted_gross_income", from_year, to_year)
         puf[variable] *= growth
 
     # Uprate the weights
-
     returns_start = get_soi_aggregate("count", from_year, True)
     returns_end = get_soi_aggregate("count", to_year, True)
     puf.S006 *= returns_end / returns_start
 
-    print(f"Uprated PUF from {from_year} to {to_year}")
-
     return puf
diff --git a/tax_microdata_benchmarking/imputation_assumptions.py b/tax_microdata_benchmarking/imputation_assumptions.py
new file mode 100644
index 00000000..def44e66
--- /dev/null
+++ b/tax_microdata_benchmarking/imputation_assumptions.py
@@ -0,0 +1,9 @@
+"""
+Central location for data imputation assumptions.
+"""
+
+IMPUTATION_RF_RNG_SEED = 1928374  # random number seed used by RandomForest
+
+IMPUTATION_BETA_RNG_SEED = 37465  # random number seed used for Beta variates
+
+W2_WAGES_SCALE = 0.19979  # parameter used to impute pass-through W-2 wages
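Why two seeds: `rf_rng_seed` pins the forest's bootstrap sampling at train time, while `beta_rng_seed` pins the Beta(a, 1) quantile draws used at predict time, with a = mean_quantile / (1 - mean_quantile) per `utils/imputation.py` below. A quick determinism check under that assumption (the size-8 draw is illustrative):

```python
import numpy as np

mean_quantile = 0.5
a = mean_quantile / (1 - mean_quantile)  # a = 1 -> uniform quantile draws
draws1 = np.random.default_rng(seed=37465).beta(a, 1, size=8)
draws2 = np.random.default_rng(seed=37465).beta(a, 1, size=8)
assert np.allclose(draws1, draws2)  # same seed, same imputed quantiles
```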
diff --git a/tax_microdata_benchmarking/utils/imputation.py b/tax_microdata_benchmarking/utils/imputation.py
new file mode 100644
index 00000000..332e6fcc
--- /dev/null
+++ b/tax_microdata_benchmarking/utils/imputation.py
@@ -0,0 +1,375 @@
+from pathlib import Path
+from typing import List, Dict
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+from tqdm import tqdm
+
+
+def to_array(values) -> np.ndarray:
+    if isinstance(values, (pd.Series, pd.DataFrame)):
+        return values.values
+    return values
+
+
+def get_category_mapping(values: pd.Series) -> Dict[str, int]:
+    return {category: i for i, category in enumerate(values.unique())}
+
+
+class Imputation:
+    """
+    An `Imputation` represents a learned function
+    f(`input_variables`) -> `output_variables`.
+    """
+
+    models: List["ManyToOneImputation"]
+    """Each column of the output variables is predicted by a separate model,
+    stored in this list."""
+    X_columns: List[str]
+    """The names of the input variables."""
+    Y_columns: List[str]
+    """The names of the output variables."""
+    X_category_mappings: List[Dict[str, int]] = None
+    """The mapping from category names to integers for each input variable."""
+    rf_rng_seed: int = None
+    """The random number generator seed used by RandomForestRegressor."""
+    beta_rng_seed: int = None
+    """Random number generator seed used to generate Beta variates."""
+
+    def encode_categories(self, X: pd.DataFrame) -> pd.DataFrame:
+        if self.X_category_mappings is None:
+            self.X_category_mappings = {
+                i: (
+                    get_category_mapping(X[column])
+                    if X[column].dtype == "object"
+                    else None
+                )
+                for i, column in enumerate(X.columns)
+            }
+        X = X.copy()
+        for i, column in enumerate(X.columns):
+            if self.X_category_mappings.get(i) is not None:
+                X[column] = X[column].map(self.X_category_mappings[i])
+                # If we got NaNs, raise an exception with the offending values.
+                if X[column].isna().any():
+                    emsg = (
+                        f"Column {column} has NaNs after encoding categories. "
+                        "Unknown values: "
+                        f"{X[column][X[column].isnull()].unique()}"
+                    )
+                    raise ValueError(emsg)
+        return X
+
+    def train(
+        self,
+        X: pd.DataFrame,
+        Y: pd.DataFrame,
+        num_trees: int = 100,
+        verbose: bool = False,
+        sample_weight: pd.Series = None,
+    ):
+        """
+        Train a random forest model to predict the output variables
+        from the input variables.
+
+        Args:
+            X (pd.DataFrame): The dataset containing the input variables.
+            Y (pd.DataFrame): The dataset containing the output variables.
+            num_trees (int): The number of trees to use in the random forest.
+            verbose (bool): Whether to print progress.
+        """
+
+        self.X_columns = X.columns
+        self.Y_columns = Y.columns
+
+        X = self.encode_categories(X)
+
+        self.models = []
+        # We train a separate model for each output variable.
+        # For example, if X = [income, age] and Y = [height, weight],
+        # we train two models:
+        # 1. Predict height from income and age.
+        # 2. Predict weight from income, age and (predicted) height.
+
+        if verbose:
+            iterator = tqdm(range(len(Y.columns)), desc="Training models")
+        else:
+            iterator = range(len(Y.columns))
+        for i in iterator:
+            Y_columns = Y.columns[:i]
+            if i == 0:
+                X_ = to_array(X)
+            else:
+                X_ = to_array(pd.concat([X, Y[Y_columns]], axis=1))
+            y_ = to_array(Y[Y.columns[i]])
+            model = ManyToOneImputation()
+            model.rf_rng_seed = self.rf_rng_seed
+            model.beta_rng_seed = self.beta_rng_seed
+            model.encode_categories = self.encode_categories
+            model.train(
+                X_, y_, num_trees=num_trees, sample_weight=sample_weight
+            )
+            self.models.append(model)
+
+    def predict(
+        self,
+        X: pd.DataFrame,
+        mean_quantile: float = 0.5,
+        verbose: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Predict the output variables for the input dataset.
+
+        Args:
+            X (pd.DataFrame): The dataset to predict on.
+            mean_quantile (float): The mean quantile of the Beta draws
+                used in prediction.
+
+        Returns:
+            pd.DataFrame: The predicted dataset.
+        """
+
+        if isinstance(X, list):
+            X = pd.DataFrame(X, columns=self.X_columns)
+        X = pd.DataFrame(X, columns=self.X_columns)
+        X = to_array(self.encode_categories(X))
+        Y = np.zeros((X.shape[0], len(self.models)))
+        if verbose:
+            iterator = tqdm(enumerate(self.models), desc="Predicting")
+        else:
+            iterator = enumerate(self.models)
+        for i, model in iterator:
+            if isinstance(mean_quantile, list):
+                quantile = mean_quantile[i]
+            else:
+                quantile = mean_quantile
+            X_ = np.concatenate([X, Y[:, :i]], axis=1)
+            model.encode_categories = self.encode_categories
+            Y[:, i] = model.predict(X_, quantile)
+        return pd.DataFrame(Y, columns=self.Y_columns)
+
+    def save(self, path: str):
+        """
+        Save the imputation model to disk.
+
+        Args:
+            path (str): The path to save the model to.
+        """
+
+        import pickle
+
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "wb") as f:
+            # Store the models only in a dictionary.
+            data = dict(
+                models=self.models,
+                X_columns=self.X_columns,
+                X_category_mappings=self.X_category_mappings,
+                Y_columns=self.Y_columns,
+            )
+            pickle.dump(data, f)
+
+    @staticmethod
+    def load(path: str) -> "Imputation":
+        """
+        Load the imputation model from disk.
+
+        Args:
+            path (str): The path to load the model from.
+
+        Returns:
+            Imputation: The imputation model.
+        """
+
+        import pickle
+
+        imputation = Imputation()
+        with open(path, "rb") as f:
+            data = pickle.load(f)
+            imputation.models = data["models"]
+            imputation.X_columns = data["X_columns"]
+            imputation.X_category_mappings = data["X_category_mappings"]
+            imputation.Y_columns = data["Y_columns"]
+        for model in imputation.models:
+            model.encode_categories = imputation.encode_categories
+            model.X_category_mappings = imputation.X_category_mappings
+        return imputation
+
+    def solve_for_mean_quantiles(
+        self,
+        targets: list,
+        input_data: pd.DataFrame,
+        weights: pd.Series,
+        max_iterations: int = 10,
+    ):
+        mean_quantiles = []
+        input_data = input_data.copy()
+        for i, model in enumerate(self.models):
+            print(f"Imputing {self.Y_columns[i]}...")
+            mean_quantiles.append(
+                model.solve_for_mean_quantile(
+                    target=targets[i],
+                    input_df=input_data,
+                    weights=weights,
+                    verbose=True,
+                    max_iterations=max_iterations,
+                )
+            )
+            predicted_column = model.predict(input_data, mean_quantiles[-1])
+            input_data[self.Y_columns[i]] = predicted_column
+        return mean_quantiles
+
+
+class ManyToOneImputation:
+    """
+    An `Imputation` consists of a set of `ManyToOneImputation` models,
+    one for each output variable.
+    """
+
+    model: RandomForestRegressor
+    """The random forest model."""
+    is_integer_coded: bool = False
+    """Whether the output variable is integer coded."""
+    rf_rng_seed: int = None
+    """Random number generator seed used by RandomForestRegressor."""
+    beta_rng_seed: int = None
+    """Random number generator seed used to generate Beta variates."""
+
+    def train(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        sample_weight: pd.Series = None,
+        num_trees: int = 100,
+    ):
+        """
+        Train a random forest model to predict the output variable from
+        the input variables.
+
+        Args:
+            X (pd.DataFrame): The dataset containing the input variables.
+            y (pd.Series): The dataset containing the output variable.
+            sample_weight (pd.Series): The sample weights.
+        """
+
+        X = to_array(X)
+        y = to_array(y)
+        self.model = RandomForestRegressor(
+            n_estimators=num_trees,
+            bootstrap=True,
+            max_samples=0.01,
+            random_state=self.rf_rng_seed,
+        )
+        try:
+            self.is_integer_coded = (
+                isinstance(y[0], str) or (y - y.round()).mean() < 1e-3
+            )
+        except Exception:
+            pass
+        self.model.fit(X, y, sample_weight=sample_weight)
+
+    def predict(
+        self,
+        X: pd.DataFrame,
+        mean_quantile: float = 0.5,
+    ) -> pd.DataFrame:
+        """
+        Predict the output variable for the input dataset.
+
+        Args:
+            X (pd.DataFrame): dataset to predict on.
+            mean_quantile (float): mean quantile under the Beta distribution.
+
+        Returns:
+            pd.Series: The predicted distribution of values for each input row.
+        """
+        if isinstance(X, pd.DataFrame) and any(
+            [X[column].dtype == "O" for column in X.columns]
+        ):
+            X = self.encode_categories(X)
+        X = to_array(X)
+        tree_predictions = [
+            tree.predict(X) for tree in self.model.estimators_
+        ]
+
+        # Get the percentiles of the predictions.
+        tree_predictions = np.array(tree_predictions).transpose()
+        if mean_quantile is None:
+            mean_quantile = 0.5
+        a = mean_quantile / (1 - mean_quantile)
+        random_generator = np.random.default_rng(seed=self.beta_rng_seed)
+        input_quantiles = random_generator.beta(
+            a, 1, size=tree_predictions.shape[0]
+        )
+        x = np.apply_along_axis(
+            lambda x: np.percentile(
+                x[1:], x[0], interpolation="nearest"
+            ),  # Privacy??
+            1,
+            np.concatenate(
+                [
+                    np.array(input_quantiles)[:, np.newaxis] * 100,
+                    tree_predictions,
+                ],
+                axis=1,
+            ),
+        )
+
+        if self.is_integer_coded:
+            x = np.round(x).astype(int)
+
+        return x
+
+    def solve_for_mean_quantile(
+        self,
+        target: float,
+        input_df: pd.DataFrame,
+        weights: np.ndarray,
+        max_iterations: int = 10,
+        verbose: bool = False,
+    ):
+        """
+        Solve for the mean quantile that produces the target value.
+
+        Args:
+            target (float): The target value.
+            input_df (pd.DataFrame): The input dataset.
+            weights (np.ndarray): The sample weights.
+            max_iterations (int, optional): The maximum number of iterations.
+                Defaults to 10.
+            verbose (bool, optional): Whether to print the loss at each
+                iteration. Defaults to False.
+
+        Returns:
+            float: The mean quantile.
+        """
+
+        def loss(mean_quantile):
+            pred_values = self.predict(input_df, mean_quantile)
+            pred_aggregate = (pred_values * weights).sum()
+            msg = (
+                f"PREDICTED: {pred_aggregate/1e9:.1f} "
+                f"(target: {target/1e9:.1f})"
+            )
+            print(msg)
+            return (pred_aggregate - target) ** 2, pred_aggregate
+
+        best_loss = float("inf")
+        min_quantile = 0
+        max_quantile = 1
+
+        # Binary search for the mean quantile.
+        for i in range(max_iterations):
+            mean_quantile = (min_quantile + max_quantile) / 2
+            loss_value, pred_agg = loss(mean_quantile)
+            if verbose:
+                msg = (
+                    f"Iteration {i}: {mean_quantile:.4f} "
+                    f"(loss: {loss_value:.4f})"
+                )
+                print(msg)
+            if loss_value < best_loss:
+                best_loss = loss_value
+            if pred_agg < target:
+                min_quantile = mean_quantile
+            else:
+                max_quantile = mean_quantile
+        return mean_quantile
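`solve_for_mean_quantile` above is a bisection on the mean quantile, relying on the weighted aggregate being monotone in it. A standalone toy with a linear stand-in for the forest's weighted aggregate (the 100.0 response and 62.5 target are invented):

```python
# Toy monotone aggregate standing in for (self.predict(...) * weights).sum().
def toy_aggregate(mean_quantile: float) -> float:
    return 100.0 * mean_quantile

target = 62.5
min_q, max_q = 0.0, 1.0
for _ in range(10):  # mirrors max_iterations=10
    mid = (min_q + max_q) / 2
    if toy_aggregate(mid) < target:
        min_q = mid
    else:
        max_q = mid
print(round(mid, 4))  # converges toward 0.625
```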
diff --git a/tax_microdata_benchmarking/utils/pension_contributions.py b/tax_microdata_benchmarking/utils/pension_contributions.py
index a081f784..94c0b9ff 100644
--- a/tax_microdata_benchmarking/utils/pension_contributions.py
+++ b/tax_microdata_benchmarking/utils/pension_contributions.py
@@ -1,6 +1,11 @@
-from survey_enhance import Imputation
+import numpy as np
 from policyengine_us import Microsimulation
 from tax_microdata_benchmarking.datasets.cps import CPS_2021
+from tax_microdata_benchmarking.utils.imputation import Imputation
+from tax_microdata_benchmarking.imputation_assumptions import (
+    IMPUTATION_RF_RNG_SEED,
+    IMPUTATION_BETA_RNG_SEED,
+)
 
 
 def impute_pension_contributions_to_puf(puf_df):
@@ -11,9 +16,14 @@ def impute_pension_contributions_to_puf(puf_df):
     )
 
     pension_contributions = Imputation()
+    pension_contributions.rf_rng_seed = IMPUTATION_RF_RNG_SEED
+    pension_contributions.beta_rng_seed = IMPUTATION_BETA_RNG_SEED
+
     pension_contributions.train(
         X=cps_df[["employment_income"]],
         Y=cps_df[["pre_tax_contributions"]],
         sample_weight=cps_df["household_weight"],
     )
-    return pension_contributions.predict(X=puf_df[["employment_income"]])
+    return pension_contributions.predict(
+        X=puf_df[["employment_income"]],
+    )
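The pension imputation above is the simplest use of the class: one input column, one output column, weighted training. A synthetic-data sketch (the frame, the 5% contribution rate, and the weights are all invented):

```python
import numpy as np
import pandas as pd
from tax_microdata_benchmarking.utils.imputation import Imputation

rng = np.random.default_rng(0)
cps = pd.DataFrame({"employment_income": rng.uniform(0, 2e5, 500)})
cps["pre_tax_contributions"] = 0.05 * cps.employment_income
weights = pd.Series(rng.uniform(50, 5_000, 500))  # invented household weights

imp = Imputation()
imp.train(
    X=cps[["employment_income"]],
    Y=cps[["pre_tax_contributions"]],
    sample_weight=weights,  # high-weight households count more in the fit
)
# Note: outputs may be rounded by the class's integer-coding heuristic.
print(imp.predict(X=pd.DataFrame({"employment_income": [30e3, 90e3]})))
```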
diff --git a/tax_microdata_benchmarking/utils/qbi.py b/tax_microdata_benchmarking/utils/qbi.py
index d6215916..b156acdd 100644
--- a/tax_microdata_benchmarking/utils/qbi.py
+++ b/tax_microdata_benchmarking/utils/qbi.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 import pandas as pd
-from scipy.optimize import minimize, bisect
+from scipy.optimize import bisect
 import taxcalc as tc
 
@@ -21,7 +21,7 @@ def add_pt_w2_wages(df, verbose: bool = True):
         pt_w2_wages_scale: rounded to five decimal digits
     """
     if verbose:
-        print("Finding scale to use in imputing pass-through W-2 wages")
+        print("Finding scale to use in imputing pass-through W-2 wages...")
     QBID_TOTAL = 205.8  # from IRS SOI P4801 tabulations of 2021 data (in $B)
     qbi = np.maximum(0, df.e00900 + df.e26270 + df.e02100 + df.e27200)
@@ -43,10 +43,10 @@ def deduction_deviation(scale):
         qbided = (sim.array("qbided") * df.s006).sum() / 1e9
         dev = qbided - QBID_TOTAL
         if verbose:
-            print(f"scale: {scale:8.6f}, dev: {dev:6.2f}, tot: {qbided:.2f}")
+            print(f"scale: {scale:8.6f}, dev: {dev:7.3f}, tot: {qbided:.3f}")
         return dev
 
-    scale = bisect(deduction_deviation, 0.1, 0.5, rtol=0.001)
+    scale = bisect(deduction_deviation, 0.1, 0.5, rtol=0.0001)
     rounded_scale = round(scale, 5)
     print(f"Final (rounded) scale: {rounded_scale}")
     df["PT_binc_w2_wages"] = qbi * rounded_scale
diff --git a/tax_microdata_benchmarking/utils/reweight.py b/tax_microdata_benchmarking/utils/reweight.py
index 762e93fb..0b249702 100644
--- a/tax_microdata_benchmarking/utils/reweight.py
+++ b/tax_microdata_benchmarking/utils/reweight.py
@@ -60,6 +60,7 @@ def reweight(
 
     if time_period not in targets.year.unique():
         raise ValueError(f"Year {time_period} not in targets.")
+    print(f"...reweighting for year {time_period}")
 
     def build_loss_matrix(df):
         loss_matrix = pd.DataFrame()
@@ -158,7 +159,7 @@ def build_loss_matrix(df):
                 i,
             )
 
-    print("Finished optimisation")
+    print("...reweighting finished")
     flat_file["s006"] = weights.detach().numpy()
     return flat_file
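For reference, the calibration qbi.py performs: `bisect` drives the simulated QBI deduction total to the SOI target, and tightening `rtol` from 1e-3 to 1e-4 buys roughly one extra significant digit in the solved scale. A toy with a hypothetical linear response in place of the Tax-Calculator run:

```python
from scipy.optimize import bisect

QBID_TOTAL = 205.8  # $B target, as in qbi.py

def deduction_deviation(scale):
    # Hypothetical linear response standing in for the Tax-Calculator run.
    return 1030.0 * scale - QBID_TOTAL

scale = bisect(deduction_deviation, 0.1, 0.5, rtol=0.0001)
print(round(scale, 5))  # ~0.19981 under this made-up response
```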