Merge pull request #112 from rickecon/ntbkrun

Update file calls in calibration modules
PSLmodels · Jun 4, 2024 · 29acbee · 29acbee
2 parents 60f204f + 379ebdf
commit 29acbee
Show file tree

Hide file tree

Showing 14 changed files with 200 additions and 109,309 deletions.
diff --git a/.gitignore b/.gitignore
@@ -24,7 +24,6 @@ htmlcov/*
 *.asv
 *.nav
 *.snm
-*.gz
 *.bib.bak
 *.fls
 *.m~
@@ -47,6 +46,7 @@ examples/OG-USA-Example/*
 cs-config/cs_config/OUTPUT_BASELINE/*
 data/csv_output_files/*
 data/images/*
+data/PSID/psid_lifetime_income.csv
 ogusa/csv_output_files/*
 ogusa/images/*
 .vscode/

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+
+## [0.1.8] - 2024-05-20 12:00:00
+
+### Added
+
+- Updates the `ogusa` package to include the zipped `psid_lifetime_income.csv.gz` file, which is now called in some calibration modules (`bequest_transmission.py`, `deterministic_profiles.py`, and `transfer_distribution.py`), but with an option for the user to provide their own custom data file. These changes allow Jupyter notebook users to execute the `Calibration` class object and ensure that those who install the `ogusa` package from PyPI have the data file required by the major calibration modules.
+
+
 ## [0.1.7] - 2024-05-14 16:30:00
 
 ### Added
@@ -97,6 +105,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[0.1.8]: https://github.com/PSLmodels/OG-USA/compare/v0.1.7...v0.1.8
 [0.1.7]: https://github.com/PSLmodels/OG-USA/compare/v0.1.6...v0.1.7
 [0.1.6]: https://github.com/PSLmodels/OG-USA/compare/v0.1.5...v0.1.6
 [0.1.5]: https://github.com/PSLmodels/OG-USA/compare/v0.1.4...v0.1.5

diff --git a/data/PSID/psid_lifetime_income.csv b/data/PSID/psid_lifetime_income.csv
diff --git a/ogusa/bequest_transmission.py b/ogusa/bequest_transmission.py
@@ -3,42 +3,40 @@
 import matplotlib.pyplot as plt
 import os
 from ogusa.utils import MVKDE
-
-CURDIR = os.path.split(os.path.abspath(__file__))[0]
+from ogusa.constants import CODE_PATH
 
 
 def get_bequest_matrix(
     J=7,
     lambdas=np.array([0.25, 0.25, 0.2, 0.1, 0.1, 0.09, 0.01]),
-    graphs=False,
+    data_path=None,
+    output_path=None,
 ):
     """
     Returns S x J matrix representing the fraction of aggregate
     bequests that go to each household by age and lifetime income group.
 
+    Args:
+        J (int): number of lifetime income groups
+        lambdas (Numpy array): length J array of lifetime income group
+            proportions
+        data_path (str): path to PSID data
+        output_path (str): path to save output plots and data
+
+    Returns:
+        kde_matrix (Numpy array): SxJ shaped array that represents the
+            smoothed distribution of proportions going to each (s,j)
+
     """
-    # Create directory if output directory does not already exist
-    CURDIR = os.path.split(os.path.abspath(__file__))[0]
-    output_fldr = "csv_output_files"
-    output_dir = os.path.join(CURDIR, "..", "data", output_fldr)
-    if not os.access(output_dir, os.F_OK):
-        os.makedirs(output_dir)
-    image_fldr = "images"
-    image_dir = os.path.join(CURDIR, "..", "data", image_fldr)
-    if not os.access(image_dir, os.F_OK):
-        os.makedirs(image_dir)
-
-    # Define a lambda function to compute the weighted mean:
-    # wm = lambda x: np.average(
-    #     x, weights=df.loc[x.index, "fam_smpl_wgt_core"])
-
-    # Read in dataframe of PSID data
-    # df = ogcore.utils.safe_read_pickle(
-    #     os.path.join(CURDIR, "data", "PSID", "psid_lifetime_income.pkl")
-    # )
-    df = pd.read_csv(
-        os.path.join(CURDIR, "..", "data", "PSID", "psid_lifetime_income.csv")
-    )
+    # Read in PSID data
+    if data_path is None:
+        # Read data file shipped with OG-USA package
+        df = pd.read_csv(
+            os.path.join(CODE_PATH, "psid_lifetime_income.csv.gz")
+        )
+    else:
+        # This is the case when running this from a branch of the OG-USA repo
+        df = pd.read_csv(data_path)
 
     # Do some tabs with data file...
     # 'net_wealth', 'inheritance', 'value_inheritance_1st',
@@ -54,12 +52,15 @@ def get_bequest_matrix(
     )
     # print(df[['sum_inherit', 'inheritance']].describe())
 
-    if graphs:
+    if output_path is not None:
+        # Create plot path directory if it doesn't already exist
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
         # Total inheritances by year
         df.groupby("year_data").mean(numeric_only=True).plot(y="inheritance")
-        plt.savefig(os.path.join(image_dir, "inheritance_year.png"))
+        plt.savefig(os.path.join(output_path, "inheritance_year.png"))
         df.groupby("year_data").mean(numeric_only=True).plot(y="sum_inherit")
-        plt.savefig(os.path.join(image_dir, "sum_inherit_year.png"))
+        plt.savefig(os.path.join(output_path, "sum_inherit_year.png"))
         # note that summing up inheritances gives a much larger value than
         # taking the inheritance variable
 
@@ -68,22 +69,22 @@ def get_bequest_matrix(
         df[df["year_data"] >= 1988].groupby("age").mean(
             numeric_only=True
         ).plot(y="net_wealth")
-        plt.savefig(os.path.join(image_dir, "net_wealth_age.png"))
+        plt.savefig(os.path.join(output_path, "net_wealth_age.png"))
         df[df["year_data"] >= 1988].groupby("age").mean(
             numeric_only=True
         ).plot(y="inheritance")
-        plt.savefig(os.path.join(image_dir, "inheritance_age.png"))
+        plt.savefig(os.path.join(output_path, "inheritance_age.png"))
 
         # Inheritances by lifetime income group
         # bar plot
         df[df["year_data"] >= 1988].groupby("li_group").mean(
             numeric_only=True
         ).plot.bar(y="net_wealth")
-        plt.savefig(os.path.join(image_dir, "net_wealth_li.png"))
+        plt.savefig(os.path.join(output_path, "net_wealth_li.png"))
         df[df["year_data"] >= 1988].groupby("li_group").mean(
             numeric_only=True
         ).plot.bar(y="inheritance")
-        plt.savefig(os.path.join(image_dir, "inheritance_li.png"))
+        plt.savefig(os.path.join(output_path, "inheritance_li.png"))
 
         # lifecycle plots with line for each ability type
         pd.pivot_table(
@@ -93,15 +94,15 @@ def get_bequest_matrix(
             columns="li_group",
             aggfunc="mean",
         ).plot(legend=True)
-        plt.savefig(os.path.join(image_dir, "net_wealth_age_li.png"))
+        plt.savefig(os.path.join(output_path, "net_wealth_age_li.png"))
         pd.pivot_table(
             df[df["year_data"] >= 1988],
             values="inheritance",
             index="age",
             columns="li_group",
             aggfunc="mean",
         ).plot(legend=True)
-        plt.savefig(os.path.join(image_dir, "inheritance_age_li.png"))
+        plt.savefig(os.path.join(output_path, "inheritance_age_li.png"))
 
     # Matrix Fraction of inheritances in a year by age and lifetime_inc
     inheritance_matrix = pd.pivot_table(
@@ -114,16 +115,18 @@ def get_bequest_matrix(
     # replace NaN with zero
     inheritance_matrix.fillna(value=0, inplace=True)
     inheritance_matrix = inheritance_matrix / inheritance_matrix.sum().sum()
-    # inheritance_matrix.to_csv(os.path.join(
-    #     output_dir, 'bequest_matrix.csv'))
 
     # estimate kernel density of bequests
+    if output_path is not None:
+        filename = os.path.join(output_path, "inheritance_kde.png")
+    else:
+        filename = None
     kde_matrix = MVKDE(
         80,
         7,
         inheritance_matrix.to_numpy(),
-        filename=os.path.join(image_dir, "inheritance_kde.png"),
-        plot=graphs,
+        filename=filename,
+        plot=(output_path is not None),
         bandwidth=0.5,
     )
 
@@ -139,10 +142,11 @@ def get_bequest_matrix(
         )
         kde_matrix = kde_matrix_new
 
-    np.savetxt(
-        os.path.join(output_dir, "bequest_matrix_kde.csv"),
-        kde_matrix,
-        delimiter=",",
-    )
+    if output_path is not None:
+        np.savetxt(
+            os.path.join(output_path, "bequest_matrix_kde.csv"),
+            kde_matrix,
+            delimiter=",",
+        )
 
     return kde_matrix
diff --git a/ogusa/calibrate.py b/ogusa/calibrate.py
@@ -1,6 +1,6 @@
 from ogusa import estimate_beta_j, bequest_transmission
 from ogusa import macro_params, transfer_distribution, income
-from ogusa import get_micro_data, psid_data_setup
+from ogusa import get_micro_data
 import os
 import numpy as np
 from ogcore import txfunc, demographics
@@ -25,6 +25,8 @@ def __init__(
         data="cps",
         client=None,
         num_workers=1,
+        demographic_data_path=None,
+        output_path=None,
     ):
         """
         Constructor for the Calibration class.  This class is used to find
@@ -43,10 +45,15 @@ def __init__(
             data (str): data source for microsimulation model
             client (Dask client object): client
             num_workers (int): number of workers for Dask client
+            output_path (str): path to save output to
 
         Returns:
             Calibration class object instance
         """
+        # Create output_path if it doesn't exist
+        if output_path is not None:
+            if not os.path.exists(output_path):
+                os.makedirs(output_path)
         self.estimate_tax_functions = estimate_tax_functions
         self.estimate_beta = estimate_beta
         self.estimate_chi_n = estimate_chi_n
@@ -76,10 +83,14 @@ def __init__(
         self.macro_params = macro_params.get_macro_params()
 
         # eta estimation
-        self.eta = transfer_distribution.get_transfer_matrix(p.J, p.lambdas)
+        self.eta = transfer_distribution.get_transfer_matrix(
+            p.J, p.lambdas, output_path=output_path
+        )
 
         # zeta estimation
-        self.zeta = bequest_transmission.get_bequest_matrix(p.J, p.lambdas)
+        self.zeta = bequest_transmission.get_bequest_matrix(
+            p.J, p.lambdas, output_path=output_path
+        )
 
         # demographics
         if estimate_pop:
@@ -92,6 +103,7 @@ def __init__(
                 initial_data_year=p.start_year - 1,
                 final_data_year=p.start_year,
                 GraphDiag=False,
+                download_path=demographic_data_path,
             )
 
             # demographics for 80 period lives (needed for getting e below)
@@ -112,15 +124,15 @@ def __init__(
                 self.demographic_params["omega_SS"],
                 demog80["omega_SS"],
                 p.lambdas,
-                plot=False,
+                plot_path=output_path,
             )
         else:
             self.e = income.get_e_interp(
                 p.S,
                 p.omega_SS,
                 p.omega_SS,
                 p.lambdas,
-                plot=False,
+                plot_path=output_path,
             )
 
     # Tax Functions

diff --git a/ogusa/constants.py b/ogusa/constants.py
@@ -1,10 +1,13 @@
 import taxcalc
+import os
 
 SHOW_RUNTIME = False  # Flag to display RuntimeWarnings when run model
 
 REFORM_DIR = "OUTPUT_REFORM"
 BASELINE_DIR = "OUTPUT_BASELINE"
 
+CODE_PATH = os.path.abspath(os.path.dirname(__file__))
+
 # Default year for model runs
 DEFAULT_START_YEAR = 2021
 # Tax-Calculator start year