Merge pull request #96 from PSLmodels/docs-and-pencon

Improve documentation and pension contributions
PSLmodels · Jun 19, 2024 · fa39ac6 · fa39ac6
2 parents 97e2661 + 3a8d97b
commit fa39ac6
Show file tree

Hide file tree

Showing 7 changed files with 334 additions and 26 deletions.
diff --git a/Makefile b/Makefile
@@ -18,3 +18,6 @@ data: install flat-file test
 
 documentation:
 	jb build docs/book
+
+reweighting-visualisation:
+	tensorboard --logdir=tax_microdata_benchmarking/storage/output/reweighting
diff --git a/docs/book/uprating.ipynb b/docs/book/uprating.ipynb
@@ -0,0 +1,242 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Uprating\n",
+    "\n",
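+    "As part of the data generation process, we uprate the 2015 IRS Public Use File (PUF) to match 2021 IRS Statistics of Income (SOI) totals. The table below lists every variable that is uprated *directly* from SOI aggregates, with its 2015 and 2021 totals (in $bn) and the percentage growth between them."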
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>E18500</th>\n",
+       "      <th>E19200</th>\n",
+       "      <th>E26270</th>\n",
+       "      <th>E26270</th>\n",
+       "      <th>E18400</th>\n",
+       "      <th>E18400</th>\n",
+       "      <th>E17500</th>\n",
+       "      <th>E00400</th>\n",
+       "      <th>E00300</th>\n",
+       "      <th>E19800</th>\n",
+       "      <th>E01700</th>\n",
+       "      <th>E00200</th>\n",
+       "      <th>E01500</th>\n",
+       "      <th>E02400</th>\n",
+       "      <th>E00650</th>\n",
+       "      <th>E00600</th>\n",
+       "      <th>E02500</th>\n",
+       "      <th>E01400</th>\n",
+       "      <th>E00900</th>\n",
+       "      <th>E00900</th>\n",
+       "      <th>E01100</th>\n",
+       "      <th>E01000</th>\n",
+       "      <th>E01000</th>\n",
+       "      <th>E02300</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2015 total ($bn)</th>\n",
+       "      <td>188.9</td>\n",
+       "      <td>306.6</td>\n",
+       "      <td>633.1</td>\n",
+       "      <td>633.1</td>\n",
+       "      <td>352.8</td>\n",
+       "      <td>352.8</td>\n",
+       "      <td>132.1</td>\n",
+       "      <td>61.6</td>\n",
+       "      <td>97.8</td>\n",
+       "      <td>163.5</td>\n",
+       "      <td>693.0</td>\n",
+       "      <td>7156.3</td>\n",
+       "      <td>1178.9</td>\n",
+       "      <td>604.8</td>\n",
+       "      <td>204.0</td>\n",
+       "      <td>260.9</td>\n",
+       "      <td>277.0</td>\n",
+       "      <td>251.8</td>\n",
+       "      <td>332.4</td>\n",
+       "      <td>332.4</td>\n",
+       "      <td>11.8</td>\n",
+       "      <td>701.4</td>\n",
+       "      <td>701.4</td>\n",
+       "      <td>26.7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2021 total ($bn)</th>\n",
+       "      <td>100.1</td>\n",
+       "      <td>164.4</td>\n",
+       "      <td>419.9</td>\n",
+       "      <td>419.9</td>\n",
+       "      <td>254.2</td>\n",
+       "      <td>254.2</td>\n",
+       "      <td>100.6</td>\n",
+       "      <td>55.3</td>\n",
+       "      <td>105.7</td>\n",
+       "      <td>194.0</td>\n",
+       "      <td>861.8</td>\n",
+       "      <td>9078.3</td>\n",
+       "      <td>1519.6</td>\n",
+       "      <td>790.7</td>\n",
+       "      <td>297.1</td>\n",
+       "      <td>388.0</td>\n",
+       "      <td>412.2</td>\n",
+       "      <td>406.1</td>\n",
+       "      <td>560.2</td>\n",
+       "      <td>560.2</td>\n",
+       "      <td>24.3</td>\n",
+       "      <td>2051.5</td>\n",
+       "      <td>2051.5</td>\n",
+       "      <td>204.6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Growth (%)</th>\n",
+       "      <td>-47.0</td>\n",
+       "      <td>-46.4</td>\n",
+       "      <td>-33.7</td>\n",
+       "      <td>-33.7</td>\n",
+       "      <td>-28.0</td>\n",
+       "      <td>-28.0</td>\n",
+       "      <td>-23.9</td>\n",
+       "      <td>-10.3</td>\n",
+       "      <td>8.0</td>\n",
+       "      <td>18.7</td>\n",
+       "      <td>24.4</td>\n",
+       "      <td>26.9</td>\n",
+       "      <td>28.9</td>\n",
+       "      <td>30.7</td>\n",
+       "      <td>45.6</td>\n",
+       "      <td>48.7</td>\n",
+       "      <td>48.8</td>\n",
+       "      <td>61.3</td>\n",
+       "      <td>68.5</td>\n",
+       "      <td>68.5</td>\n",
+       "      <td>106.6</td>\n",
+       "      <td>192.5</td>\n",
+       "      <td>192.5</td>\n",
+       "      <td>667.2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  E18500  E19200  E26270  E26270  E18400  E18400  E17500  \\\n",
+       "2015 total ($bn)   188.9   306.6   633.1   633.1   352.8   352.8   132.1   \n",
+       "2021 total ($bn)   100.1   164.4   419.9   419.9   254.2   254.2   100.6   \n",
+       "Growth (%)         -47.0   -46.4   -33.7   -33.7   -28.0   -28.0   -23.9   \n",
+       "\n",
+       "                  E00400  E00300  E19800  E01700  E00200  E01500  E02400  \\\n",
+       "2015 total ($bn)    61.6    97.8   163.5   693.0  7156.3  1178.9   604.8   \n",
+       "2021 total ($bn)    55.3   105.7   194.0   861.8  9078.3  1519.6   790.7   \n",
+       "Growth (%)         -10.3     8.0    18.7    24.4    26.9    28.9    30.7   \n",
+       "\n",
+       "                  E00650  E00600  E02500  E01400  E00900  E00900  E01100  \\\n",
+       "2015 total ($bn)   204.0   260.9   277.0   251.8   332.4   332.4    11.8   \n",
+       "2021 total ($bn)   297.1   388.0   412.2   406.1   560.2   560.2    24.3   \n",
+       "Growth (%)          45.6    48.7    48.8    61.3    68.5    68.5   106.6   \n",
+       "\n",
+       "                  E01000  E01000  E02300  \n",
+       "2015 total ($bn)   701.4   701.4    26.7  \n",
+       "2021 total ($bn)  2051.5  2051.5   204.6  \n",
+       "Growth (%)         192.5   192.5   667.2  "
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from tax_microdata_benchmarking.storage import STORAGE_FOLDER\n",
+    "from microdf import MicroDataFrame\n",
+    "from tax_microdata_benchmarking.datasets.uprate_puf import (\n",
+    "    SOI_TO_PUF_STRAIGHT_RENAMES,\n",
+    "    SOI_TO_PUF_NEG_ONLY_RENAMES,\n",
+    "    SOI_TO_PUF_POS_ONLY_RENAMES,\n",
+    ")\n",
+    "\n",
+    "# Show every column of the (wide) uprating table instead of truncating it.\n",
+    "pd.set_option(\"display.max_columns\", None)\n",
+    "\n",
+    "puf_2015 = pd.read_csv(STORAGE_FOLDER / \"input\" / \"puf_2015.csv\")\n",
+    "puf_2021 = pd.read_csv(STORAGE_FOLDER / \"output\" / \"puf_2021.csv\")\n",
+    "\n",
+    "# S006 is rescaled by 1/100 before being used as the record weight\n",
+    "# (presumably it is stored in hundredths -- TODO confirm for both files).\n",
+    "puf_2015.S006 /= 100\n",
+    "puf_2021.S006 /= 100\n",
+    "puf_2015 = MicroDataFrame(puf_2015, weights=\"S006\")\n",
+    "puf_2021 = MicroDataFrame(puf_2021, weights=\"S006\")\n",
+    "\n",
+    "# Compute each set of totals once and reuse it for every row of the table.\n",
+    "totals_2015 = puf_2015.sum()\n",
+    "totals_2021 = puf_2021.sum()\n",
+    "\n",
+    "uprating_df = pd.DataFrame()\n",
+    "uprating_df[\"2015 total ($bn)\"] = (totals_2015 / 1e9).round(1)\n",
+    "uprating_df[\"2021 total ($bn)\"] = (totals_2021 / 1e9).round(1)\n",
+    "uprating_df[\"Growth (%)\"] = ((totals_2021 / totals_2015 - 1) * 100).round(1)\n",
+    "\n",
+    "# The rename maps can yield the same PUF variable more than once, which would\n",
+    "# select (and display) that column repeatedly. dict.fromkeys drops the repeats\n",
+    "# while keeping first-seen order, so each variable gets exactly one column.\n",
+    "UPRATED_DIRECTLY_FROM_SOI = list(\n",
+    "    dict.fromkeys(\n",
+    "        list(SOI_TO_PUF_STRAIGHT_RENAMES.values())\n",
+    "        + list(SOI_TO_PUF_NEG_ONLY_RENAMES.values())\n",
+    "        + list(SOI_TO_PUF_POS_ONLY_RENAMES.values())\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "uprating_df.T[UPRATED_DIRECTLY_FROM_SOI].T.sort_values(\"Growth (%)\").T"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/book/validation.ipynb b/docs/book/validation.ipynb
@@ -127,9 +127,16 @@
     "puf_2015 = pd.read_csv(INPUTS / \"puf_2015.csv\")\n",
     "tc_puf_2015 = pd.read_csv(OUTPUTS / \"tc_puf_2015.csv\")\n",
     "\n",
-    "soi_from_puf_2015 = compare_soi_replication_to_soi(puf_to_soi(puf_2015, 2015), 2015)\n",
-    "soi_from_pe_puf_2015 = compare_soi_replication_to_soi(pe_to_soi(PUF_2015, 2015), 2015)\n",
-    "soi_from_tc_puf_2015 = compare_soi_replication_to_soi(tc_to_soi(tc_puf_2015, 2015), 2015)\n",
+    "soi_from_puf_2015 = compare_soi_replication_to_soi(\n",
+    "    puf_to_soi(puf_2015, 2015), 2015\n",
+    ")\n",
+    "soi_from_pe_puf_2015 = compare_soi_replication_to_soi(\n",
+    "    pe_to_soi(PUF_2015, 2015), 2015\n",
+    ")\n",
+    "soi_from_tc_puf_2015 = compare_soi_replication_to_soi(\n",
+    "    tc_to_soi(tc_puf_2015, 2015), 2015\n",
+    ")\n",
+    "\n",
     "\n",
     "def soi_statistic_passes_quality_test(df):\n",
     "    # Relative error lower than this => OK\n",
@@ -141,22 +148,37 @@
     "    # Absolute error lower than this for aggregates => OK\n",
     "    AGGREGATE_ABSOLUTE_ERROR_THRESHOLD = 1e9\n",
     "\n",
-    "    relative_error_ok = df[\"Absolute relative error\"] < RELATIVE_ERROR_THRESHOLD\n",
-    "    absolute_error_threshold = np.where(df.Count, COUNT_ABSOLUTE_ERROR_THRESHOLD, AGGREGATE_ABSOLUTE_ERROR_THRESHOLD)\n",
+    "    relative_error_ok = (\n",
+    "        df[\"Absolute relative error\"] < RELATIVE_ERROR_THRESHOLD\n",
+    "    )\n",
+    "    absolute_error_threshold = np.where(\n",
+    "        df.Count,\n",
+    "        COUNT_ABSOLUTE_ERROR_THRESHOLD,\n",
+    "        AGGREGATE_ABSOLUTE_ERROR_THRESHOLD,\n",
+    "    )\n",
     "    absolute_error_ok = df[\"Absolute error\"] < absolute_error_threshold\n",
     "\n",
     "    return relative_error_ok | absolute_error_ok\n",
     "\n",
+    "\n",
     "# 2021 datasets\n",
     "\n",
     "puf_2021 = pd.read_csv(OUTPUTS / \"puf_2021.csv\")\n",
     "tc_puf_2021 = pd.read_csv(OUTPUTS / \"tc_puf_2021.csv\")\n",
     "tmd_2021 = pd.read_csv(OUTPUTS / \"tmd_2021.csv\")\n",
     "\n",
-    "soi_from_puf_2021 = compare_soi_replication_to_soi(puf_to_soi(puf_2021, 2021), 2021)\n",
-    "soi_from_pe_puf_2021 = compare_soi_replication_to_soi(pe_to_soi(PUF_2021, 2021), 2021)\n",
-    "soi_from_tc_puf_2021 = compare_soi_replication_to_soi(tc_to_soi(tc_puf_2021, 2021), 2021)\n",
-    "soi_from_tmd_2021 = compare_soi_replication_to_soi(tc_to_soi(tmd_2021, 2021), 2021)\n",
+    "soi_from_puf_2021 = compare_soi_replication_to_soi(\n",
+    "    puf_to_soi(puf_2021, 2021), 2021\n",
+    ")\n",
+    "soi_from_pe_puf_2021 = compare_soi_replication_to_soi(\n",
+    "    pe_to_soi(PUF_2021, 2021), 2021\n",
+    ")\n",
+    "soi_from_tc_puf_2021 = compare_soi_replication_to_soi(\n",
+    "    tc_to_soi(tc_puf_2021, 2021), 2021\n",
+    ")\n",
+    "soi_from_tmd_2021 = compare_soi_replication_to_soi(\n",
+    "    tc_to_soi(tmd_2021, 2021), 2021\n",
+    ")\n",
     "\n",
     "dataset_soi_comparisons = [\n",
     "    soi_from_puf_2015,\n",
@@ -165,7 +187,7 @@
     "    soi_from_puf_2021,\n",
     "    soi_from_pe_puf_2021,\n",
     "    soi_from_tc_puf_2021,\n",
-    "    soi_from_tmd_2021\n",
+    "    soi_from_tmd_2021,\n",
     "]\n",
     "\n",
     "for dataset in dataset_soi_comparisons:\n",
@@ -178,15 +200,19 @@
     "    \"PUF (2021)\",\n",
     "    \"PE PUF (2021)\",\n",
     "    \"TC PUF (2021)\",\n",
-    "    \"TMD (2021)\"\n",
+    "    \"TMD (2021)\",\n",
     "]\n",
     "\n",
-    "comparison_df = pd.DataFrame({\n",
-    "    \"Dataset\": dataset_names,\n",
-    "    \"SOI match score\": [(df[\"OK\"].mean() * 100).round(1) for df in dataset_soi_comparisons]\n",
-    "})\n",
+    "comparison_df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"Dataset\": dataset_names,\n",
+    "        \"SOI match score\": [\n",
+    "            (df[\"OK\"].mean() * 100).round(1) for df in dataset_soi_comparisons\n",
+    "        ],\n",
+    "    }\n",
+    ")\n",
     "\n",
-    "comparison_df\n"
+    "comparison_df"
    ]
   },
   {
@@ -675,8 +701,15 @@
    ],
    "source": [
     "score_by_dataset = pd.DataFrame(\n",
-    "    {dataset_name: (dataset.groupby(\"Variable\").OK.mean() * 100).round(1) for dataset_name, dataset in zip(dataset_names, dataset_soi_comparisons)}\n",
-    ").fillna(100) # Fillna because some variables aren't in the 2021 SOI releases.\n",
+    "    {\n",
+    "        dataset_name: (dataset.groupby(\"Variable\").OK.mean() * 100).round(1)\n",
+    "        for dataset_name, dataset in zip(\n",
+    "            dataset_names, dataset_soi_comparisons\n",
+    "        )\n",
+    "    }\n",
+    ").fillna(\n",
+    "    100\n",
+    ")  # Fillna because some variables aren't in the 2021 SOI releases.\n",
     "score_by_dataset.sort_values(\"TMD (2021)\")"
    ]
   }