Skip to content

Commit

Permalink
Merge pull request #96 from PSLmodels/docs-and-pencon
Browse files Browse the repository at this point in the history
Improve documentation and pension contributions
  • Loading branch information
nikhilwoodruff authored Jun 19, 2024
2 parents 97e2661 + 3a8d97b commit fa39ac6
Show file tree
Hide file tree
Showing 7 changed files with 334 additions and 26 deletions.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,6 @@ data: install flat-file test

# Build the documentation book located in docs/book with the `jb` CLI.
documentation:
jb build docs/book

# Launch TensorBoard pointed at the reweighting output directory.
reweighting-visualisation:
tensorboard --logdir=tax_microdata_benchmarking/storage/output/reweighting
242 changes: 242 additions & 0 deletions docs/book/uprating.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Uprating\n",
"\n",
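"As part of the data generation process, we uprate the 2015 PUF to match 2021 SOI statistics. The table below lists every variable that is uprated *directly* from SOI aggregates, together with its 2015 and 2021 totals (in billions of dollars) and the percentage growth between them."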
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>E18500</th>\n",
" <th>E19200</th>\n",
" <th>E26270</th>\n",
" <th>E26270</th>\n",
" <th>E18400</th>\n",
" <th>E18400</th>\n",
" <th>E17500</th>\n",
" <th>E00400</th>\n",
" <th>E00300</th>\n",
" <th>E19800</th>\n",
" <th>E01700</th>\n",
" <th>E00200</th>\n",
" <th>E01500</th>\n",
" <th>E02400</th>\n",
" <th>E00650</th>\n",
" <th>E00600</th>\n",
" <th>E02500</th>\n",
" <th>E01400</th>\n",
" <th>E00900</th>\n",
" <th>E00900</th>\n",
" <th>E01100</th>\n",
" <th>E01000</th>\n",
" <th>E01000</th>\n",
" <th>E02300</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2015 total ($bn)</th>\n",
" <td>188.9</td>\n",
" <td>306.6</td>\n",
" <td>633.1</td>\n",
" <td>633.1</td>\n",
" <td>352.8</td>\n",
" <td>352.8</td>\n",
" <td>132.1</td>\n",
" <td>61.6</td>\n",
" <td>97.8</td>\n",
" <td>163.5</td>\n",
" <td>693.0</td>\n",
" <td>7156.3</td>\n",
" <td>1178.9</td>\n",
" <td>604.8</td>\n",
" <td>204.0</td>\n",
" <td>260.9</td>\n",
" <td>277.0</td>\n",
" <td>251.8</td>\n",
" <td>332.4</td>\n",
" <td>332.4</td>\n",
" <td>11.8</td>\n",
" <td>701.4</td>\n",
" <td>701.4</td>\n",
" <td>26.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2021 total ($bn)</th>\n",
" <td>100.1</td>\n",
" <td>164.4</td>\n",
" <td>419.9</td>\n",
" <td>419.9</td>\n",
" <td>254.2</td>\n",
" <td>254.2</td>\n",
" <td>100.6</td>\n",
" <td>55.3</td>\n",
" <td>105.7</td>\n",
" <td>194.0</td>\n",
" <td>861.8</td>\n",
" <td>9078.3</td>\n",
" <td>1519.6</td>\n",
" <td>790.7</td>\n",
" <td>297.1</td>\n",
" <td>388.0</td>\n",
" <td>412.2</td>\n",
" <td>406.1</td>\n",
" <td>560.2</td>\n",
" <td>560.2</td>\n",
" <td>24.3</td>\n",
" <td>2051.5</td>\n",
" <td>2051.5</td>\n",
" <td>204.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Growth (%)</th>\n",
" <td>-47.0</td>\n",
" <td>-46.4</td>\n",
" <td>-33.7</td>\n",
" <td>-33.7</td>\n",
" <td>-28.0</td>\n",
" <td>-28.0</td>\n",
" <td>-23.9</td>\n",
" <td>-10.3</td>\n",
" <td>8.0</td>\n",
" <td>18.7</td>\n",
" <td>24.4</td>\n",
" <td>26.9</td>\n",
" <td>28.9</td>\n",
" <td>30.7</td>\n",
" <td>45.6</td>\n",
" <td>48.7</td>\n",
" <td>48.8</td>\n",
" <td>61.3</td>\n",
" <td>68.5</td>\n",
" <td>68.5</td>\n",
" <td>106.6</td>\n",
" <td>192.5</td>\n",
" <td>192.5</td>\n",
" <td>667.2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" E18500 E19200 E26270 E26270 E18400 E18400 E17500 \\\n",
"2015 total ($bn) 188.9 306.6 633.1 633.1 352.8 352.8 132.1 \n",
"2021 total ($bn) 100.1 164.4 419.9 419.9 254.2 254.2 100.6 \n",
"Growth (%) -47.0 -46.4 -33.7 -33.7 -28.0 -28.0 -23.9 \n",
"\n",
" E00400 E00300 E19800 E01700 E00200 E01500 E02400 \\\n",
"2015 total ($bn) 61.6 97.8 163.5 693.0 7156.3 1178.9 604.8 \n",
"2021 total ($bn) 55.3 105.7 194.0 861.8 9078.3 1519.6 790.7 \n",
"Growth (%) -10.3 8.0 18.7 24.4 26.9 28.9 30.7 \n",
"\n",
" E00650 E00600 E02500 E01400 E00900 E00900 E01100 \\\n",
"2015 total ($bn) 204.0 260.9 277.0 251.8 332.4 332.4 11.8 \n",
"2021 total ($bn) 297.1 388.0 412.2 406.1 560.2 560.2 24.3 \n",
"Growth (%) 45.6 48.7 48.8 61.3 68.5 68.5 106.6 \n",
"\n",
" E01000 E01000 E02300 \n",
"2015 total ($bn) 701.4 701.4 26.7 \n",
"2021 total ($bn) 2051.5 2051.5 204.6 \n",
"Growth (%) 192.5 192.5 667.2 "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from tax_microdata_benchmarking.storage import STORAGE_FOLDER\n",
"from microdf import MicroDataFrame\n",
"from tax_microdata_benchmarking.datasets.uprate_puf import (\n",
"    SOI_TO_PUF_STRAIGHT_RENAMES,\n",
"    SOI_TO_PUF_NEG_ONLY_RENAMES,\n",
"    SOI_TO_PUF_POS_ONLY_RENAMES,\n",
")\n",
"\n",
"# Show every column of the wide, transposed table instead of truncating it.\n",
"pd.set_option(\"display.max_columns\", None)\n",
"\n",
"puf_2015 = pd.read_csv(STORAGE_FOLDER / \"input\" / \"puf_2015.csv\")\n",
"puf_2021 = pd.read_csv(STORAGE_FOLDER / \"output\" / \"puf_2021.csv\")\n",
"\n",
"# Rescale the S006 column before using it as the record weight\n",
"# (the files appear to store weights in hundredths -- TODO confirm).\n",
"puf_2015.S006 /= 100\n",
"puf_2021.S006 /= 100\n",
"puf_2015 = MicroDataFrame(puf_2015, weights=\"S006\")\n",
"puf_2021 = MicroDataFrame(puf_2021, weights=\"S006\")\n",
"\n",
"# Column totals of the S006-weighted frames; computed once, reused below.\n",
"totals_2015 = puf_2015.sum()\n",
"totals_2021 = puf_2021.sum()\n",
"\n",
"uprating_df = pd.DataFrame()\n",
"uprating_df[\"2015 total ($bn)\"] = (totals_2015 / 1e9).round(1)\n",
"uprating_df[\"2021 total ($bn)\"] = (totals_2021 / 1e9).round(1)\n",
"uprating_df[\"Growth (%)\"] = ((totals_2021 / totals_2015 - 1) * 100).round(1)\n",
"\n",
"# The same PUF variable can be the target of more than one rename mapping\n",
"# (e.g. both the positive-only and the negative-only one). dict.fromkeys\n",
"# drops the repeats while keeping first-seen order, so every variable gets\n",
"# exactly one column in the table.\n",
"UPRATED_DIRECTLY_FROM_SOI = list(\n",
"    dict.fromkeys(\n",
"        [\n",
"            *SOI_TO_PUF_STRAIGHT_RENAMES.values(),\n",
"            *SOI_TO_PUF_NEG_ONLY_RENAMES.values(),\n",
"            *SOI_TO_PUF_POS_ONLY_RENAMES.values(),\n",
"        ]\n",
"    )\n",
")\n",
"\n",
"# Rows are PUF variables here: keep the directly uprated ones, order them by\n",
"# growth, then transpose so that each variable is displayed as a column.\n",
"uprating_df.loc[UPRATED_DIRECTLY_FROM_SOI].sort_values(\"Growth (%)\").T"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
69 changes: 51 additions & 18 deletions docs/book/validation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,16 @@
"puf_2015 = pd.read_csv(INPUTS / \"puf_2015.csv\")\n",
"tc_puf_2015 = pd.read_csv(OUTPUTS / \"tc_puf_2015.csv\")\n",
"\n",
"soi_from_puf_2015 = compare_soi_replication_to_soi(puf_to_soi(puf_2015, 2015), 2015)\n",
"soi_from_pe_puf_2015 = compare_soi_replication_to_soi(pe_to_soi(PUF_2015, 2015), 2015)\n",
"soi_from_tc_puf_2015 = compare_soi_replication_to_soi(tc_to_soi(tc_puf_2015, 2015), 2015)\n",
"soi_from_puf_2015 = compare_soi_replication_to_soi(\n",
" puf_to_soi(puf_2015, 2015), 2015\n",
")\n",
"soi_from_pe_puf_2015 = compare_soi_replication_to_soi(\n",
" pe_to_soi(PUF_2015, 2015), 2015\n",
")\n",
"soi_from_tc_puf_2015 = compare_soi_replication_to_soi(\n",
" tc_to_soi(tc_puf_2015, 2015), 2015\n",
")\n",
"\n",
"\n",
"def soi_statistic_passes_quality_test(df):\n",
" # Relative error lower than this => OK\n",
Expand All @@ -141,22 +148,37 @@
" # Absolute error lower than this for aggregates => OK\n",
" AGGREGATE_ABSOLUTE_ERROR_THRESHOLD = 1e9\n",
"\n",
" relative_error_ok = df[\"Absolute relative error\"] < RELATIVE_ERROR_THRESHOLD\n",
" absolute_error_threshold = np.where(df.Count, COUNT_ABSOLUTE_ERROR_THRESHOLD, AGGREGATE_ABSOLUTE_ERROR_THRESHOLD)\n",
" relative_error_ok = (\n",
" df[\"Absolute relative error\"] < RELATIVE_ERROR_THRESHOLD\n",
" )\n",
" absolute_error_threshold = np.where(\n",
" df.Count,\n",
" COUNT_ABSOLUTE_ERROR_THRESHOLD,\n",
" AGGREGATE_ABSOLUTE_ERROR_THRESHOLD,\n",
" )\n",
" absolute_error_ok = df[\"Absolute error\"] < absolute_error_threshold\n",
"\n",
" return relative_error_ok | absolute_error_ok\n",
"\n",
"\n",
"# 2021 datasets\n",
"\n",
"puf_2021 = pd.read_csv(OUTPUTS / \"puf_2021.csv\")\n",
"tc_puf_2021 = pd.read_csv(OUTPUTS / \"tc_puf_2021.csv\")\n",
"tmd_2021 = pd.read_csv(OUTPUTS / \"tmd_2021.csv\")\n",
"\n",
"soi_from_puf_2021 = compare_soi_replication_to_soi(puf_to_soi(puf_2021, 2021), 2021)\n",
"soi_from_pe_puf_2021 = compare_soi_replication_to_soi(pe_to_soi(PUF_2021, 2021), 2021)\n",
"soi_from_tc_puf_2021 = compare_soi_replication_to_soi(tc_to_soi(tc_puf_2021, 2021), 2021)\n",
"soi_from_tmd_2021 = compare_soi_replication_to_soi(tc_to_soi(tmd_2021, 2021), 2021)\n",
"soi_from_puf_2021 = compare_soi_replication_to_soi(\n",
" puf_to_soi(puf_2021, 2021), 2021\n",
")\n",
"soi_from_pe_puf_2021 = compare_soi_replication_to_soi(\n",
" pe_to_soi(PUF_2021, 2021), 2021\n",
")\n",
"soi_from_tc_puf_2021 = compare_soi_replication_to_soi(\n",
" tc_to_soi(tc_puf_2021, 2021), 2021\n",
")\n",
"soi_from_tmd_2021 = compare_soi_replication_to_soi(\n",
" tc_to_soi(tmd_2021, 2021), 2021\n",
")\n",
"\n",
"dataset_soi_comparisons = [\n",
" soi_from_puf_2015,\n",
Expand All @@ -165,7 +187,7 @@
" soi_from_puf_2021,\n",
" soi_from_pe_puf_2021,\n",
" soi_from_tc_puf_2021,\n",
" soi_from_tmd_2021\n",
" soi_from_tmd_2021,\n",
"]\n",
"\n",
"for dataset in dataset_soi_comparisons:\n",
Expand All @@ -178,15 +200,19 @@
" \"PUF (2021)\",\n",
" \"PE PUF (2021)\",\n",
" \"TC PUF (2021)\",\n",
" \"TMD (2021)\"\n",
" \"TMD (2021)\",\n",
"]\n",
"\n",
"comparison_df = pd.DataFrame({\n",
" \"Dataset\": dataset_names,\n",
" \"SOI match score\": [(df[\"OK\"].mean() * 100).round(1) for df in dataset_soi_comparisons]\n",
"})\n",
"comparison_df = pd.DataFrame(\n",
" {\n",
" \"Dataset\": dataset_names,\n",
" \"SOI match score\": [\n",
" (df[\"OK\"].mean() * 100).round(1) for df in dataset_soi_comparisons\n",
" ],\n",
" }\n",
")\n",
"\n",
"comparison_df\n"
"comparison_df"
]
},
{
Expand Down Expand Up @@ -675,8 +701,15 @@
],
"source": [
"score_by_dataset = pd.DataFrame(\n",
" {dataset_name: (dataset.groupby(\"Variable\").OK.mean() * 100).round(1) for dataset_name, dataset in zip(dataset_names, dataset_soi_comparisons)}\n",
").fillna(100) # Fillna because some variables aren't in the 2021 SOI releases.\n",
" {\n",
" dataset_name: (dataset.groupby(\"Variable\").OK.mean() * 100).round(1)\n",
" for dataset_name, dataset in zip(\n",
" dataset_names, dataset_soi_comparisons\n",
" )\n",
" }\n",
").fillna(\n",
" 100\n",
") # Fillna because some variables aren't in the 2021 SOI releases.\n",
"score_by_dataset.sort_values(\"TMD (2021)\")"
]
}
Expand Down
Loading

0 comments on commit fa39ac6

Please sign in to comment.