diff --git a/docs/book/validation.ipynb b/docs/book/validation.ipynb index eebc95a1..ed673dc4 100644 --- a/docs/book/validation.ipynb +++ b/docs/book/validation.ipynb @@ -127,9 +127,16 @@ "puf_2015 = pd.read_csv(INPUTS / \"puf_2015.csv\")\n", "tc_puf_2015 = pd.read_csv(OUTPUTS / \"tc_puf_2015.csv\")\n", "\n", - "soi_from_puf_2015 = compare_soi_replication_to_soi(puf_to_soi(puf_2015, 2015), 2015)\n", - "soi_from_pe_puf_2015 = compare_soi_replication_to_soi(pe_to_soi(PUF_2015, 2015), 2015)\n", - "soi_from_tc_puf_2015 = compare_soi_replication_to_soi(tc_to_soi(tc_puf_2015, 2015), 2015)\n", + "soi_from_puf_2015 = compare_soi_replication_to_soi(\n", + " puf_to_soi(puf_2015, 2015), 2015\n", + ")\n", + "soi_from_pe_puf_2015 = compare_soi_replication_to_soi(\n", + " pe_to_soi(PUF_2015, 2015), 2015\n", + ")\n", + "soi_from_tc_puf_2015 = compare_soi_replication_to_soi(\n", + " tc_to_soi(tc_puf_2015, 2015), 2015\n", + ")\n", + "\n", "\n", "def soi_statistic_passes_quality_test(df):\n", " # Relative error lower than this => OK\n", @@ -141,22 +148,37 @@ " # Absolute error lower than this for aggregates => OK\n", " AGGREGATE_ABSOLUTE_ERROR_THRESHOLD = 1e9\n", "\n", - " relative_error_ok = df[\"Absolute relative error\"] < RELATIVE_ERROR_THRESHOLD\n", - " absolute_error_threshold = np.where(df.Count, COUNT_ABSOLUTE_ERROR_THRESHOLD, AGGREGATE_ABSOLUTE_ERROR_THRESHOLD)\n", + " relative_error_ok = (\n", + " df[\"Absolute relative error\"] < RELATIVE_ERROR_THRESHOLD\n", + " )\n", + " absolute_error_threshold = np.where(\n", + " df.Count,\n", + " COUNT_ABSOLUTE_ERROR_THRESHOLD,\n", + " AGGREGATE_ABSOLUTE_ERROR_THRESHOLD,\n", + " )\n", " absolute_error_ok = df[\"Absolute error\"] < absolute_error_threshold\n", "\n", " return relative_error_ok | absolute_error_ok\n", "\n", + "\n", "# 2021 datasets\n", "\n", "puf_2021 = pd.read_csv(OUTPUTS / \"puf_2021.csv\")\n", "tc_puf_2021 = pd.read_csv(OUTPUTS / \"tc_puf_2021.csv\")\n", "tmd_2021 = pd.read_csv(OUTPUTS / \"tmd_2021.csv\")\n", "\n", - "soi_from_puf_2021 = compare_soi_replication_to_soi(puf_to_soi(puf_2021, 2021), 2021)\n", - "soi_from_pe_puf_2021 = compare_soi_replication_to_soi(pe_to_soi(PUF_2021, 2021), 2021)\n", - "soi_from_tc_puf_2021 = compare_soi_replication_to_soi(tc_to_soi(tc_puf_2021, 2021), 2021)\n", - "soi_from_tmd_2021 = compare_soi_replication_to_soi(tc_to_soi(tmd_2021, 2021), 2021)\n", + "soi_from_puf_2021 = compare_soi_replication_to_soi(\n", + " puf_to_soi(puf_2021, 2021), 2021\n", + ")\n", + "soi_from_pe_puf_2021 = compare_soi_replication_to_soi(\n", + " pe_to_soi(PUF_2021, 2021), 2021\n", + ")\n", + "soi_from_tc_puf_2021 = compare_soi_replication_to_soi(\n", + " tc_to_soi(tc_puf_2021, 2021), 2021\n", + ")\n", + "soi_from_tmd_2021 = compare_soi_replication_to_soi(\n", + " tc_to_soi(tmd_2021, 2021), 2021\n", + ")\n", "\n", "dataset_soi_comparisons = [\n", " soi_from_puf_2015,\n", @@ -165,7 +187,7 @@ " soi_from_puf_2021,\n", " soi_from_pe_puf_2021,\n", " soi_from_tc_puf_2021,\n", - " soi_from_tmd_2021\n", + " soi_from_tmd_2021,\n", "]\n", "\n", "for dataset in dataset_soi_comparisons:\n", @@ -178,15 +200,19 @@ " \"PUF (2021)\",\n", " \"PE PUF (2021)\",\n", " \"TC PUF (2021)\",\n", - " \"TMD (2021)\"\n", + " \"TMD (2021)\",\n", "]\n", "\n", - "comparison_df = pd.DataFrame({\n", - " \"Dataset\": dataset_names,\n", - " \"SOI match score\": [(df[\"OK\"].mean() * 100).round(1) for df in dataset_soi_comparisons]\n", - "})\n", + "comparison_df = pd.DataFrame(\n", + " {\n", + " \"Dataset\": dataset_names,\n", + " \"SOI match score\": [\n", + " (df[\"OK\"].mean() * 100).round(1) for df in dataset_soi_comparisons\n", + " ],\n", + " }\n", + ")\n", "\n", - "comparison_df\n" + "comparison_df" ] }, { @@ -675,8 +701,15 @@ ], "source": [ "score_by_dataset = pd.DataFrame(\n", - " {dataset_name: (dataset.groupby(\"Variable\").OK.mean() * 100).round(1) for dataset_name, dataset in zip(dataset_names, dataset_soi_comparisons)}\n", - ").fillna(100) # Fillna because some variables aren't in the 2021 SOI releases.\n", + " {\n", + " dataset_name: (dataset.groupby(\"Variable\").OK.mean() * 100).round(1)\n", + " for dataset_name, dataset in zip(\n", + " dataset_names, dataset_soi_comparisons\n", + " )\n", + " }\n", + ").fillna(\n", + " 100\n", + ") # Fillna because some variables aren't in the 2021 SOI releases.\n", "score_by_dataset.sort_values(\"TMD (2021)\")" ] }