diff --git a/.gitignore b/.gitignore
index 9e85f611..5e5f852c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@
 tax_microdata_benchmarking/calibration
 
 *.ipynb
+tmp/
+tax_microdata_benchmarking/examination/taxcalculator/pe23-23*
diff --git a/README.md b/README.md
index f816d289..eec12598 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ development will proceed in several phases.
 
 To install, clone the repository and run `pip install -e .` from the
 root directory.  To check that the installation was successful, run
-`make test` or `pytest .` from the root directory.
+`make test` or `pytest .` from the root directory. This requires the `POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN` environment variable to be set; contact [Nikhil Woodruff](mailto:nikhil@policyengine.org) for the token if you have access to the IRS PUF.
 
 To assess, review the data examination results that compare federal
 agency tax estimates with those generated using the microdata file
diff --git a/tax_microdata_benchmarking/create_flat_file.py b/tax_microdata_benchmarking/create_flat_file.py
index acb830cf..4d8e8a24 100644
--- a/tax_microdata_benchmarking/create_flat_file.py
+++ b/tax_microdata_benchmarking/create_flat_file.py
@@ -720,6 +720,8 @@ def get_variable_uprating(
         str: The uprating factor.
     """
 
+    population = system.parameters.calibration.gov.census.populations.total
+
     calibration = system.parameters.calibration
     if variable in calibration.gov.irs.soi.children:
         parameter = calibration.gov.irs.soi.children[variable]
@@ -728,8 +730,12 @@ def get_variable_uprating(
     source_value = parameter(source_time_period)
     target_value = parameter(target_time_period)
 
+    population_change = population(target_time_period) / population(
+        source_time_period
+    )
+
     uprating_factor = target_value / source_value
-    return uprating_factor
+    return uprating_factor / population_change
 
 
 def assert_no_duplicate_columns(df):
@@ -780,7 +786,7 @@ def create_stacked_flat_file(
         )
         stacked_file["PT_binc_w2_wages"] = (
             qbi * 0.314  # Solved in 2021 using adjust_qbi.py
-        )  # Solved in 2021 using adjust_qbi.py
+        )
         input_data = tc.Records(data=stacked_file, start_year=target_year)
         policy = tc.Policy()
         simulation = tc.Calculator(records=input_data, policy=policy)
@@ -797,6 +803,7 @@ def create_stacked_flat_file(
             try:
                 from tax_microdata_benchmarking.reweight import reweight
 
+                combined_file["s006_original"] = combined_file.s006
                 combined_file = reweight(
                     combined_file, time_period=target_year
                 )
@@ -834,6 +841,12 @@ def summary_analytics(df):
 population = system.parameters.calibration.gov.census.populations.total
 
 
+def get_population_growth(target_year: int, source_year: int):
+    target_population = population(f"{target_year}-01-01")
+    source_population = population(f"{source_year}-01-01")
+    return target_population / source_population  # growth factor
+
+
 def create_all_files():
     PRIORITY_YEARS = [2021, 2023, 2026, 2015]
     REMAINING_YEARS = [
@@ -845,11 +858,12 @@ def create_all_files():
         if target_year == 2021:
             latest_weights = stacked_file.s006
         elif target_year > 2021:
-            population_uprating = population(
-                f"{target_year}-01-01"
-            ) / population("2021-01-01")
+            population_uprating = get_population_growth(target_year, 2021)
+            stacked_file["s006_original"] = stacked_file.s006
             stacked_file.s006 = latest_weights * population_uprating
             print(f"Using 2021 solved weights for {target_year}")
+        else:
+            stacked_file["s006_original"] = stacked_file.s006
         stacked_file.to_csv(
             f"tax_microdata_{target_year}.csv.gz",
             index=False,
diff --git a/tests/test_basic_flat_file.py b/tests/test_basic_flat_file.py
deleted file mode 100644
index 99486696..00000000
--- a/tests/test_basic_flat_file.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import os
-import pytest
-import yaml
-from pathlib import Path
-
-test_mode = os.environ.get("TEST_MODE", "lite")
-
-FOLDER = Path(__file__).parent
-with open(FOLDER / "tc_variable_totals.yaml") as f:
-    tc_variable_totals = yaml.safe_load(f)
-
-with open(
-    FOLDER.parent
-    / "tax_microdata_benchmarking"
-    / "taxcalc_variable_metadata.yaml"
-) as f:
-    taxcalc_variable_metadata = yaml.safe_load(f)
-
-EXEMPTED_VARIABLES = [
-    "DSI",  # Issue here but deprioritized.
-    "EIC",  # PUF-PE file almost certainly more correct by including CPS data
-    "MIDR",  # Issue here but deprioritized.
-    "RECID",  # No reason to compare.
-    "a_lineno",  # No reason to compare.
-    "agi_bin",  # No reason to compare.
-    "blind_spouse",  # Issue here but deprioritized.
-    "cmbtp",  # No reason to compare.
-    "data_source",  # No reason to compare.
-    "s006",  # No reason to compare.
-    "h_seq",  # No reason to compare.
-    "fips",  # No reason to compare.
-    "ffpos",  # No reason to compare.
-    "p23250",  # PE-PUF likely closer to truth than taxdata (needs triple check).
-    "e01200",  # Unknown but deprioritized for now.
-    "e17500",  # Unknown but deprioritized for now.
-    "e18500",  # Unknown but deprioritized for now.
-    "e02100",  # Farm income, unsure who's closer.
-]
-
-# Exempt any variable split between filer and spouse for now.
-EXEMPTED_VARIABLES += [
-    variable
-    for variable in taxcalc_variable_metadata["read"]
-    if variable.endswith("p") or variable.endswith("s")
-]
-
-
-def pytest_namespace():
-    return {"flat_file": None}
-
-
-@pytest.mark.dependency()
-def test_flat_file_builds():
-    from tax_microdata_benchmarking.create_flat_file import (
-        create_stacked_flat_file,
-    )
-
-    flat_file = create_stacked_flat_file(2021, reweight=test_mode == "full")
-
-    pytest.flat_file = flat_file
-
-
-variables_to_test = [
-    variable
-    for variable in tc_variable_totals.keys()
-    if variable not in EXEMPTED_VARIABLES
-]
-
-
-@pytest.mark.dependency(depends=["test_flat_file_builds"])
-@pytest.mark.parametrize("variable", variables_to_test)
-def test_tc_variable_totals(variable):
-    meta = taxcalc_variable_metadata["read"][variable]
-    name = meta.get("desc")
-    flat_file = pytest.flat_file
-    weight = flat_file.s006
-    total = (flat_file[variable] * weight).sum()
-    if tc_variable_totals[variable] == 0:
-        # If the taxdata file has a zero total, we'll assume the PE file is still correct.
-        return
-    # 20% and more than 10bn off taxdata is a failure.
-    assert (
-        abs(total / tc_variable_totals[variable] - 1) < 0.45
-        or abs(total / 1e9 - tc_variable_totals[variable] / 1e9) < 30
-    ), f"{variable} ({name}) differs to tax-data by {total / tc_variable_totals[variable] - 1:.1%} ({total/1e9:.1f}bn vs {tc_variable_totals[variable]/1e9:.1f}bn)"
diff --git a/tests/test_flat_file.py b/tests/test_flat_file.py
new file mode 100644
index 00000000..ee19c402
--- /dev/null
+++ b/tests/test_flat_file.py
@@ -0,0 +1,196 @@
+import os
+import pytest
+import yaml
+from pathlib import Path
+import pytest
+import pandas as pd
+import subprocess
+import warnings
+
+warnings.filterwarnings("ignore")  # ignore all warning categories
+
+test_mode = os.environ.get("TEST_MODE", "lite")  # "full" enables reweighting
+
+FOLDER = Path(__file__).parent  # directory containing this test module
+with open(FOLDER / "tc_variable_totals.yaml") as f:
+    tc_variable_totals = yaml.safe_load(f)  # taxdata total per variable
+
+with open(
+    FOLDER.parent
+    / "tax_microdata_benchmarking"
+    / "taxcalc_variable_metadata.yaml"
+) as f:
+    taxcalc_variable_metadata = yaml.safe_load(f)  # has a "read" section
+
+EXEMPTED_VARIABLES = [  # left out of the variable-total comparison
+    "DSI",  # Issue here but deprioritized.
+    "EIC",  # PUF-PE file almost certainly more correct by including CPS data
+    "MIDR",  # Issue here but deprioritized.
+    "RECID",  # No reason to compare.
+    "a_lineno",  # No reason to compare.
+    "agi_bin",  # No reason to compare.
+    "blind_spouse",  # Issue here but deprioritized.
+    "cmbtp",  # No reason to compare.
+    "data_source",  # No reason to compare.
+    "s006",  # No reason to compare.
+    "h_seq",  # No reason to compare.
+    "fips",  # No reason to compare.
+    "ffpos",  # No reason to compare.
+    "p23250",  # PE-PUF likely closer to truth than taxdata (needs triple check).
+    "e01200",  # Unknown but deprioritized for now.
+    "e17500",  # Unknown but deprioritized for now.
+    "e18500",  # Unknown but deprioritized for now.
+    "e02100",  # Farm income, unsure who's closer.
+]
+
+# Exempt filer/spouse split variables for now (names ending in "p" or "s").
+EXEMPTED_VARIABLES += [
+    variable
+    for variable in taxcalc_variable_metadata["read"]
+    if variable.endswith("p") or variable.endswith("s")
+]
+
+
+def pytest_namespace():
+    return {"flat_file": None}  # NOTE(review): tests set pytest.flat_file_2021
+
+
+@pytest.mark.dependency()
+def test_2021_flat_file_builds():
+    """Build the 2021 flat file and keep it on `pytest` for later tests."""
+    from tax_microdata_benchmarking.create_flat_file import (
+        create_stacked_flat_file,
+    )
+
+    full = test_mode == "full"
+    pytest.flat_file_2021 = create_stacked_flat_file(2021, reweight=full)
+
+
+# Variables with a taxdata total that have not been exempted above.
+# Iterating the mapping yields its keys, in the same order as .keys().
+variables_to_test = [
+    name for name in tc_variable_totals if name not in EXEMPTED_VARIABLES
+]
+
+
+@pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
+@pytest.mark.parametrize("variable", variables_to_test)
+def test_2021_tc_variable_totals(variable):
+    """Fail if a 2021 total is 45%+ and 30bn+ away from the taxdata total."""
+    name = taxcalc_variable_metadata["read"][variable].get("desc")
+    flat_file = pytest.flat_file_2021
+    weight = flat_file.s006
+    total = (flat_file[variable] * weight).sum()
+    if tc_variable_totals[variable] == 0:
+        # If the taxdata file has a zero total, we'll assume the PE file is still correct.
+        return
+    relative_gap = abs(total / tc_variable_totals[variable] - 1)
+    absolute_gap_bn = abs(total / 1e9 - tc_variable_totals[variable] / 1e9)
+    assert (
+        relative_gap < 0.45 or absolute_gap_bn < 30
+    ), f"{variable} ({name}) differs to tax-data by {total / tc_variable_totals[variable] - 1:.1%} ({total/1e9:.1f}bn vs {tc_variable_totals[variable]/1e9:.1f}bn)"
+
+
+FOLDER = Path(__file__).parent  # same value as the earlier assignment
+
+test_mode = os.environ.get("TEST_MODE", "lite")  # repeats earlier assignment
+
+RUN_TE_TESTS = False  # gates test_2023_tax_expenditures via skipif
+
+
+@pytest.mark.skipif(not RUN_TE_TESTS, reason="TE tests are disabled.")
+@pytest.mark.dependency(depends=["test_2021_flat_file_builds"])
+def test_2023_tax_expenditures():
+    """Check 2023 tax expenditure estimates against Tax-Data's figures.
+
+    Builds the 2023 flat file, overwrites its s006 weights with the 2021
+    weights scaled by population growth, writes it to the taxcalculator
+    examination folder, runs ``./runs.sh pe23 23`` there and parses the
+    resulting ``pe23-23.res-actual`` file. Every AllTax figure except QBID
+    and SALT must be within 25% of the hard-coded Tax-Data figure.
+
+    Skipped unless RUN_TE_TESTS is set to True in this module.
+    """
+    flat_file_2021 = pytest.flat_file_2021
+
+    from tax_microdata_benchmarking.create_flat_file import (
+        create_stacked_flat_file,
+        get_population_growth,
+    )
+
+    flat_file_2023 = create_stacked_flat_file(
+        2023, reweight=test_mode == "full"
+    )
+
+    flat_file_2023.s006 = flat_file_2021.s006 * get_population_growth(
+        2023, 2021
+    )
+
+    tc_folder = (
+        FOLDER.parent
+        / "tax_microdata_benchmarking"
+        / "examination"
+        / "taxcalculator"
+    )
+
+    flat_file_2023.to_csv(tc_folder / "pe23.csv.zip")
+
+    # Run ./runs.sh pe23 23 inside the taxcalculator folder; its results are
+    # read from pe23-23.res-actual below. check=True makes a failing script
+    # fail the test here rather than when the results file is opened.
+    subprocess.run(
+        ["./runs.sh", "pe23", "23"], cwd=tc_folder.resolve(), check=True
+    )
+
+    with open(tc_folder / "pe23-23.res-actual") as f:
+        data = f.read().splitlines()
+
+    # Ignore all warning categories from here on.
+    warnings.filterwarnings("ignore")
+
+    columns = ["Returns", "ExpInc", "IncTax", "PayTax", "LSTax", "AllTax"]
+    names = [
+        "Baseline",
+        "CGQD",
+        "CLP",
+        "CTC",
+        "EITC",
+        "NIIT",
+        "QBID",
+        "SALT",
+        "SSBEN",
+    ]
+
+    # Every third line, starting with the third, is a result row; its first
+    # whitespace-separated token is dropped. The frame is built in one call
+    # because DataFrame.append was removed in pandas 2.0.
+    rows = [line.split()[1:] for line in data[2::3]]
+    df = pd.DataFrame(rows, columns=columns, index=names).astype(float)
+
+    # Tax-Data's AllTax figures, in the same order as `names`.
+    taxdata_exp_results = [
+        3976.5,
+        274.5,
+        0.0,
+        125.6,
+        68.7,
+        -67.5,
+        59.5,
+        13.9,
+        76.6,
+    ]
+
+    for name, expected in zip(names, taxdata_exp_results):
+        if name in ("QBID", "SALT"):
+            continue  # QBID: PE far closer to truth. SALT: known issue.
+        actual = df.loc[name, "AllTax"]
+        if expected == 0:
+            # A relative error against a zero benchmark is undefined.
+            rel_error = 0
+        else:
+            # Divide by the magnitude: a negative benchmark (NIIT) would
+            # otherwise give a negative error that always passes.
+            rel_error = abs(actual - expected) / abs(expected)
+        assert (
+            rel_error < 0.25
+        ), f"Tax Expenditure for {name} is ${actual}bn compared to Tax-Data's ${expected}bn (relative error {rel_error:.1%})"