Skip to content

Commit

Permalink
refactor test data generation
Browse files Browse the repository at this point in the history
  • Loading branch information
wpbonelli committed Sep 13, 2023
1 parent a17937a commit ecd8c80
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 196 deletions.
1 change: 1 addition & 0 deletions environment_w_jupyter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ dependencies:
- black
- click != 8.1.0
- isort
- filelock
- flake8
- git+https://github.com/modflowpy/flopy.git
- jupyter_black
Expand Down
20 changes: 11 additions & 9 deletions test_data/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# test_data/
# Test data

This directory contains
The `test_data` directory contains
* domain directories for domain tests
* scripts/ which document how domain tests are established.

* `scripts/`, which contains scripts to run domain simulations and generate data.

## Domain directories

Expand All @@ -15,15 +14,18 @@ output to a disk faster than the one where the repository lives. However, we do
version control certain files in the directory. See the `conus_2yr/README.md` for details
on how it is set up.


Other domain directories can be run from pytest given the repository. The pytest
flag `--all_domains` will detect these domain directories and run them.

Original source data can be found on Denali for most (all?) domains in `/home/jmccreight/pywatershed_data`.

# Generating data

# scripts/
The `test_data/scripts` subdirectory contains code for reproducing test data in the domains. Importantly, `test_run_domains.py` should be run occasionally to update the test data. After running the domains, NetCDF files can be created from simulation outputs by running the tests in `test_nc_domains.py`. E.g.,

This contains code for reproducing test data in the domains. Importantly, `run_domains.py` should be run
occasionally to update the test data.
```shell
pytest -v -n auto test_run_domains.py
pytest -v -n auto test_nc_domains.py
```

Original source data can be found on Denali for most (all?) domains in `/home/jmccreight/pywatershed_data`.
NetCDF dependencies are encoded implicitly into the `pytest` fixture system: `test_nc_domains.py` uses a custom test parametrization with `pytest_generate_tests` to map each CSV file created by the domain simulation to one or more NetCDF files, which are then aggregated into further files on session teardown by [yield fixtures](https://docs.pytest.org/en/7.2.x/how-to/fixtures.html#teardown-cleanup-aka-fixture-finalization). A [filelock](https://pytest-xdist.readthedocs.io/en/latest/how-to.html#making-session-scoped-fixtures-execute-only-once) is used to ensure aggregate files are only created once, even with multiple `pytest-xdist` workers.
145 changes: 50 additions & 95 deletions test_data/scripts/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
import sys
from fnmatch import fnmatch
from platform import processor
from typing import List
import numpy as np

import pytest

from pywatershed import CsvFile, Soltab


def pytest_addoption(parser):
parser.addoption(
Expand Down Expand Up @@ -55,32 +59,6 @@ def exe():
# This would change to handle other/additional schedulers
domain_globs_schedule = ["*conus*"]

# For generating timeseries of previous states
previous_vars = [
"dprst_stor_hru",
"freeh2o",
"hru_impervstor",
"pk_ice",
"pref_flow_stor",
"slow_stor",
"soil_lower",
"soil_moist",
"soil_rechr",
"ssres_stor",
]

misc_nc_file_vars = [
"infil",
"sroff",
"ssres_flow",
"gwres_flow",
]


final_nc_file_vars = [
"through_rain",
]


def scheduler_active():
slurm = os.getenv("SLURM_JOB_ID") is not None
Expand All @@ -102,26 +80,27 @@ def enforce_scheduler(test_dir):
return None


def collect_simulations(domain_list: list, force: bool):
def collect_simulations(domain_list: list, force: bool = True, verbose: bool = False):
simulations = {}
for test_dir in test_dirs:
for pth in test_dir.iterdir():
# checking for prcp.cbh ensure this is a self-contained run (all
# files in repo)
if (
(test_dir / "prcp.cbh").exists()
and pth.is_file()
and pth.name == "control.test"
):
if len(domain_list) and (test_dir.name not in domain_list):
continue

if not force:
enforce_scheduler(test_dir)

# add simulation
simulations[str(test_dir)] = pth.name

# ensure this is a self-contained run (all files in repo)
if not (test_dir / "prcp.cbh").exists():
continue

# filter selected domains
if len(domain_list) and (test_dir.name not in domain_list):
continue

# optionally enforce scheduler
if not force:
enforce_scheduler(test_dir)

# if control file is found, add simulation
ctrl_file = next(iter([p for p in test_dir.iterdir() if p.is_file() and p.name == "control.test"]), None)
if ctrl_file:
simulations[str(test_dir)] = ctrl_file.name

# make sure all requested domains were found
if len(domain_list) and (len(simulations) < len(domain_list)):
requested = set(domain_list)
found = [pl.Path(dd).name for dd in simulations.keys()]
Expand All @@ -132,13 +111,14 @@ def collect_simulations(domain_list: list, force: bool):
)
pytest.exit(msg)

print("\nrun_domains.py found the following domains to run:\n")
print(f"{list(simulations.keys())}")
if verbose:
print("\nrun_domains.py found the following domains to run:\n")
print(f"{list(simulations.keys())}")

return simulations


def collect_csv_files(domain_list: list, force: bool):
simulations = collect_simulations(domain_list, force)
def collect_csv_files(simulations: list) -> List[pl.Path]:
csv_files = []
for key, value in simulations.items():
output_pth = pl.Path(key) / "output"
Expand All @@ -147,63 +127,38 @@ def collect_csv_files(domain_list: list, force: bool):
return csv_files


def collect_misc_nc_files(domain_list: list, var_list: list, force: bool):
simulations = collect_simulations(domain_list, force)
def collect_nc_files(simulations: list, var_list: list):
sim_dirs = list(simulations.keys())
misc_nc_files = []
nc_files = []
for var in var_list:
for sim in sim_dirs:
the_file = pl.Path(sim) / f"output/{var}.nc"
# assert the_file.exists()
misc_nc_files += [the_file.with_suffix("")]

return misc_nc_files
nc_files += [(pl.Path(sim) / f"output/{var}.nc")]
return nc_files


def pytest_generate_tests(metafunc):
domain_list = metafunc.config.getoption("domain")
force = metafunc.config.getoption("force")
simulations = collect_simulations(domain_list, force)
csv_files = collect_csv_files(simulations)

if "simulations" in metafunc.fixturenames:
simulations = collect_simulations(domain_list, force)
sim_list = [
{"ws": key, "control_file": val}
for key, val in simulations.items()
]
ids = [pl.Path(ss).name for ss in simulations.keys()]
metafunc.parametrize("simulations", sim_list, ids=ids)

if "csv_files" in metafunc.fixturenames:
csv_files = collect_csv_files(domain_list, force)
ids = [ff.parent.parent.name + ":" + ff.name for ff in csv_files]
metafunc.parametrize("csv_files", csv_files, ids=ids)

if "csv_files_prev" in metafunc.fixturenames:
csv_files = collect_csv_files(domain_list, force)
csv_files = [
ff for ff in csv_files if ff.with_suffix("").name in previous_vars
key = "simulation"
if key in metafunc.fixturenames:
sims = [
{"ws": key, "control_file": val} for key, val in simulations.items()
]
ids = [ff.parent.parent.name + ":" + ff.name for ff in csv_files]
metafunc.parametrize("csv_files_prev", csv_files, ids=ids)
ids = [pl.Path(k).name for k in simulations.keys()]
metafunc.parametrize(key, sims, ids=ids, scope="session")

if "misc_nc_files_input" in metafunc.fixturenames:
misc_nc_files = collect_misc_nc_files(
domain_list, misc_nc_file_vars, force
)
ids = [ff.parent.parent.name + ":" + ff.name for ff in misc_nc_files]
metafunc.parametrize("misc_nc_files_input", misc_nc_files, ids=ids)
key = "soltab_file"
if key in metafunc.fixturenames:
soltab_files = [pl.Path(k) / "soltab_debug" for k in simulations.keys()]
ids = [f.parent.name + ":" + f.name for f in soltab_files]
metafunc.parametrize(key, soltab_files, ids=ids, scope="session")

if "misc_nc_final_input" in metafunc.fixturenames:
misc_nc_files = collect_misc_nc_files(
domain_list, final_nc_file_vars, force
)
ids = [ff.parent.parent.name + ":" + ff.name for ff in misc_nc_files]
metafunc.parametrize("misc_nc_final_input", misc_nc_files, ids=ids)
key = "csv_file"
if key in metafunc.fixturenames:
ids = [f.parent.name + ":" + f.name for f in csv_files]
metafunc.parametrize(key, csv_files, ids=ids)

if "soltab_file" in metafunc.fixturenames:
simulations = collect_simulations(domain_list, force)
soltab_files = [
pl.Path(kk) / "soltab_debug" for kk in simulations.keys()
]
ids = [ff.parent.name + ":" + ff.name for ff in soltab_files]
metafunc.parametrize("soltab_file", soltab_files, ids=ids)

8 changes: 7 additions & 1 deletion test_data/scripts/pytest.ini
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
[pytest]
addopts = --order-dependencies
addopts = --order-dependencies
python_files =
test_*.py
python_functions =
create_*
make_*
test_*
Loading

0 comments on commit ecd8c80

Please sign in to comment.