Skip to content

Commit

Permalink
refactor test data generation
Browse files Browse the repository at this point in the history
  • Loading branch information
wpbonelli committed Sep 13, 2023
1 parent a17937a commit ecd8c80
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 196 deletions.
1 change: 1 addition & 0 deletions environment_w_jupyter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ dependencies:
- black
- click != 8.1.0
- isort
- filelock
- flake8
- git+https://github.com/modflowpy/flopy.git
- jupyter_black
Expand Down
20 changes: 11 additions & 9 deletions test_data/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# test_data/
# Test data

This directory contains
The `test_data` directory contains
* domain directories for domain tests
* scripts/ which document how domain tests are established.

* `scripts/`, which contains scripts to run domain simulations and generate data.

## Domain directories

Expand All @@ -15,15 +14,18 @@ output to a disk faster than the one where the repository lives. However, we do
version control certain files in the directory. See the `conus_2yr/README.md` for details
on how it is set up.


Other domain directories can be run from pytest given the repository. The pytest
flag `--all_domains` will detect these domain directories and run them.

Original source data can be found on Denali for most (all?) domains in `/home/jmccreight/pywatershed_data`.

# Generating data

# scripts/
The `test_data/scripts` subdirectory contains code for reproducing test data in the domains. Importantly, `test_run_domains.py` should be run occasionally to update the test data. After running the domains, NetCDF files can be created from simulation outputs by running the tests in `test_nc_domains.py`. E.g.,

This contains code for reproducing test data in the domains. Importantly, `run_domains.py` should be run
occasionally to update the test data.
```shell
pytest -v -n auto test_run_domains.py
pytest -v -n auto test_nc_domains.py
```

Original source data can be found on Denali for most (all?) domains in `/home/jmccreight/pywatershed_data`.
NetCDF dependencies are encoded implicitly into the `pytest` fixture system: `test_nc_domains.py` uses a custom test parametrization with `pytest_generate_tests` to map each CSV file created by the domain simulation to one or more NetCDF files, which are then aggregated into further files on session teardown by [yield fixtures](https://docs.pytest.org/en/7.2.x/how-to/fixtures.html#teardown-cleanup-aka-fixture-finalization). A [filelock](https://pytest-xdist.readthedocs.io/en/latest/how-to.html#making-session-scoped-fixtures-execute-only-once) is used to ensure aggregate files are only created once, even with multiple `pytest-xdist` workers.
145 changes: 50 additions & 95 deletions test_data/scripts/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
import sys
from fnmatch import fnmatch
from platform import processor
from typing import List
import numpy as np

import pytest

from pywatershed import CsvFile, Soltab


def pytest_addoption(parser):
parser.addoption(
Expand Down Expand Up @@ -55,32 +59,6 @@ def exe():
# This would change to handle other/additional schedulers
domain_globs_schedule = ["*conus*"]

# For generating timeseries of previous states
previous_vars = [
"dprst_stor_hru",
"freeh2o",
"hru_impervstor",
"pk_ice",
"pref_flow_stor",
"slow_stor",
"soil_lower",
"soil_moist",
"soil_rechr",
"ssres_stor",
]

misc_nc_file_vars = [
"infil",
"sroff",
"ssres_flow",
"gwres_flow",
]


final_nc_file_vars = [
"through_rain",
]


def scheduler_active():
slurm = os.getenv("SLURM_JOB_ID") is not None
Expand All @@ -102,26 +80,27 @@ def enforce_scheduler(test_dir):
return None


def collect_simulations(domain_list: list, force: bool):
def collect_simulations(domain_list: list, force: bool = True, verbose: bool = False):
simulations = {}
for test_dir in test_dirs:
for pth in test_dir.iterdir():
# checking for prcp.cbh ensure this is a self-contained run (all
# files in repo)
if (
(test_dir / "prcp.cbh").exists()
and pth.is_file()
and pth.name == "control.test"
):
if len(domain_list) and (test_dir.name not in domain_list):
continue

if not force:
enforce_scheduler(test_dir)

# add simulation
simulations[str(test_dir)] = pth.name

# ensure this is a self-contained run (all files in repo)
if not (test_dir / "prcp.cbh").exists():
continue

# filter selected domains
if len(domain_list) and (test_dir.name not in domain_list):
continue

# optionally enforce scheduler
if not force:
enforce_scheduler(test_dir)

# if control file is found, add simulation
ctrl_file = next(iter([p for p in test_dir.iterdir() if p.is_file() and p.name == "control.test"]), None)
if ctrl_file:
simulations[str(test_dir)] = ctrl_file.name

# make sure all requested domains were found
if len(domain_list) and (len(simulations) < len(domain_list)):
requested = set(domain_list)
found = [pl.Path(dd).name for dd in simulations.keys()]
Expand All @@ -132,13 +111,14 @@ def collect_simulations(domain_list: list, force: bool):
)
pytest.exit(msg)

print("\nrun_domains.py found the following domains to run:\n")
print(f"{list(simulations.keys())}")
if verbose:
print("\nrun_domains.py found the following domains to run:\n")
print(f"{list(simulations.keys())}")

return simulations


def collect_csv_files(domain_list: list, force: bool):
simulations = collect_simulations(domain_list, force)
def collect_csv_files(simulations: list) -> List[pl.Path]:
csv_files = []
for key, value in simulations.items():
output_pth = pl.Path(key) / "output"
Expand All @@ -147,63 +127,38 @@ def collect_csv_files(domain_list: list, force: bool):
return csv_files


def collect_misc_nc_files(domain_list: list, var_list: list, force: bool):
simulations = collect_simulations(domain_list, force)
def collect_nc_files(simulations: list, var_list: list):
sim_dirs = list(simulations.keys())
misc_nc_files = []
nc_files = []
for var in var_list:
for sim in sim_dirs:
the_file = pl.Path(sim) / f"output/{var}.nc"
# assert the_file.exists()
misc_nc_files += [the_file.with_suffix("")]

return misc_nc_files
nc_files += [(pl.Path(sim) / f"output/{var}.nc")]
return nc_files


def pytest_generate_tests(metafunc):
domain_list = metafunc.config.getoption("domain")
force = metafunc.config.getoption("force")
simulations = collect_simulations(domain_list, force)
csv_files = collect_csv_files(simulations)

if "simulations" in metafunc.fixturenames:
simulations = collect_simulations(domain_list, force)
sim_list = [
{"ws": key, "control_file": val}
for key, val in simulations.items()
]
ids = [pl.Path(ss).name for ss in simulations.keys()]
metafunc.parametrize("simulations", sim_list, ids=ids)

if "csv_files" in metafunc.fixturenames:
csv_files = collect_csv_files(domain_list, force)
ids = [ff.parent.parent.name + ":" + ff.name for ff in csv_files]
metafunc.parametrize("csv_files", csv_files, ids=ids)

if "csv_files_prev" in metafunc.fixturenames:
csv_files = collect_csv_files(domain_list, force)
csv_files = [
ff for ff in csv_files if ff.with_suffix("").name in previous_vars
key = "simulation"
if key in metafunc.fixturenames:
sims = [
{"ws": key, "control_file": val} for key, val in simulations.items()
]
ids = [ff.parent.parent.name + ":" + ff.name for ff in csv_files]
metafunc.parametrize("csv_files_prev", csv_files, ids=ids)
ids = [pl.Path(k).name for k in simulations.keys()]
metafunc.parametrize(key, sims, ids=ids, scope="session")

if "misc_nc_files_input" in metafunc.fixturenames:
misc_nc_files = collect_misc_nc_files(
domain_list, misc_nc_file_vars, force
)
ids = [ff.parent.parent.name + ":" + ff.name for ff in misc_nc_files]
metafunc.parametrize("misc_nc_files_input", misc_nc_files, ids=ids)
key = "soltab_file"
if key in metafunc.fixturenames:
soltab_files = [pl.Path(k) / "soltab_debug" for k in simulations.keys()]
ids = [f.parent.name + ":" + f.name for f in soltab_files]
metafunc.parametrize(key, soltab_files, ids=ids, scope="session")

if "misc_nc_final_input" in metafunc.fixturenames:
misc_nc_files = collect_misc_nc_files(
domain_list, final_nc_file_vars, force
)
ids = [ff.parent.parent.name + ":" + ff.name for ff in misc_nc_files]
metafunc.parametrize("misc_nc_final_input", misc_nc_files, ids=ids)
key = "csv_file"
if key in metafunc.fixturenames:
ids = [f.parent.name + ":" + f.name for f in csv_files]
metafunc.parametrize(key, csv_files, ids=ids)

if "soltab_file" in metafunc.fixturenames:
simulations = collect_simulations(domain_list, force)
soltab_files = [
pl.Path(kk) / "soltab_debug" for kk in simulations.keys()
]
ids = [ff.parent.name + ":" + ff.name for ff in soltab_files]
metafunc.parametrize("soltab_file", soltab_files, ids=ids)

8 changes: 7 additions & 1 deletion test_data/scripts/pytest.ini
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
[pytest]
addopts = --order-dependencies
addopts = --order-dependencies
python_files =
test_*.py
python_functions =
create_*
make_*
test_*
Loading

0 comments on commit ecd8c80

Please sign in to comment.