From 01474ccd8af6b52196e25acfe6d6bee9d60e14c7 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 17 Jul 2023 11:58:31 +0100 Subject: [PATCH 01/18] Add methods to look up input files from the data registry --- .gitignore | 4 ++ ceci/pipeline.py | 124 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 125 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 47069f2..5368f30 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,7 @@ nb/Untitled.ipynb out/ test/ dask-worker-space/ +ceci.code-workspace +dask-scratch-space +inprogress* +ceci/_version.py diff --git a/ceci/pipeline.py b/ceci/pipeline.py index 1551034..35d6015 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -319,7 +319,7 @@ def __init__(self, stages, launcher_config, **kwargs): """ self.launcher_config = launcher_config - self.overall_inputs = kwargs.get("overall_inputs", {}).copy() + self.overall_inputs = self.process_overall_inputs(kwargs.get("overall_inputs", {})) self.modules = kwargs.get("modules", "") # These are populated as we add stages below @@ -333,6 +333,7 @@ def __init__(self, stages, launcher_config, **kwargs): self.pipeline_outputs = None self.stages_config = None self.stage_config_data = None + self.data_registry = None self.global_config = {} # Store the individual stage informaton @@ -364,6 +365,7 @@ def create(pipe_config): "log_dir": pipe_config.get("log_dir", "."), "resume": pipe_config.get("resume", False), "flow_chart": pipe_config.get("flow_chart", ""), + "registrar": pipe_config.get("registrar", None), } launcher_dict = dict(cwl=CWLPipeline, parsl=ParslPipeline, mini=MiniPipeline) @@ -451,6 +453,118 @@ def print_stages(self, stream=sys.stdout): stream.write(f"{stage.instance_name:20}: {str(stage)}") stream.write("\n") + def setup_data_registry(self, registry_config): + """ + Set up the data registry. + + Parameters + ---------- + registry_config : dict + A dictionary with information about the data registry to use + """ + from dataregistry import Registrar, Query, create_db_engine, SCHEMA_VERSION + + # Use the default config file recommended by the data registry docs + # if none is specified. + config_file = registry_config.get("config") + if config_file is None: + config_file = "~/.config_reg_access" + config_file = os.path.expandvars(config_file) + + # Make the database connection and the two main objects + # we use to connect with it. + engine, dialect = create_db_engine(config_file=config) + reg = Registrar(db_engine=engine, dialect=dialect, schema_version=SCHEMA_VERSION) + query = Query(engine, dialect, schema_version=SCHEMA_VERSION) + + # Save the things that may be useful. + return data_registry = { + "registrar": reg, + "query": query, + "config": registry_config, + "root_dir": reg.root_dir, + } + + + def data_registry_lookup(self, info): + """ + Look up a dataset in the data registry + + Parameters + ---------- + info : dict + A dictionary with information about the dataset to look up. Must contain + either an id, and alias, or a name + """ + from dataregistry.query import Filter + + if self.data_registry is None: + raise ValueError("No data registry configured") + + root_dir = self.data_registry["root_dir"] + query = self.data_registry["query"] + + # We have various ways of looking up a dataset + # 1. By id + # 2. By name + # 3. By alias + if "id" in info: + filter = Filter("dataset.dataset_id", "==", info["id"]) + elif "name" in info: + filter = Filter("dataset.name", "==", info["name"]) + elif "alias" in info: + raise NotImplementedError("Alias lookup not yet implemented") + else: + raise ValueError("Must specify either id or name in registry lookup") + + # Main finder method + results = query.find_datasets( + ["dataset.dataset_id", "dataset.name", "dataset.relative_path"], + [filter], + ) + # Check that we find exactly one dataset matching the query + results = list(results) + if len(results) == 0: + raise ValueError(f"Could not find dataset matching {info} in registry") + elif len(results) > 1: + raise ValueError(f"Found multiple datasets matching {info} in registry") + else: + relative_path = results[0]["dataset"]["relative_path"] + + return os.path.join(root_dir, relative_path) + + + def process_overall_inputs(self, inputs): + """ + Find the correct paths for the overall inputs to the pipeline. + + Paths may be explicit strings, or may be looked up in the data registry. + + Parameters + ---------- + inputs : dict + A dictionary of inputs to the pipeline + """ + paths = {} + for tag, value in inputs.items(): + # Case 1, explicit lookup (the original version) + if isinstance(value, str): + paths[tag] = value + # Case 2, dictionary with lookup method + elif isinstance(value, dict): + # For now the only lookup method is registry, + # but we might add others in the future, e.g. + # for remote lookup by URL. + if "lookup" not in value: + raise ValueError(f"Missing lookup method for input {tag}") + if paths["lookup"] == "registry": + paths[tag] = self.data_registry_lookup(value) + else: + raise ValueError(f"Unknown lookup method {value['lookup']}") + else: + raise ValueError(f"Unknown input type {type(value)}") + return paths + @classmethod def read(cls, pipeline_config_filename, extra_config=None, dry_run=False): """Create a pipeline for a configuration dictionary from a yaml file and extra optional parameters @@ -779,9 +893,13 @@ def initialize(self, overall_inputs, run_config, stages_config): self.run_config : copy of configuration parameters on how to run the pipeline """ - # Make a copy, since we maybe be modifying these - self.overall_inputs = overall_inputs.copy() + # Set up paths to our overall input files + if run_config.get("registry") is not None: + self.data_registry = self.setup_data_registry(run_config["registry"]) + self.overall_inputs = self.process_overall_inputs(overall_inputs) self.pipeline_files.insert_paths(self.overall_inputs) + + # Make a copy, since we maybe be modifying these self.run_config = run_config.copy() self.stages_config = stages_config From 20f20570a437f96fbfabe8198a5d7a446819ca50 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 17 Jul 2023 14:48:00 +0100 Subject: [PATCH 02/18] bug fixes --- ceci/pipeline.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ceci/pipeline.py b/ceci/pipeline.py index 35d6015..4a9cc72 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -466,6 +466,9 @@ def setup_data_registry(self, registry_config): # Use the default config file recommended by the data registry docs # if none is specified. + + # TODO: interactive version + config_file = registry_config.get("config") if config_file is None: config_file = "~/.config_reg_access" @@ -473,12 +476,12 @@ def setup_data_registry(self, registry_config): # Make the database connection and the two main objects # we use to connect with it. - engine, dialect = create_db_engine(config_file=config) + engine, dialect = create_db_engine(config_file=config_file) reg = Registrar(db_engine=engine, dialect=dialect, schema_version=SCHEMA_VERSION) query = Query(engine, dialect, schema_version=SCHEMA_VERSION) # Save the things that may be useful. - return data_registry = { + return { "registrar": reg, "query": query, "config": registry_config, From d6e9f8deadca48062e9873009575accb516f90c7 Mon Sep 17 00:00:00 2001 From: joezuntz Date: Mon, 17 Jul 2023 07:13:53 -0700 Subject: [PATCH 03/18] Fixes on perlmutter - can now find right path --- ceci/pipeline.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/ceci/pipeline.py b/ceci/pipeline.py index 4a9cc72..f5ec7e8 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -318,8 +318,9 @@ def __init__(self, stages, launcher_config, **kwargs): Space seperated path of modules loaded for this pipeline """ self.launcher_config = launcher_config + self.data_registry = None - self.overall_inputs = self.process_overall_inputs(kwargs.get("overall_inputs", {})) + self.overall_inputs = {} self.modules = kwargs.get("modules", "") # These are populated as we add stages below @@ -333,7 +334,6 @@ def __init__(self, stages, launcher_config, **kwargs): self.pipeline_outputs = None self.stages_config = None self.stage_config_data = None - self.data_registry = None self.global_config = {} # Store the individual stage informaton @@ -365,7 +365,7 @@ def create(pipe_config): "log_dir": pipe_config.get("log_dir", "."), "resume": pipe_config.get("resume", False), "flow_chart": pipe_config.get("flow_chart", ""), - "registrar": pipe_config.get("registrar", None), + "registry": pipe_config.get("registry", None), } launcher_dict = dict(cwl=CWLPipeline, parsl=ParslPipeline, mini=MiniPipeline) @@ -462,7 +462,7 @@ def setup_data_registry(self, registry_config): registry_config : dict A dictionary with information about the data registry to use """ - from dataregistry import Registrar, Query, create_db_engine, SCHEMA_VERSION + from dataregistry import Registrar, Query, create_db_engine, SCHEMA_VERSION, ownertypeenum # Use the default config file recommended by the data registry docs # if none is specified. @@ -471,13 +471,14 @@ def setup_data_registry(self, registry_config): config_file = registry_config.get("config") if config_file is None: - config_file = "~/.config_reg_access" + config_file = "$~/.config_reg_access" + config_file = os.path.expanduser(config_file) config_file = os.path.expandvars(config_file) # Make the database connection and the two main objects # we use to connect with it. engine, dialect = create_db_engine(config_file=config_file) - reg = Registrar(db_engine=engine, dialect=dialect, schema_version=SCHEMA_VERSION) + reg = Registrar(engine, dialect, ownertypeenum.user, owner="user", schema_version=SCHEMA_VERSION) query = Query(engine, dialect, schema_version=SCHEMA_VERSION) # Save the things that may be useful. @@ -486,6 +487,8 @@ def setup_data_registry(self, registry_config): "query": query, "config": registry_config, "root_dir": reg.root_dir, + "owner": reg._owner, + "owner_type": reg._owner_type } @@ -500,12 +503,13 @@ def data_registry_lookup(self, info): either an id, and alias, or a name """ from dataregistry.query import Filter - if self.data_registry is None: raise ValueError("No data registry configured") root_dir = self.data_registry["root_dir"] query = self.data_registry["query"] + owner = self.data_registry["owner"] + owner_type = self.data_registry["owner_type"] # We have various ways of looking up a dataset # 1. By id @@ -532,9 +536,9 @@ def data_registry_lookup(self, info): elif len(results) > 1: raise ValueError(f"Found multiple datasets matching {info} in registry") else: - relative_path = results[0]["dataset"]["relative_path"] - - return os.path.join(root_dir, relative_path) + relative_path = results[0].relative_path + full_path = os.path.join(root_dir, owner_type, owner, relative_path) + return full_path def process_overall_inputs(self, inputs): @@ -560,10 +564,14 @@ def process_overall_inputs(self, inputs): # for remote lookup by URL. if "lookup" not in value: raise ValueError(f"Missing lookup method for input {tag}") - if paths["lookup"] == "registry": + if value["lookup"] == "registry": paths[tag] = self.data_registry_lookup(value) + print(f"Located input {tag} at {paths[tag]} using data registry") + else: raise ValueError(f"Unknown lookup method {value['lookup']}") + elif value is None: + paths[tag] = None else: raise ValueError(f"Unknown input type {type(value)}") return paths From d98e9926eb0d33d424f6a67ed65893d2db9c7b11 Mon Sep 17 00:00:00 2001 From: joezuntz Date: Mon, 17 Jul 2023 07:33:40 -0700 Subject: [PATCH 04/18] Tidy query code --- ceci/pipeline.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ceci/pipeline.py b/ceci/pipeline.py index f5ec7e8..084d196 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -471,7 +471,7 @@ def setup_data_registry(self, registry_config): config_file = registry_config.get("config") if config_file is None: - config_file = "$~/.config_reg_access" + config_file = "~/.config_reg_access" config_file = os.path.expanduser(config_file) config_file = os.path.expandvars(config_file) @@ -487,8 +487,6 @@ def setup_data_registry(self, registry_config): "query": query, "config": registry_config, "root_dir": reg.root_dir, - "owner": reg._owner, - "owner_type": reg._owner_type } @@ -508,8 +506,6 @@ def data_registry_lookup(self, info): root_dir = self.data_registry["root_dir"] query = self.data_registry["query"] - owner = self.data_registry["owner"] - owner_type = self.data_registry["owner_type"] # We have various ways of looking up a dataset # 1. By id @@ -526,7 +522,7 @@ def data_registry_lookup(self, info): # Main finder method results = query.find_datasets( - ["dataset.dataset_id", "dataset.name", "dataset.relative_path"], + ["dataset.dataset_id", "dataset.name", "dataset.relative_path", "dataset.owner_type", "dataset.owner"], [filter], ) # Check that we find exactly one dataset matching the query @@ -535,9 +531,10 @@ def data_registry_lookup(self, info): raise ValueError(f"Could not find dataset matching {info} in registry") elif len(results) > 1: raise ValueError(f"Found multiple datasets matching {info} in registry") - else: - relative_path = results[0].relative_path - full_path = os.path.join(root_dir, owner_type, owner, relative_path) + + # Get the absolute path + r = results[0] + full_path = os.path.join(root_dir, r.owner_type, r.owner, r.relative_path) return full_path From 7cea70b3d2f748aabbb43b4e57f025630c85051f Mon Sep 17 00:00:00 2001 From: joezuntz Date: Mon, 17 Jul 2023 07:35:32 -0700 Subject: [PATCH 05/18] remove print --- ceci/pipeline.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ceci/pipeline.py b/ceci/pipeline.py index 084d196..1c36311 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -563,8 +563,6 @@ def process_overall_inputs(self, inputs): raise ValueError(f"Missing lookup method for input {tag}") if value["lookup"] == "registry": paths[tag] = self.data_registry_lookup(value) - print(f"Located input {tag} at {paths[tag]} using data registry") - else: raise ValueError(f"Unknown lookup method {value['lookup']}") elif value is None: From a0cc0abfd3497d2260db8cc9e29e32dab31173c6 Mon Sep 17 00:00:00 2001 From: joezuntz Date: Mon, 17 Jul 2023 07:55:45 -0700 Subject: [PATCH 06/18] simplify to a single lookup approach --- ceci/pipeline.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ceci/pipeline.py b/ceci/pipeline.py index 1c36311..9e3e6b4 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -556,15 +556,9 @@ def process_overall_inputs(self, inputs): paths[tag] = value # Case 2, dictionary with lookup method elif isinstance(value, dict): - # For now the only lookup method is registry, - # but we might add others in the future, e.g. - # for remote lookup by URL. - if "lookup" not in value: - raise ValueError(f"Missing lookup method for input {tag}") - if value["lookup"] == "registry": - paths[tag] = self.data_registry_lookup(value) - else: - raise ValueError(f"Unknown lookup method {value['lookup']}") + # This means that we will look up a path + # using the data registry + paths[tag] = self.data_registry_lookup(value) elif value is None: paths[tag] = None else: From f6e3315e9445149d2983b13b482581b10782fa64 Mon Sep 17 00:00:00 2001 From: joezuntz Date: Mon, 17 Jul 2023 08:00:08 -0700 Subject: [PATCH 07/18] refuse to try data registry away from NERSC --- ceci/pipeline.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ceci/pipeline.py b/ceci/pipeline.py index 9e3e6b4..bcab2f5 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -457,6 +457,8 @@ def setup_data_registry(self, registry_config): """ Set up the data registry. + # TODO: interactive version + Parameters ---------- registry_config : dict @@ -466,15 +468,16 @@ def setup_data_registry(self, registry_config): # Use the default config file recommended by the data registry docs # if none is specified. - - # TODO: interactive version - config_file = registry_config.get("config") if config_file is None: config_file = "~/.config_reg_access" config_file = os.path.expanduser(config_file) config_file = os.path.expandvars(config_file) + if not os.environ.get("NERSC_HOST"): + warnings.warn("The Data Registry is only available on NERSC: not setting it up now.") + return None + # Make the database connection and the two main objects # we use to connect with it. engine, dialect = create_db_engine(config_file=config_file) From 937f76d30db78de300843f31eaff06524d3d1ba8 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Tue, 18 Jul 2023 08:27:38 +0100 Subject: [PATCH 08/18] import warnings and add no-covers --- ceci/pipeline.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ceci/pipeline.py b/ceci/pipeline.py index bcab2f5..a9d74a8 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -6,6 +6,7 @@ import yaml import shutil from abc import abstractmethod +import warnings from .stage import PipelineStage from . import minirunner @@ -453,7 +454,7 @@ def print_stages(self, stream=sys.stdout): stream.write(f"{stage.instance_name:20}: {str(stage)}") stream.write("\n") - def setup_data_registry(self, registry_config): + def setup_data_registry(self, registry_config): #pragma: no cover """ Set up the data registry. @@ -493,7 +494,7 @@ def setup_data_registry(self, registry_config): } - def data_registry_lookup(self, info): + def data_registry_lookup(self, info): #pragma: no cover """ Look up a dataset in the data registry @@ -503,7 +504,7 @@ def data_registry_lookup(self, info): A dictionary with information about the dataset to look up. Must contain either an id, and alias, or a name """ - from dataregistry.query import Filter + from dataregistry import Filter if self.data_registry is None: raise ValueError("No data registry configured") @@ -558,7 +559,7 @@ def process_overall_inputs(self, inputs): if isinstance(value, str): paths[tag] = value # Case 2, dictionary with lookup method - elif isinstance(value, dict): + elif isinstance(value, dict): #pragma: no cover # This means that we will look up a path # using the data registry paths[tag] = self.data_registry_lookup(value) From b6555292077fedeeb42452800c5c6f08e38b985b Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 31 Oct 2023 10:32:49 +0100 Subject: [PATCH 09/18] Update dataregistry interface to latest version --- ceci/pipeline.py | 51 +++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/ceci/pipeline.py b/ceci/pipeline.py index a9d74a8..b1c3254 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -465,32 +465,23 @@ def setup_data_registry(self, registry_config): #pragma: no cover registry_config : dict A dictionary with information about the data registry to use """ - from dataregistry import Registrar, Query, create_db_engine, SCHEMA_VERSION, ownertypeenum + from dataregistry import DataRegistry - # Use the default config file recommended by the data registry docs - # if none is specified. - config_file = registry_config.get("config") - if config_file is None: - config_file = "~/.config_reg_access" - config_file = os.path.expanduser(config_file) - config_file = os.path.expandvars(config_file) + # Establish a connection to the data registry. If the config_file is + # None the dataregistry will assume the users config file is in the + # default location (~/.config_reg_access). + registry = DataRegistry(config_file=registry_config.get("config", None), + owner_type=registry_config.get("config", "user"), + owner=registry_config.get("owner", None)) if not os.environ.get("NERSC_HOST"): warnings.warn("The Data Registry is only available on NERSC: not setting it up now.") return None - # Make the database connection and the two main objects - # we use to connect with it. - engine, dialect = create_db_engine(config_file=config_file) - reg = Registrar(engine, dialect, ownertypeenum.user, owner="user", schema_version=SCHEMA_VERSION) - query = Query(engine, dialect, schema_version=SCHEMA_VERSION) - # Save the things that may be useful. return { - "registrar": reg, - "query": query, + "registry": registry, "config": registry_config, - "root_dir": reg.root_dir, } @@ -504,42 +495,36 @@ def data_registry_lookup(self, info): #pragma: no cover A dictionary with information about the dataset to look up. Must contain either an id, and alias, or a name """ - from dataregistry import Filter if self.data_registry is None: raise ValueError("No data registry configured") - root_dir = self.data_registry["root_dir"] - query = self.data_registry["query"] + registry = self.data_registry["registry"] # We have various ways of looking up a dataset - # 1. By id - # 2. By name - # 3. By alias + # 1. By the `dataset_id` + # 2. By the dataset `name` + # 3. By a dataset alias `name` if "id" in info: - filter = Filter("dataset.dataset_id", "==", info["id"]) + return registry.Query.get_dataset_absolute_path(info["id"]) elif "name" in info: - filter = Filter("dataset.name", "==", info["name"]) + filter = registry.Query.gen_filter("dataset.name", "==", info["name"]) elif "alias" in info: raise NotImplementedError("Alias lookup not yet implemented") else: raise ValueError("Must specify either id or name in registry lookup") # Main finder method - results = query.find_datasets( - ["dataset.dataset_id", "dataset.name", "dataset.relative_path", "dataset.owner_type", "dataset.owner"], - [filter], - ) + results = registry.Query.find_datasets(["dataset.dataset_id"], [filter]) + # Check that we find exactly one dataset matching the query results = list(results) if len(results) == 0: - raise ValueError(f"Could not find dataset matching {info} in registry") + raise ValueError(f"Could not find any dataset matching {info} in registry") elif len(results) > 1: raise ValueError(f"Found multiple datasets matching {info} in registry") # Get the absolute path - r = results[0] - full_path = os.path.join(root_dir, r.owner_type, r.owner, r.relative_path) - return full_path + return registry.Query.get_dataset_absolute_path(results[0].dataset_id) def process_overall_inputs(self, inputs): From f756c9c91411e6ccbe2e72b360ec14e8fb6a09b9 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 3 Nov 2023 14:54:28 +0100 Subject: [PATCH 10/18] Start implementing dataregistry ci --- .github/workflows/ci.yml | 31 ++++++++++- ceci/pipeline.py | 11 ++-- pyproject.toml | 6 ++ tests/create_registry_entries.py | 28 ++++++++++ tests/test_dataregistry.yml | 94 ++++++++++++++++++++++++++++++++ 5 files changed, 164 insertions(+), 6 deletions(-) create mode 100644 tests/create_registry_entries.py create mode 100644 tests/test_dataregistry.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0b362d6..8355e3b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,7 +5,7 @@ name: Continuous Integration on: push: - branches: [ master ] + branches: [ master, data-registry ] pull_request: branches: [ master ] @@ -41,3 +41,32 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 + test_data_registry: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.9] + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + steps: + - name: Checkout dataregistry repository + uses: actions/checkout@v2 + with: + repository: LSSTDESC/dataregistry + path: './dataregistry' + + - name: Install + run: | + sudo apt-get update && sudo apt-get -y install libopenmpi-dev openmpi-bin graphviz graphviz-dev + pip install .[all] diff --git a/ceci/pipeline.py b/ceci/pipeline.py index b1c3254..44df0fa 100644 --- a/ceci/pipeline.py +++ b/ceci/pipeline.py @@ -471,12 +471,13 @@ def setup_data_registry(self, registry_config): #pragma: no cover # None the dataregistry will assume the users config file is in the # default location (~/.config_reg_access). registry = DataRegistry(config_file=registry_config.get("config", None), - owner_type=registry_config.get("config", "user"), - owner=registry_config.get("owner", None)) + owner_type=registry_config.get("owner_type", "user"), + owner=registry_config.get("owner", None), + root_dir=registry_config.get("root_dir", None)) - if not os.environ.get("NERSC_HOST"): - warnings.warn("The Data Registry is only available on NERSC: not setting it up now.") - return None + #if not os.environ.get("NERSC_HOST"): + # warnings.warn("The Data Registry is only available on NERSC: not setting it up now.") + # return None # Save the things that may be useful. return { diff --git a/pyproject.toml b/pyproject.toml index aad7c34..9dcd7e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,12 @@ test = [ "dask[distributed]", ] +test_dataregistry = [ + "pyyaml > 3", + "psutil", + "dataregistry @ git+https://github.com/LSSTDESC/dataregistry", +] + all = [ "parsl >= 1.0.0", "flask", diff --git a/tests/create_registry_entries.py b/tests/create_registry_entries.py new file mode 100644 index 0000000..c31c38d --- /dev/null +++ b/tests/create_registry_entries.py @@ -0,0 +1,28 @@ +import os +import sys + +from dataregistry import DataRegistry + +_TEST_ROOT_DIR = "DataRegistry_data" + +# Make root dir +if not os.path.isdir(_TEST_ROOT_DIR): + os.makedirs(_TEST_ROOT_DIR) + +# Establish connection to database +datareg = DataRegistry(root_dir=_TEST_ROOT_DIR) + +# Add new entry. +datareg.Registrar.register_dataset( + "dm.txt", + "0.0.1", + verbose=True, + old_location="inputs/dm.txt" +) + +datareg.Registrar.register_dataset( + "fiducial_cosmology.txt", + "0.0.1", + verbose=True, + old_location="inputs/fiducial_cosmology.txt" +) diff --git a/tests/test_dataregistry.yml b/tests/test_dataregistry.yml new file mode 100644 index 0000000..c0a130b --- /dev/null +++ b/tests/test_dataregistry.yml @@ -0,0 +1,94 @@ +# Python modules that are imported to find +# stage classes. Any stages imported in these +# modules are automatically detected and their names can +# be used below +modules: ceci_example + +# The launcher to use +# These are defined in ceci/sites +launcher: + name: mini + interval: 0.5 + +# launcher: +# name: parsl +# # max_threads only referenced for local sites +# #log: parsl_log.txt + +# launcher: +# name: cwl +# launcher: cwltool +# dir: ./test/cwl + +site: + name: local + max_threads: 4 + # max_threads: 4 + # container: joezuntz/txpipe + # volume: $PWD:/opt/txpipe + + + +#site: +# name: nersc-interactive +# # Put the log for the overall pipeline infrastructure in this file: +# pipeline_log: log.txt + +# site: +# name: nersc-batch +# cpu_type: haswell +# queue: debug +# max_jobs: 2 +# account: m1727 +# walltime: "00:30:00" +# setup: /global/projecta/projectdirs/lsst/groups/WL/users/zuntz/setup-cori + + + +# The list of stages to run and the number of processors +# to use for each. +stages: + - name: WLGCSummaryStatistic + nprocess: 1 + threads_per_process: 2 + - name: SysMapMaker + nprocess: 1 + - name: shearMeasurementPipe + nprocess: 1 + - name: PZEstimationPipe + nprocess: 1 + - name: WLGCRandoms + nprocess: 1 + - name: WLGCSelector + nprocess: 1 + - name: SourceSummarizer + nprocess: 1 + - name: WLGCTwoPoint + nprocess: 1 + - name: WLGCCov + nprocess: 1 + +# Definitions of where to find inputs for the overall pipeline. +# Any input required by a pipeline stage that is not generated by +# a previous stage must be defined here. They are listed by tag. +inputs: + DM: + id: 1 + fiducial_cosmology: + id: 2 + +# Overall configuration file +config: ./tests/config.yml + +# If all the outputs for a stage already exist then do not re-run that stage +resume: False + +# Put all the output files in this directory: +output_dir: ./tests/outputs + +# Put the logs from the individual stages in this directory: +log_dir: ./tests/logs + +# Point to the root directory of the dataregistry +registry: + root_dir: ./tests/DataRegistry_data From 4def86ada14ab1b9f424bd2cdbb2c9d8311d7083 Mon Sep 17 00:00:00 2001 From: stuartmcalpine <67105723+stuartmcalpine@users.noreply.github.com> Date: Fri, 3 Nov 2023 14:55:40 +0100 Subject: [PATCH 11/18] Update ci.yml --- .github/workflows/ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8355e3b..a17e8d9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,7 +59,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - steps: - name: Checkout dataregistry repository uses: actions/checkout@v2 with: From ab2d0fb3eee8e232634296da4b6e5085bb9d7410 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 3 Nov 2023 15:16:20 +0100 Subject: [PATCH 12/18] Add creation of database to CI --- .github/workflows/ci.yml | 90 ++++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8355e3b..67630d9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,40 +10,62 @@ on: branches: [ master ] jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.7, 3.8, 3.9, "3.10"] - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - with: - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install - run: | - sudo apt-get update && sudo apt-get -y install libopenmpi-dev openmpi-bin graphviz graphviz-dev - pip install .[all] - - - name: Tests - run: | - pytest --cov=ceci - - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + # build: + # + # runs-on: ubuntu-latest + # strategy: + # matrix: + # python-version: [3.7, 3.8, 3.9, "3.10"] + # + # steps: + # - name: Checkout repository + # uses: actions/checkout@v2 + # with: + # submodules: true + # + # - name: Set up Python ${{ matrix.python-version }} + # uses: actions/setup-python@v2 + # with: + # python-version: ${{ matrix.python-version }} + # + # - name: Install + # run: | + # sudo apt-get update && sudo apt-get -y install libopenmpi-dev openmpi-bin graphviz graphviz-dev + # pip install .[all] + # + # - name: Tests + # run: | + # pytest --cov=ceci + # + # + # - name: Upload coverage to Codecov + # uses: codecov/codecov-action@v1 test_data_registry: runs-on: ubuntu-latest + + env: + DATAREG_CONFIG: "${{ github.workspace }}/config.txt" + + # Service containers to run with `runner-job` + services: + # Label used to access the service container + postgres: + # Docker Hub image + image: postgres + ports: + - 5432:5432 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_DB: desc_data_registry + # Set health checks to wait until postgres has started + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + strategy: matrix: python-version: [3.9] @@ -70,3 +92,11 @@ jobs: run: | sudo apt-get update && sudo apt-get -y install libopenmpi-dev openmpi-bin graphviz graphviz-dev pip install .[all] + + - name: Set up dataregistry + run: | + echo "sqlalchemy.url : postgresql://postgres:postgres@localhost:5432/desc_data_registry" > $DATAREG_CONFIG + cd dataregistry + python3 -m pip install . + python scripts/create_registry_db.py --config $DATAREG_CONFIG + From e3f123107a5f7b2308a3d9af91dce96560813644 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 3 Nov 2023 15:27:59 +0100 Subject: [PATCH 13/18] Add test to CI --- .github/workflows/ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 366d140..2e7d8e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -97,5 +97,7 @@ jobs: echo "sqlalchemy.url : postgresql://postgres:postgres@localhost:5432/desc_data_registry" > $DATAREG_CONFIG cd dataregistry python3 -m pip install . - python scripts/create_registry_db.py --config $DATAREG_CONFIG - + python3 scripts/create_registry_db.py --config $DATAREG_CONFIG + cd .. + python3 tests/create_registry_entries.py + ceci tests/test_dataregistry.yml From 56e365f5b5d60b930907f35bcac7c9fd773bf3ef Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 3 Nov 2023 15:33:11 +0100 Subject: [PATCH 14/18] Fix --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2e7d8e9..3857bd0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -98,6 +98,6 @@ jobs: cd dataregistry python3 -m pip install . python3 scripts/create_registry_db.py --config $DATAREG_CONFIG - cd .. - python3 tests/create_registry_entries.py - ceci tests/test_dataregistry.yml + cd ../tests + python3 create_registry_entries.py + ceci test_dataregistry.yml From d7ae29a40c29e29e0f5a8471e1f2d95f903f2fa6 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 3 Nov 2023 15:39:49 +0100 Subject: [PATCH 15/18] Update --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3857bd0..2269eff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -100,4 +100,5 @@ jobs: python3 scripts/create_registry_db.py --config $DATAREG_CONFIG cd ../tests python3 create_registry_entries.py - ceci test_dataregistry.yml + cd .. + ceci tests/test_dataregistry.yml From b26717df47bc8d599fd47ac9340a2c55db18af46 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 3 Nov 2023 15:45:02 +0100 Subject: [PATCH 16/18] Put other test back in --- .github/workflows/ci.yml | 60 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2269eff..942271d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,36 +10,36 @@ on: branches: [ master ] jobs: - # build: - # - # runs-on: ubuntu-latest - # strategy: - # matrix: - # python-version: [3.7, 3.8, 3.9, "3.10"] - # - # steps: - # - name: Checkout repository - # uses: actions/checkout@v2 - # with: - # submodules: true - # - # - name: Set up Python ${{ matrix.python-version }} - # uses: actions/setup-python@v2 - # with: - # python-version: ${{ matrix.python-version }} - # - # - name: Install - # run: | - # sudo apt-get update && sudo apt-get -y install libopenmpi-dev openmpi-bin graphviz graphviz-dev - # pip install .[all] - # - # - name: Tests - # run: | - # pytest --cov=ceci - # - # - # - name: Upload coverage to Codecov - # uses: codecov/codecov-action@v1 + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.7, 3.8, 3.9, "3.10"] + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install + run: | + sudo apt-get update && sudo apt-get -y install libopenmpi-dev openmpi-bin graphviz graphviz-dev + pip install .[all] + + - name: Tests + run: | + pytest --cov=ceci + + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 test_data_registry: From bab5b9dde6c982151f44f4309c43f3d6c2dff11e Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 3 Nov 2023 15:48:29 +0100 Subject: [PATCH 17/18] Fix --- .github/workflows/ci.yml | 46 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 942271d..5649dfe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,36 +10,36 @@ on: branches: [ master ] jobs: - build: + build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.7, 3.8, 3.9, "3.10"] + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.7, 3.8, 3.9, "3.10"] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - with: - submodules: true + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + submodules: true - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} - - name: Install - run: | - sudo apt-get update && sudo apt-get -y install libopenmpi-dev openmpi-bin graphviz graphviz-dev - pip install .[all] + - name: Install + run: | + sudo apt-get update && sudo apt-get -y install libopenmpi-dev openmpi-bin graphviz graphviz-dev + pip install .[all] - - name: Tests - run: | - pytest --cov=ceci + - name: Tests + run: | + pytest --cov=ceci - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 test_data_registry: From a7f921dadbf023987e1388ad88acc30458f9f02f Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 3 Nov 2023 15:52:37 +0100 Subject: [PATCH 18/18] Change python ranges in ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5649dfe..7ea7a5e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8, 3.9, "3.10"] + python-version: [3.8, 3.9, "3.10", "3.11", "3.12"] steps: - name: Checkout repository