From eaeb8c9e7024e566518b11cbc4004b3d10a7329d Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 29 Jun 2022 18:06:18 -0400 Subject: [PATCH 1/3] BF(workaround): loop through namespaces while validating nwb To overcome problems like those presented in https://github.com/dandi/helpdesk/discussions/43 this introduces the solution proposed by @orugbel in https://github.com/dandi/dandi-cli/issues/917#issuecomment-1045154252 Unfortunately there has been no release of pynwb with that function yet, so we are doomed to duplicate code and do it "manually" here for now Closes https://github.com/dandi/dandi-cli/issues/917 --- dandi/pynwb_utils.py | 73 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/dandi/pynwb_utils.py b/dandi/pynwb_utils.py index c3295099f..c7844db3b 100644 --- a/dandi/pynwb_utils.py +++ b/dandi/pynwb_utils.py @@ -319,6 +319,76 @@ def rename_nwb_external_files(metadata: List[dict], dandiset_path: str) -> None: container.external_file[no] = str(name_new) +# borrowed from +# https://github.com/NeurodataWithoutBorders/pynwb/blob/745aaf26fa56958254e1d22a73d4c962c8074332/src/pynwb/validate.py#L29 +# which is part of the https://github.com/NeurodataWithoutBorders/pynwb/pull/1432 +# and needed to overcome errors like in https://github.com/dandi/helpdesk/discussions/43 +def get_cached_namespaces_to_validate(path): + """ + Determine the most specific namespace(s) (i.e., extensions) that are cached in the given + NWB file that should be used for validation. + + Example + ------- + + The following example illustrates how we can use this function to validate against namespaces + cached in a file. 
This is useful, e.g., when a file was created using an extension + + >>> from pynwb import validate + >>> from pynwb.validate import get_cached_namespaces_to_validate + >>> path = "my_nwb_file.nwb" + >>> validate_namespaces, manager, cached_namespaces = get_cached_namespaces_to_validate(path) + >>> with NWBHDF5IO(path, "r", manager=manager) as reader: + >>> errors = [] + >>> for ns in validate_namespaces: + >>> errors += validate(io=reader, namespace=ns) + + :param path: Path for the NWB file + :return: Tuple with: + - List of strings with the most specific namespace(s) to use for validation. + - BuildManager object for opening the file for validation + - Dict with the full result from NWBHDF5IO.load_namespaces + """ + from hdmf.build import BuildManager, TypeMap + from hdmf.spec import NamespaceCatalog + from pynwb.spec import NWBDatasetSpec, NWBGroupSpec, NWBNamespace + + catalog = NamespaceCatalog(NWBGroupSpec, NWBDatasetSpec, NWBNamespace) + ns_deps = NWBHDF5IO.load_namespaces(catalog, path) + # determine which namespaces are the most specific (i.e. 
extensions) and validate against those + s = set(ns_deps.keys()) + for k in ns_deps: + s -= ns_deps[k].keys() + # TODO remove this workaround for issue + # https://github.com/NeurodataWithoutBorders/pynwb/issues/1357 + if "hdmf-experimental" in s: + s.remove("hdmf-experimental") # remove validation of hdmf-experimental for now + namespaces = list(sorted(s)) + + if len(namespaces) > 0: + tm = TypeMap(catalog) + manager = BuildManager(tm) + else: + manager = None + + return namespaces, manager, ns_deps + + +def validate_namespaces(path: Union[str, Path]): + """pynwb.validate which validates each validatable namespace separately + + Proposed by @orugbel in https://github.com/dandi/dandi-cli/issues/917#issuecomment-1045154252 + """ + namespaces_validate, manager, namespaces_cached = get_cached_namespaces_to_validate( + path + ) + with NWBHDF5IO(path, "r", manager=manager) as reader: + errors = [] + for ns in namespaces_validate: + errors += validate(io=reader, namespace=ns) + return errors + + @validate_cache.memoize_path def validate(path: Union[str, Path], devel_debug: bool = False) -> List[str]: """Run validation on a file and return errors @@ -333,8 +403,7 @@ def validate(path: Union[str, Path], devel_debug: bool = False) -> List[str]: path = str(path) # Might come in as pathlib's PATH errors: List[str] try: - with pynwb.NWBHDF5IO(path, "r", load_namespaces=True) as reader: - errors = pynwb.validate(reader) + errors = validate_namespaces(path) lgr.warning( "pynwb validation errors for %s: %s", path, From 0d4fb0c46c03be2a67eb3dae0ae7b1b3e996d453 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 30 Jun 2022 08:56:34 -0400 Subject: [PATCH 2/3] Update dandi/pynwb_utils.py Co-authored-by: John T. 
Wodder II --- dandi/pynwb_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandi/pynwb_utils.py b/dandi/pynwb_utils.py index c7844db3b..62d9eb9bd 100644 --- a/dandi/pynwb_utils.py +++ b/dandi/pynwb_utils.py @@ -374,7 +374,7 @@ def get_cached_namespaces_to_validate(path): return namespaces, manager, ns_deps -def validate_namespaces(path: Union[str, Path]): +def validate_namespaces(path: Union[str, Path]) -> List[str]: """pynwb.validate which validates each validatable namespace separately Proposed by @orugbel in https://github.com/dandi/dandi-cli/issues/917#issuecomment-1045154252 From 259ae0d1dbb698f8cbe6e8e06b0b6166ad4ff10e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 30 Jun 2022 11:51:10 -0400 Subject: [PATCH 3/3] Code improvements and bugfix from code review Co-authored-by: John T. Wodder II --- dandi/pynwb_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dandi/pynwb_utils.py b/dandi/pynwb_utils.py index 62d9eb9bd..d578b26e5 100644 --- a/dandi/pynwb_utils.py +++ b/dandi/pynwb_utils.py @@ -361,9 +361,8 @@ def get_cached_namespaces_to_validate(path): s -= ns_deps[k].keys() # TODO remove this workaround for issue # https://github.com/NeurodataWithoutBorders/pynwb/issues/1357 - if "hdmf-experimental" in s: - s.remove("hdmf-experimental") # remove validation of hdmf-experimental for now - namespaces = list(sorted(s)) + s.discard("hdmf-experimental") # remove validation of hdmf-experimental for now + namespaces = sorted(s) if len(namespaces) > 0: tm = TypeMap(catalog) @@ -385,7 +384,7 @@ def validate_namespaces(path: Union[str, Path]) -> List[str]: with NWBHDF5IO(path, "r", manager=manager) as reader: errors = [] for ns in namespaces_validate: - errors += validate(io=reader, namespace=ns) + errors += pynwb.validate(io=reader, namespace=ns) return errors