SCHEMA: Add file rule for phenotype tables (#1672)

* TEST: Add example to test phenotypic data
* SCHEMA: Add file rule for phenotype
* ENH: Update stem rule to accept datatypes, glob stems
* TEST: Update expected regexes for stem rules
* TEST: Make phenotype an exception for now
* feat(metaschema): Allow datatypes in stem rules
* feat(regex): Capture paths and stems, and match string ends
bids-standard · May 23, 2024 · 2246cd8 · 2246cd8
1 parent 1d929e5
commit 2246cd8
Show file tree

Hide file tree

Showing 6 changed files with 58 additions and 11 deletions.
diff --git a/src/metaschema.json b/src/metaschema.json
@@ -751,6 +751,10 @@
       "type": "object",
       "properties": {
         "level": { "enum": ["optional", "recommended", "required"] },
+        "datatypes": {
+          "type": "array",
+          "items": { "pattern": "^[a-z]+$" }
+        },
         "stem": { "type": "string" },
         "extensions": { "type": "array", "items": { "type": "string" } }
       },

diff --git a/src/schema/rules/files/common/tables.yaml b/src/schema/rules/files/common/tables.yaml
@@ -30,3 +30,16 @@ sessions: # This file may only exist if session is present in the dataset.
     - .json
   entities:
     subject: required
+
+# Phenotype is a special case: no entities apply, but a parent directory is
+# specified. In the current structure, this most closely matches a datatype.
+# The stem is a wildcard that matches any value, as the filename is
+# unconstrained apart from its extension.
+phenotype:
+  level: optional
+  datatypes:
+    - phenotype
+  stem: '*'
+  extensions:
+    - .tsv
+    - .json
diff --git a/tools/schemacode/bidsschematools/conftest.py b/tools/schemacode/bidsschematools/conftest.py
@@ -25,6 +25,7 @@
     "qmri_tb1tfl",  # fmap, _TB1TFL
     "qmri_vfa",  # derivatives
     "ds000248",  # .bidsignore
+    "fnirs_automaticity",  # phenotypic
 ]
 # Errors are described in the README of the respective datasets:
 # https://github.com/bids-standard/bids-error-examples

diff --git a/tools/schemacode/bidsschematools/data/tests/test_rules.py b/tools/schemacode/bidsschematools/data/tests/test_rules.py
@@ -87,6 +87,11 @@ def test_rule_objects(schema_obj):
 
                 # Build a list of items mentioned in rules, but not found in objects.
                 if use not in object_values:
+                    if (use, object_type) == ("phenotype", "datatypes"):
+                        # Special case: phenotype is a top-level directory
+                        # that acts like a datatype, but we don't want to
+                        # define it that way in the glossary, currently.
+                        continue
                     temp_path = path[:]
                     if is_list:
                         temp_path[-1] += f"[{i_use}]"

diff --git a/tools/schemacode/bidsschematools/rules.py b/tools/schemacode/bidsschematools/rules.py
@@ -4,6 +4,7 @@
 ``schema.rules.files``.
 """
 
+import fnmatch
 import re
 import typing as ty
 from collections.abc import Mapping
@@ -125,7 +126,7 @@ def _entity_rule(rule: Mapping, schema: bst.types.Namespace):
     ext_regex = f"(?P<extension>{ext_match})"
 
     return {
-        "regex": "".join(dir_regex + entity_regex + [suffix_regex, ext_regex]),
+        "regex": "".join(dir_regex + entity_regex + [suffix_regex, ext_regex, r"\Z"]),
         "mandatory": False,
     }
 
@@ -170,15 +171,24 @@ def _sanitize_extension(ext: str) -> str:
 
 
 def _stem_rule(rule: bst.types.Namespace):
-    stem_regex = re.escape(rule.stem)
+    # fnmatch.translate() appends \Z (end of string); strip it, as the extension follows
+    stem_match = fnmatch.translate(rule.stem)[:-2]
+    stem_regex = f"(?P<stem>{stem_match})"
+
+    dtypes = set(rule.get("datatypes", ()))
+    dir_regex = f"(?P<datatype>{'|'.join(dtypes)})/" if dtypes else ""
+
     ext_match = "|".join(_sanitize_extension(ext) for ext in rule.extensions)
-    ext_regex = f"(?P<extension>{ext_match})"
+    ext_regex = rf"(?P<extension>{ext_match})\Z"
 
-    return {"regex": stem_regex + ext_regex, "mandatory": rule.level == "required"}
+    return {"regex": dir_regex + stem_regex + ext_regex, "mandatory": rule.level == "required"}
 
 
 def _path_rule(rule: bst.types.Namespace):
-    return {"regex": re.escape(rule.path), "mandatory": rule.level == "required"}
+    path_match = re.escape(rule.path)
+    # An exact path may be a file or an opaque directory, so also match anything under it.
+    # Consider using rules.directories to identify opaque directories.
+    return {"regex": rf"(?P<path>{path_match})(?:/.*)?\Z", "mandatory": rule.level == "required"}
 
 
 def regexify_filename_rules(

diff --git a/tools/schemacode/bidsschematools/tests/test_rules.py b/tools/schemacode/bidsschematools/tests/test_rules.py
@@ -21,7 +21,7 @@ def test_entity_rule(schema_obj):
             r"sub-(?P=subject)_"
             r"(?:ses-(?P=session)_)?"
             r"(?P<suffix>T1w)"
-            r"(?P<extension>\.nii)"
+            r"(?P<extension>\.nii)\Z"
         ),
         "mandatory": False,
     }
@@ -43,7 +43,7 @@ def test_entity_rule(schema_obj):
             r"(?:sub-(?P=subject)_)?"
             r"(?:ses-(?P=session)_)?"
             r"(?P<suffix>T1w)"
-            r"(?P<extension>\.json)"
+            r"(?P<extension>\.json)\Z"
         ),
         "mandatory": False,
     }
@@ -84,28 +84,42 @@ def test_split_inheritance_rules():
 def test_stem_rule():
     rule = Namespace.build({"stem": "README", "level": "required", "extensions": ["", ".md"]})
     assert rules._stem_rule(rule) == {
-        "regex": r"README(?P<extension>|\.md)",
+        "regex": r"(?P<stem>(?s:README))(?P<extension>|\.md)\Z",
         "mandatory": True,
     }
 
     rule = Namespace.build(
         {"stem": "participants", "level": "optional", "extensions": [".tsv", ".json"]}
     )
     assert rules._stem_rule(rule) == {
-        "regex": r"participants(?P<extension>\.tsv|\.json)",
+        "regex": r"(?P<stem>(?s:participants))(?P<extension>\.tsv|\.json)\Z",
+        "mandatory": False,
+    }
+
+    # Wildcard stem, with datatype
+    rule = Namespace.build(
+        {
+            "stem": "*",
+            "datatypes": ["phenotype"],
+            "level": "optional",
+            "extensions": [".tsv", ".json"],
+        }
+    )
+    assert rules._stem_rule(rule) == {
+        "regex": r"(?P<datatype>phenotype)/(?P<stem>(?s:.*))(?P<extension>\.tsv|\.json)\Z",
         "mandatory": False,
     }
 
 
 def test_path_rule():
     rule = Namespace.build({"path": "dataset_description.json", "level": "required"})
     assert rules._path_rule(rule) == {
-        "regex": r"dataset_description\.json",
+        "regex": r"(?P<path>dataset_description\.json)(?:/.*)?\Z",
         "mandatory": True,
     }
 
     rule = Namespace.build({"path": "LICENSE", "level": "optional"})
-    assert rules._path_rule(rule) == {"regex": "LICENSE", "mandatory": False}
+    assert rules._path_rule(rule) == {"regex": r"(?P<path>LICENSE)(?:/.*)?\Z", "mandatory": False}
 
 
 def test_regexify_all():