Skip to content

Commit

Permalink
SCHEMA: Add file rule for phenotype tables (#1672)
Browse files Browse the repository at this point in the history
* TEST: Add example to test phenotypic data

* SCHEMA: Add file rule for phenotype

* ENH: Update stem rule to accept datatypes, glob stems

* TEST: Update expected regexes for stem rules

* TEST: Make phenotype an exception for now

* feat(metaschema): Allow datatypes in stem rules

* feat(regex): Capture paths and stems, and match string ends
  • Loading branch information
effigies authored May 23, 2024
1 parent 1d929e5 commit 2246cd8
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 11 deletions.
4 changes: 4 additions & 0 deletions src/metaschema.json
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,10 @@
"type": "object",
"properties": {
"level": { "enum": ["optional", "recommended", "required"] },
"datatypes": {
"type": "array",
"items": { "pattern": "^[a-z]+$" }
},
"stem": { "type": "string" },
"extensions": { "type": "array", "items": { "type": "string" } }
},
Expand Down
13 changes: 13 additions & 0 deletions src/schema/rules/files/common/tables.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,16 @@ sessions: # This file may only exist if session is present in the dataset.
- .json
entities:
subject: required

# Phenotype is a special case where there are no applicable entities, but a
# parent directory is specified. This most closely matches datatype in the current
# structure. We also require a stem that can match any value, as there are no
# constraints on the filename except extension.
phenotype:
  level: optional
  # Parent directory the files must live in; handled like a datatype.
  datatypes:
    - phenotype
  # Glob-style wildcard: any filename stem is accepted.
  stem: '*'
  extensions:
    - .tsv
    - .json
1 change: 1 addition & 0 deletions tools/schemacode/bidsschematools/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"qmri_tb1tfl", # fmap, _TB1TFL
"qmri_vfa", # derivatives
"ds000248", # .bidsignore
"fnirs_automaticity", # phenotypic
]
# Errors are described in the README of the respective datasets:
# https://github.com/bids-standard/bids-error-examples
Expand Down
5 changes: 5 additions & 0 deletions tools/schemacode/bidsschematools/data/tests/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ def test_rule_objects(schema_obj):

# Build a list of items mentioned in rules, but not found in objects.
if use not in object_values:
if (use, object_type) == ("phenotype", "datatypes"):
# Special case: phenotype is a top-level directory
# that acts like a datatype, but we don't want to
# define it that way in the glossary, currently.
continue
temp_path = path[:]
if is_list:
temp_path[-1] += f"[{i_use}]"
Expand Down
20 changes: 15 additions & 5 deletions tools/schemacode/bidsschematools/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
``schema.rules.files``.
"""

import fnmatch
import re
import typing as ty
from collections.abc import Mapping
Expand Down Expand Up @@ -125,7 +126,7 @@ def _entity_rule(rule: Mapping, schema: bst.types.Namespace):
ext_regex = f"(?P<extension>{ext_match})"

return {
"regex": "".join(dir_regex + entity_regex + [suffix_regex, ext_regex]),
"regex": "".join(dir_regex + entity_regex + [suffix_regex, ext_regex, r"\Z"]),
"mandatory": False,
}

Expand Down Expand Up @@ -170,15 +171,24 @@ def _sanitize_extension(ext: str) -> str:


def _stem_rule(rule: bst.types.Namespace):
    r"""Translate a stem rule into a filename regular expression.

    The rule's ``stem`` is interpreted as a glob pattern (``fnmatch`` syntax),
    so a literal stem matches itself and ``*`` matches any text.  If the rule
    lists ``datatypes``, they become a leading ``<datatype>/`` directory
    component.  The rule's ``extensions`` are appended as alternatives, and
    the pattern ends with ``\Z`` so nothing may follow the extension.

    Parameters
    ----------
    rule
        Mapping-like rule with ``stem``, ``extensions`` and ``level``
        attributes and an optional ``datatypes`` entry.

    Returns
    -------
    dict
        ``"regex"``: pattern string with the named groups ``datatype``
        (only when datatypes are given), ``stem`` and ``extension``;
        ``"mandatory"``: ``True`` when ``rule.level`` is ``"required"``.
    """
    # fnmatch.translate() ends its pattern with an end-of-string anchor
    # (``\Z``; newer Python releases may spell it ``\z``).  The extension
    # still has to follow the stem, so drop the anchor explicitly rather
    # than slicing off a fixed number of characters.
    stem_match = fnmatch.translate(rule.stem)
    for anchor in (r"\Z", r"\z"):
        if stem_match.endswith(anchor):
            stem_match = stem_match[: -len(anchor)]
            break
    stem_regex = f"(?P<stem>{stem_match})"

    # De-duplicate, then sort so the generated regex does not depend on
    # set iteration order.
    dtypes = sorted(set(rule.get("datatypes", ())))
    dir_regex = f"(?P<datatype>{'|'.join(dtypes)})/" if dtypes else ""

    ext_match = "|".join(_sanitize_extension(ext) for ext in rule.extensions)
    ext_regex = rf"(?P<extension>{ext_match})\Z"

    return {
        "regex": dir_regex + stem_regex + ext_regex,
        "mandatory": rule.level == "required",
    }


def _path_rule(rule: bst.types.Namespace):
    r"""Translate an exact-path rule into a regular expression.

    The rule's ``path`` is matched literally and captured in the ``path``
    group.  An optional ``/...`` remainder is allowed after it, and the
    pattern is anchored at the end of the string with ``\Z``.

    Returns a dict with the ``"regex"`` pattern string and ``"mandatory"``,
    which is ``True`` when ``rule.level`` is ``"required"``.
    """
    path_match = re.escape(rule.path)
    # Exact path matches may be files or opaque directories
    # Consider using rules.directories to identify opaque directories
    return {
        "regex": rf"(?P<path>{path_match})(?:/.*)?\Z",
        "mandatory": rule.level == "required",
    }


def regexify_filename_rules(
Expand Down
26 changes: 20 additions & 6 deletions tools/schemacode/bidsschematools/tests/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_entity_rule(schema_obj):
r"sub-(?P=subject)_"
r"(?:ses-(?P=session)_)?"
r"(?P<suffix>T1w)"
r"(?P<extension>\.nii)"
r"(?P<extension>\.nii)\Z"
),
"mandatory": False,
}
Expand All @@ -43,7 +43,7 @@ def test_entity_rule(schema_obj):
r"(?:sub-(?P=subject)_)?"
r"(?:ses-(?P=session)_)?"
r"(?P<suffix>T1w)"
r"(?P<extension>\.json)"
r"(?P<extension>\.json)\Z"
),
"mandatory": False,
}
Expand Down Expand Up @@ -84,28 +84,42 @@ def test_split_inheritance_rules():
def test_stem_rule():
    """Check the regexes generated for literal and wildcard stem rules."""
    # Literal stem; the empty extension allows a bare "README".
    rule = Namespace.build({"stem": "README", "level": "required", "extensions": ["", ".md"]})
    assert rules._stem_rule(rule) == {
        "regex": r"(?P<stem>(?s:README))(?P<extension>|\.md)\Z",
        "mandatory": True,
    }

    rule = Namespace.build(
        {"stem": "participants", "level": "optional", "extensions": [".tsv", ".json"]}
    )
    assert rules._stem_rule(rule) == {
        "regex": r"(?P<stem>(?s:participants))(?P<extension>\.tsv|\.json)\Z",
        "mandatory": False,
    }

    # Wildcard stem, with datatype
    rule = Namespace.build(
        {
            "stem": "*",
            "datatypes": ["phenotype"],
            "level": "optional",
            "extensions": [".tsv", ".json"],
        }
    )
    assert rules._stem_rule(rule) == {
        "regex": r"(?P<datatype>phenotype)/(?P<stem>(?s:.*))(?P<extension>\.tsv|\.json)\Z",
        "mandatory": False,
    }


def test_path_rule():
    """Check the regexes generated for exact-path rules."""
    rule = Namespace.build({"path": "dataset_description.json", "level": "required"})
    assert rules._path_rule(rule) == {
        "regex": r"(?P<path>dataset_description\.json)(?:/.*)?\Z",
        "mandatory": True,
    }

    rule = Namespace.build({"path": "LICENSE", "level": "optional"})
    assert rules._path_rule(rule) == {
        "regex": r"(?P<path>LICENSE)(?:/.*)?\Z",
        "mandatory": False,
    }


def test_regexify_all():
Expand Down

0 comments on commit 2246cd8

Please sign in to comment.