From c2fb996c9d4b8cf9bddb91284093e4fcca62b6e2 Mon Sep 17 00:00:00 2001
From: Dorota Jarecka
Date: Wed, 5 Oct 2022 22:24:02 -0400
Subject: [PATCH 1/4] adding draft of dandi search command

---
 dandi/cli/cmd_search.py | 141 ++++++++++++++++++++++++++++++++++++++++
 dandi/cli/command.py | 2 +
 2 files changed, 143 insertions(+)
 create mode 100644 dandi/cli/cmd_search.py

diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py
new file mode 100644
index 000000000..9eb129840
--- /dev/null
+++ b/dandi/cli/cmd_search.py
@@ -0,0 +1,141 @@
+from pathlib import Path
+
import click
import pandas as pd
import stardog
+
from .base import map_to_click_exceptions
+
DANDISETS_FIELDS = {
    "approach": ["apr", "?as dandi:approach / schema:name ?apr ."],
    "species_id": ["sid", "?as dandi:species / schema:identifier ?sid ."],
    "species_name": ["snm", "?as dandi:species / schema:name ?snm ."],
}

# The use of f-strings apparently makes this not a proper docstring, and so
# click doesn't use it unless we explicitly assign it to `help`:


@click.command(help="Search TODO")
@click.option(
    "-F",
    "--file",
    help="Comma-separated list of fields to display. "
    "An empty value to trigger a list of "
    "available fields to be printed out",
)
@click.option(
    "-t",
    "--search_type",
    help="Type of the search.",
    type=click.Choice(["dandisets", "assets"]),
)
@click.option(
    "--check_fields",
    help="Field name for dandisets search",
    type=click.Choice(DANDISETS_FIELDS.keys()),
    multiple=True,
)
@click.option(
    "--filter_fields",
    help="Field name for dandisets search",
    type=(str, str),
    multiple=True,
)
@click.option(
    "-f",
    "--format",
    help="Choose the format for output. TODO",
    type=click.Choice(["stdout", "csv"]),
    default="stdout",
)
@click.option(
    "--number_of_lines",
    help="Number of lines of output that will be printed",
    default=10,
)
@click.option(
    "-d",
    "--database_name",
    help="Database name",
    default="dandisets_new",
)
@map_to_click_exceptions
def search(
    file=None,
    search_type=None,
    check_fields=None,
    filter_fields=None,
    format="stdout",
    number_of_lines=10,
    database_name="dandisets_new",
):

    if file and search_type:
        raise Exception("file and type are mutually exclusive options")

    conn_details = {
        "endpoint": "https://search.dandiarchive.org:5820",
        "username": "anonymous",
        "password": "anonymous",
    }

    if file:
        filepath = Path(file)
        with filepath.open() as f:
            query_str = f.read()
    elif search_type == "dandisets":
        if not check_fields and not filter_fields:
            raise Exception(
                "check_fields or filter_fields is required if search type is dandisets"
            )
        elif filter_fields:
            for el in filter_fields:
                if el[0] not in DANDISETS_FIELDS:
                    raise Exception(
                        f"field {el[0]} used in filter_fields, but only {DANDISETS_FIELDS} allowed"
                    )
        query_str = create_dandiset_query(check_fields, filter_fields)
    else:
        raise NotImplementedError

    with stardog.Connection(database_name, **conn_details) as conn:
        results = conn.select(query_str)
    res_df = results2df(results, number_of_lines)

    if format == "stdout":
        print(res_df)
    else:
        raise NotImplementedError("only stdout format implemented for now")

    # errors = defaultdict(list)  # problem: [] paths
    #
    # if errors:
    #     lgr.warning(
    #         "Failed to operate on some paths (empty records were listed):\n %s",
    #         "\n ".join("%s: %d paths" % (k, len(v)) for k, v in errors.items()),
    #     )


def results2df(results, limit=10):
    res_lim = 
results["results"]["bindings"][:limit] + res_val_l = [dict((k, v["value"]) for k, v in res.items()) for res in res_lim] + return pd.DataFrame(res_val_l) + + +def create_dandiset_query(check_fields, filter_fields): + var = "" + for el in check_fields: + var += f" ?{DANDISETS_FIELDS[el][0]}" + + query_str = ( + f"SELECT DISTINCT ?d{var} WHERE \n" "{ \n" " ?d dandi:assetsSummary ?as . \n" + ) + for el in check_fields: + query_str += f" {DANDISETS_FIELDS[el][1]} \n" + + for (key, val) in filter_fields: + query_str += f'FILTER (?{DANDISETS_FIELDS[key][0]} = "{val}") \n' + query_str += "}" + # print(query_str) + return query_str diff --git a/dandi/cli/command.py b/dandi/cli/command.py index a3e0770b6..cd95eb87e 100644 --- a/dandi/cli/command.py +++ b/dandi/cli/command.py @@ -145,6 +145,7 @@ def main(ctx, log_level, pdb=False): from .cmd_ls import ls # noqa: E402 from .cmd_move import move # noqa: E402 from .cmd_organize import organize # noqa: E402 +from .cmd_search import search # noqa: E402 from .cmd_service_scripts import service_scripts # noqa: E402 from .cmd_shell_completion import shell_completion # noqa: E402 from .cmd_upload import upload # noqa: E402 @@ -158,6 +159,7 @@ def main(ctx, log_level, pdb=False): ls, move, organize, + search, service_scripts, shell_completion, upload, From bd210e01876a1eb9919e21d0de5bcf9ce5058fc2 Mon Sep 17 00:00:00 2001 From: Dorota Jarecka Date: Sun, 6 Nov 2022 23:05:09 -0500 Subject: [PATCH 2/4] adding assets search; updates to search syntax: allowing for multiple fields in select_fields, allowing for multiple values and ranges in filter fields; adding tests --- dandi/cli/cmd_search.py | 126 +++++++++++++++++----- dandi/cli/tests/test_search.py | 192 +++++++++++++++++++++++++++++++++ dandi/support/pyout.py | 6 +- 3 files changed, 294 insertions(+), 30 deletions(-) create mode 100644 dandi/cli/tests/test_search.py diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py index 9eb129840..e7f25f18b 100644 --- a/dandi/cli/cmd_search.py +++ b/dandi/cli/cmd_search.py @@ -6,14 +6,39 @@ from .base import map_to_click_exceptions +# supported fields +# TODO: adding more DANDISETS_FIELDS = { "approach": ["apr", "?as dandi:approach / schema:name ?apr ."], "species_id": ["sid", "?as dandi:species / schema:identifier ?sid ."], "species_name": ["snm", "?as dandi:species / schema:name ?snm ."], } -# The use of f-strings apparently makes this not a proper docstring, and so -# click doesn't use it unless we explicitly assign it to `help`: +ASSETS_FIELDS = { + "size": ["size", "?asset schema:contentSize ?size ."], + "format": ["format", "?asset schema:encodingFormat ?format ."], +} + + +def parse_validate(ctx, param, value): + value_parse = [] + # parsing elements that have multiple comma-separated values + for el in value: + value_parse += el.split(",") + if param.name == "select_fields": + if ctx.params["search_type"] == "dandisets": + choice_list = DANDISETS_FIELDS.keys() + elif ctx.params["search_type"] == "assets": + choice_list = ASSETS_FIELDS.keys() + else: + choice_list = None + else: + choice_list = None + # checking if all values are in the list of possible choices + for el in value_parse: + if choice_list and el not in choice_list: + ctx.fail(f"{el} is not in the list: {choice_list}") + return value_parse @click.command(help="Search TODO") @@ -31,19 +56,20 @@ type=click.Choice(["dandisets", "assets"]), ) @click.option( - "--check_fields", + "-s", + "--select_fields", help="Field name for dandisets search", - type=click.Choice(DANDISETS_FIELDS.keys()), + 
callback=parse_validate, multiple=True, ) @click.option( + "-f", "--filter_fields", help="Field name for dandisets search", type=(str, str), multiple=True, ) @click.option( - "-f", "--format", help="Choose the format for output. TODO", type=click.Choice(["stdout", "csv"]), @@ -64,7 +90,7 @@ def search( file=None, search_type=None, - check_fields=None, + select_fields=None, filter_fields=None, format="stdout", number_of_lines=10, @@ -84,18 +110,22 @@ def search( filepath = Path(file) with filepath.open() as f: query_str = f.read() - elif search_type == "dandisets": - if not check_fields and not filter_fields: + elif search_type in ["dandisets", "assets"]: + if not select_fields: raise Exception( - "check_fields or filter_fields is required if search type is dandisets" + f"select_fields is required if search type is {search_type}" ) - elif filter_fields: + if filter_fields: for el in filter_fields: - if el[0] not in DANDISETS_FIELDS: + if el[0] not in select_fields: raise Exception( - f"field {el[0]} used in filter_fields, but only {DANDISETS_FIELDS} allowed" + f"field {el[0]} used in filter_fields, " + f"but select fields contain {select_fields}" ) - query_str = create_dandiset_query(check_fields, filter_fields) + if search_type == "dandisets": + query_str = create_dandisets_query(select_fields, filter_fields) + elif search_type == "assets": + query_str = create_assets_query(select_fields, filter_fields) else: raise NotImplementedError @@ -108,14 +138,6 @@ def search( else: raise NotImplementedError("only stdout format implemented for now") - # errors = defaultdict(list) # problem: [] paths - # - # if errors: - # lgr.warning( - # "Failed to operate on some paths (empty records were listed):\n %s", - # "\n ".join("%s: %d paths" % (k, len(v)) for k, v in errors.items()), - # ) - def results2df(results, limit=10): res_lim = results["results"]["bindings"][:limit] @@ -123,19 +145,69 @@ def results2df(results, limit=10): return pd.DataFrame(res_val_l) -def create_dandiset_query(check_fields, filter_fields): +def filter_query(filter_fields, fields_dict): + """creating filter part for the queries""" + filter_str = "" + for (key, val) in filter_fields: + if val[0] == "(" and val[-1] == ")": + val = val[1:-1].split(",") + if len(val) != 2: + raise ValueError( + "If value for filter is a tuple, it has to have 2 elements " + ) + else: + min_val = val[0].strip() + max_val = val[1].strip() + if max_val and min_val: + filter_str += ( + f"FILTER (?{fields_dict[key][0]} > {min_val} " + f"&& ?{fields_dict[key][0]} < {max_val}) \n" + ) + elif max_val: + filter_str += f"FILTER (?{fields_dict[key][0]} < {max_val}) \n" + elif min_val: + filter_str += f"FILTER (?{fields_dict[key][0]} > {min_val}) \n" + else: + val = val.split(",") + cond_str = f'?{fields_dict[key][0]} = "{val[0]}"' + for el in val[1:]: + cond_str += f' || ?{fields_dict[key][0]} = "{el}"' + filter_str += f"FILTER ({cond_str}) \n" + return filter_str + + +def create_dandisets_query(select_fields, filter_fields): + """Creating a query for dandisets search""" var = "" - for el in check_fields: + for el in select_fields: var += f" ?{DANDISETS_FIELDS[el][0]}" query_str = ( f"SELECT DISTINCT ?d{var} WHERE \n" "{ \n" " ?d dandi:assetsSummary ?as . 
\n" ) - for el in check_fields: + for el in select_fields: query_str += f" {DANDISETS_FIELDS[el][1]} \n" + query_str += filter_query(filter_fields, DANDISETS_FIELDS) + query_str += "}" + return query_str - for (key, val) in filter_fields: - query_str += f'FILTER (?{DANDISETS_FIELDS[key][0]} = "{val}") \n' + +def create_assets_query(select_fields, filter_fields): + """Creating a query for assets search""" + var = "" + for el in select_fields: + var += f" ?{ASSETS_FIELDS[el][0]}" + + query_str = ( + f"SELECT DISTINCT ?asset ?d_id ?path{var} WHERE \n" + "{ \n" + " ?asset rdf:type dandi:Asset . \n" + " ?d prov:hasMember ?asset . \n" + " ?d schema:identifier ?d_id . \n" + " ?asset dandi:path ?path . \n" + ) + for el in select_fields: + query_str += f" {ASSETS_FIELDS[el][1]} \n" + query_str += filter_query(filter_fields, ASSETS_FIELDS) query_str += "}" - # print(query_str) return query_str diff --git a/dandi/cli/tests/test_search.py b/dandi/cli/tests/test_search.py new file mode 100644 index 000000000..bafc4847f --- /dev/null +++ b/dandi/cli/tests/test_search.py @@ -0,0 +1,192 @@ +from click.testing import CliRunner +import pytest + +from dandi.tests.skip import mark + +from ..command import search + +pytestmark = mark.skipif_no_network + + +@pytest.mark.parametrize( + "select_fields, print_fields", + [ + ("approach", ["app"]), + ("species_name", ["snm"]), + ], +) +def test_search_dandiset_select_fields(select_fields, print_fields): + """using select_fields option with single or multiple comma-separated values""" + runner = CliRunner() + r = runner.invoke(search, ["-t", "dandisets", "--select_fields", select_fields]) + assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}" + out = r.stdout + for fld in print_fields: + assert fld in out, f"{fld} is not in the output: {out}" + + +@pytest.mark.parametrize( + "select_fields_mult", + [ + (["--select_fields", "approach", "-s", "species_name"]), + (["--select_fields", "approach,species_name"]), + ], +) +def test_search_dandiset_select_fields_mult(select_fields_mult): + """using select_fields option multiple times""" + runner = CliRunner() + r = runner.invoke(search, ["-t", "dandisets"] + select_fields_mult) + assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}" + out = r.stdout + for fld in ["apr", "snm"]: + assert fld in out, f"field {fld} is not in the output" + + +@pytest.mark.parametrize( + "filter_fields", [["species_name", "Human"], ["approach", "behavioral approach"]] +) +def test_search_dandiset_check_filter(filter_fields): + """using select_fields option multiple times""" + runner = CliRunner() + r = runner.invoke( + search, + [ + "-t", + "dandisets", + "--select_fields", + "approach,species_name", + "--filter_fields", + ] + + filter_fields, + ) + assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}" + out = r.stdout + for ln in out.split("\n")[1:]: + if ln == "": + break + assert filter_fields[1] in ln, f"value {filter_fields[1]} is not in the output" + + +def test_search_dandiset_check_filter_mult(): + """using a filter option multiple times""" + runner = CliRunner() + r = runner.invoke( + search, + [ + "-t", + "dandisets", + "--select_fields", + "approach,species_name", + "-f", + "species_name", + "Human", + "-f", + "approach", + "behavioral approach", + ], + ) + assert r.exit_code == 0, f"Exited abnormally. 
out={r.stdout}"
    out = r.stdout
    for ln in out.split("\n")[1:-1]:
        assert "Human" in ln, "Human is not in the output"
        assert "behavioral approach" in ln, "behavioral approach is not in the output"


def test_search_dandiset_check_filter_list():
    """using a comma-separated list of values in a filter option"""
    runner = CliRunner()
    r = runner.invoke(
        search,
        [
            "-t",
            "dandisets",
            "--select_fields",
            "approach,species_name",
            "-f",
            "species_name",
            "Human,House mouse",
        ],
    )
    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
    out = r.stdout
    for ln in out.split("\n")[1:-1]:
        assert "Human" in ln or "House mouse" in ln, "expected Human or House mouse"


@pytest.mark.parametrize(
    "select_fields, print_fields",
    [("format", ["format"]), ("size", ["size"]), ("size,format", ["size", "format"])],
)
def test_search_assets_select_fields(select_fields, print_fields):
    """using select_fields option with single or multiple comma-separated values"""
    runner = CliRunner()
    r = runner.invoke(search, ["-t", "assets", "--select_fields", select_fields])
    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
    out = r.stdout
    for fld in print_fields:
        assert fld in out, f"{fld} is not in the output: {out}"


@pytest.mark.parametrize(
    "filter_fields",
    [
        ["format", "application/x-nwb"],
    ],
)
def test_search_asset_check_filter(filter_fields):
    """using select_fields and filter option in assets search"""
    runner = CliRunner()
    r = runner.invoke(
        search,
        ["-t", "assets", "--select_fields", "format,size", "--filter_fields"]
        + filter_fields,
    )
    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
    out = r.stdout
    for ln in out.split("\n")[1:]:
        if ln == "":
            break
        assert filter_fields[1] in ln, f"value {filter_fields[1]} is not in the output"


@pytest.mark.parametrize(
    "size_range", [(7 * 10e9, 9 * 10e9), "(, 9*10e9)", "(7*10e9, )"]
)
def test_search_asset_check_filter_range(size_range):
    """using range in a filter option for assets search"""
    runner = CliRunner()
    r = runner.invoke(
        search,
        [
            "-t",
            "assets",
            "--select_fields",
            "format,size",
            "--filter_fields",
            "size",
            f"{size_range}",
        ],
    )
    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
    out = r.stdout
    assert "size" in out.split("\n")[0]
    assert len(out.split("\n")) > 1


def test_search_from_file(tmpdir):
    """using search command with a file that contains an arbitrary SPARQL query"""
    query_file = tmpdir / "query.txt"
    query = """
    SELECT DISTINCT ?apr WHERE
    {
    ?as dandi:approach / schema:name ?apr
    }
    """
    with query_file.open("w") as f:
        f.write(query)
    runner = CliRunner()
    r = runner.invoke(search, ["-F", query_file])
    assert r.exit_code == 0, f"Exited abnormally. 
out={r.stdout}"
    out = r.stdout
    assert "apr" in out.split("\n")[0]
    assert len(out.split("\n")) > 1
diff --git a/dandi/support/pyout.py b/dandi/support/pyout.py
index 90b422428..ebc51d3a1 100644
--- a/dandi/support/pyout.py
+++ b/dandi/support/pyout.py
@@ -36,7 +36,7 @@ def naturalsize(v):
     return humanize.naturalsize(v)


-def datefmt(v, fmt=u"%Y-%m-%d/%H:%M:%S"):
+def datefmt(v, fmt="%Y-%m-%d/%H:%M:%S"):
     if isinstance(v, datetime.datetime):
         return v.strftime(fmt)
     else:
@@ -71,8 +71,8 @@ def counts(values):
         color=dict(
             interval=[
                 [0, 1024, "blue"],
-                [1024, 1024 ** 2, "green"],
-                [1024 ** 2, None, "red"],
+                [1024, 1024**2, "green"],
+                [1024**2, None, "red"],
             ]
         ),
         aggregate=lambda x: naturalsize(sum(x)),

From 2852e4208c147a1e15f33d0a8e94caaa58998c10 Mon Sep 17 00:00:00 2001
From: Dorota Jarecka
Date: Sun, 6 Nov 2022 23:21:18 -0500
Subject: [PATCH 3/4] removing stardog and adding SPARQLWrapper library to read from DB

---
 dandi/cli/cmd_search.py | 16 +++++++---------
 setup.cfg | 1 +
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py
index e7f25f18b..5a3811e36 100644
--- a/dandi/cli/cmd_search.py
+++ b/dandi/cli/cmd_search.py
@@ -1,8 +1,8 @@
 from pathlib import Path

+from SPARQLWrapper import JSON, SPARQLWrapper
 import click
 import pandas as pd
-import stardog

 from .base import map_to_click_exceptions

@@ -100,12 +100,6 @@ def search(
     if file and search_type:
         raise Exception("file and type are mutually exclusive options")

-    conn_details = {
-        "endpoint": "https://search.dandiarchive.org:5820",
-        "username": "anonymous",
-        "password": "anonymous",
-    }
-
     if file:
         filepath = Path(file)
         with filepath.open() as f:
@@ -129,8 +123,12 @@ def search(
     else:
         raise NotImplementedError

-    with stardog.Connection(database_name, **conn_details) as conn:
-        results = conn.select(query_str)
+    endpoint = "https://search.dandiarchive.org:5820/dandisets_new/query"
+    sparql = SPARQLWrapper(endpoint)
+    sparql.setCredentials("anonymous", "anonymous")
+    sparql.setReturnFormat(JSON)
+    sparql.setQuery(query_str)
+    results = sparql.queryAndConvert()
     res_df = results2df(results, number_of_lines)

     if format == "stdout":
diff --git a/setup.cfg b/setup.cfg
index 0de477174..b26ce0254 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -57,6 +57,7 @@ install_requires =
     requests ~= 2.20
     ruamel.yaml >=0.15, <1
     semantic-version
+    sparqlwrapper
     tenacity
     zarr ~= 2.10
 zip_safe = False

From 447c8050392231aa27c0563fe413646ef6115f94 Mon Sep 17 00:00:00 2001
From: Dorota Jarecka
Date: Mon, 28 Nov 2022 13:49:55 -0500
Subject: [PATCH 4/4] adding age to the assets fields

---
 dandi/cli/cmd_search.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py
index 5a3811e36..b3c4e6abc 100644
--- a/dandi/cli/cmd_search.py
+++ b/dandi/cli/cmd_search.py
@@ -17,6 +17,7 @@ ASSETS_FIELDS = {
     "size": ["size", "?asset schema:contentSize ?size ."],
     "format": ["format", "?asset schema:encodingFormat ?format ."],
+    "age": ["age", "?asset prov:wasAttributedTo / dandi:age / schema:value ?age ."],
 }
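
Usage sketch (added for illustration; not part of the patches above): going by the
options and the test values introduced in this series, searches would presumably be
invoked along these lines:

    dandi search -t dandisets -s approach,species_name -f species_name Human
    dandi search -t assets -s size,format -f size "(7*10e9, )"

Reading create_dandisets_query literally, the first call should build roughly the
following SPARQL (modulo whitespace) and send it to the
https://search.dandiarchive.org:5820 endpoint; the dandi:, schema:, and prov: prefixes
used in the generated queries are assumed to be resolved on the server side, since
nothing in these patches declares them:

    SELECT DISTINCT ?d ?apr ?snm WHERE
    {
     ?d dandi:assetsSummary ?as .
     ?as dandi:approach / schema:name ?apr .
     ?as dandi:species / schema:name ?snm .
    FILTER (?snm = "Human")
    }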