From c2fb996c9d4b8cf9bddb91284093e4fcca62b6e2 Mon Sep 17 00:00:00 2001
From: Dorota Jarecka
Date: Wed, 5 Oct 2022 22:24:02 -0400
Subject: [PATCH 1/4] adding draft of dandi search command

---
 dandi/cli/cmd_search.py | 141 ++++++++++++++++++++++++++++++++++++++++
 dandi/cli/command.py | 2 +
 2 files changed, 143 insertions(+)
 create mode 100644 dandi/cli/cmd_search.py

diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py
new file mode 100644
index 000000000..9eb129840
--- /dev/null
+++ b/dandi/cli/cmd_search.py
@@ -0,0 +1,141 @@
+from pathlib import Path
+
import click
import pandas as pd
import stardog
+
from .base import map_to_click_exceptions
+
DANDISETS_FIELDS = {
    "approach": ["apr", "?as dandi:approach / schema:name ?apr ."],
    "species_id": ["sid", "?as dandi:species / schema:identifier ?sid ."],
    "species_name": ["snm", "?as dandi:species / schema:name ?snm ."],
}

# The use of f-strings apparently makes this not a proper docstring, and so
# click doesn't use it unless we explicitly assign it to `help`:


@click.command(help="Search TODO")
@click.option(
    "-F",
    "--file",
    help="Comma-separated list of fields to display. "
    "An empty value to trigger a list of "
    "available fields to be printed out",
)
@click.option(
    "-t",
    "--search_type",
    help="Type of the search.",
    type=click.Choice(["dandisets", "assets"]),
)
@click.option(
    "--check_fields",
    help="Field name for dandisets search",
    type=click.Choice(DANDISETS_FIELDS.keys()),
    multiple=True,
)
@click.option(
    "--filter_fields",
    help="Field name for dandisets search",
    type=(str, str),
    multiple=True,
)
@click.option(
    "-f",
    "--format",
    help="Choose the format for output. TODO",
    type=click.Choice(["stdout", "csv"]),
    default="stdout",
)
@click.option(
    "--number_of_lines",
    help="Number of lines of output that will be printed",
    default=10,
)
@click.option(
    "-d",
    "--database_name",
    help="Database name",
    default="dandisets_new",
)
@map_to_click_exceptions
def search(
    file=None,
    search_type=None,
    check_fields=None,
    filter_fields=None,
    format="stdout",
    number_of_lines=10,
    database_name="dandisets_new",
):

    if file and search_type:
        raise Exception("file and type are mutually exclusive options")

    conn_details = {
        "endpoint": "https://search.dandiarchive.org:5820",
        "username": "anonymous",
        "password": "anonymous",
    }

    if file:
        filepath = Path(file)
        with filepath.open() as f:
            query_str = f.read()
    elif search_type == "dandisets":
        if not check_fields and not filter_fields:
            raise Exception(
                "check_fields or filter_fields is required if search type is dandisets"
            )
        elif filter_fields:
            for el in filter_fields:
                if el[0] not in DANDISETS_FIELDS:
                    raise Exception(
                        f"field {el[0]} used in filter_fields, but only {DANDISETS_FIELDS} allowed"
                    )
        query_str = create_dandiset_query(check_fields, filter_fields)
    else:
        raise NotImplementedError

    with stardog.Connection(database_name, **conn_details) as conn:
        results = conn.select(query_str)
    res_df = results2df(results, number_of_lines)

    if format == "stdout":
        print(res_df)
    else:
        raise NotImplementedError("only stdout format implemented for now")

    # errors = defaultdict(list)  # problem: [] paths
    #
    # if errors:
    #     lgr.warning(
    #         "Failed to operate on some paths (empty records were listed):\n %s",
    #         "\n ".join("%s: %d paths" % (k, len(v)) for k, v in errors.items()),
    #     )


def results2df(results, limit=10):
    res_lim = 
results["results"]["bindings"][:limit] + res_val_l = [dict((k, v["value"]) for k, v in res.items()) for res in res_lim] + return pd.DataFrame(res_val_l) + + +def create_dandiset_query(check_fields, filter_fields): + var = "" + for el in check_fields: + var += f" ?{DANDISETS_FIELDS[el][0]}" + + query_str = ( + f"SELECT DISTINCT ?d{var} WHERE \n" "{ \n" " ?d dandi:assetsSummary ?as . \n" + ) + for el in check_fields: + query_str += f" {DANDISETS_FIELDS[el][1]} \n" + + for (key, val) in filter_fields: + query_str += f'FILTER (?{DANDISETS_FIELDS[key][0]} = "{val}") \n' + query_str += "}" + # print(query_str) + return query_str diff --git a/dandi/cli/command.py b/dandi/cli/command.py index a3e0770b6..cd95eb87e 100644 --- a/dandi/cli/command.py +++ b/dandi/cli/command.py @@ -145,6 +145,7 @@ def main(ctx, log_level, pdb=False): from .cmd_ls import ls # noqa: E402 from .cmd_move import move # noqa: E402 from .cmd_organize import organize # noqa: E402 +from .cmd_search import search # noqa: E402 from .cmd_service_scripts import service_scripts # noqa: E402 from .cmd_shell_completion import shell_completion # noqa: E402 from .cmd_upload import upload # noqa: E402 @@ -158,6 +159,7 @@ def main(ctx, log_level, pdb=False): ls, move, organize, + search, service_scripts, shell_completion, upload, From bd210e01876a1eb9919e21d0de5bcf9ce5058fc2 Mon Sep 17 00:00:00 2001 From: Dorota Jarecka Date: Sun, 6 Nov 2022 23:05:09 -0500 Subject: [PATCH 2/4] adding assets search; updates to search syntax: allowing for multiple fields in select_fields, allowing for multiple values and ranges in filter fields; adding tests --- dandi/cli/cmd_search.py | 126 +++++++++++++++++----- dandi/cli/tests/test_search.py | 192 +++++++++++++++++++++++++++++++++ dandi/support/pyout.py | 6 +- 3 files changed, 294 insertions(+), 30 deletions(-) create mode 100644 dandi/cli/tests/test_search.py diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py index 9eb129840..e7f25f18b 100644 --- a/dandi/cli/cmd_search.py +++ b/dandi/cli/cmd_search.py @@ -6,14 +6,39 @@ from .base import map_to_click_exceptions +# supported fields +# TODO: adding more DANDISETS_FIELDS = { "approach": ["apr", "?as dandi:approach / schema:name ?apr ."], "species_id": ["sid", "?as dandi:species / schema:identifier ?sid ."], "species_name": ["snm", "?as dandi:species / schema:name ?snm ."], } -# The use of f-strings apparently makes this not a proper docstring, and so -# click doesn't use it unless we explicitly assign it to `help`: +ASSETS_FIELDS = { + "size": ["size", "?asset schema:contentSize ?size ."], + "format": ["format", "?asset schema:encodingFormat ?format ."], +} + + +def parse_validate(ctx, param, value): + value_parse = [] + # parsing elements that have multiple comma-separated values + for el in value: + value_parse += el.split(",") + if param.name == "select_fields": + if ctx.params["search_type"] == "dandisets": + choice_list = DANDISETS_FIELDS.keys() + elif ctx.params["search_type"] == "assets": + choice_list = ASSETS_FIELDS.keys() + else: + choice_list = None + else: + choice_list = None + # checking if all values are in the list of possible choices + for el in value_parse: + if choice_list and el not in choice_list: + ctx.fail(f"{el} is not in the list: {choice_list}") + return value_parse @click.command(help="Search TODO") @@ -31,19 +56,20 @@ type=click.Choice(["dandisets", "assets"]), ) @click.option( - "--check_fields", + "-s", + "--select_fields", help="Field name for dandisets search", - type=click.Choice(DANDISETS_FIELDS.keys()), + 
callback=parse_validate, multiple=True, ) @click.option( + "-f", "--filter_fields", help="Field name for dandisets search", type=(str, str), multiple=True, ) @click.option( - "-f", "--format", help="Choose the format for output. TODO", type=click.Choice(["stdout", "csv"]), @@ -64,7 +90,7 @@ def search( file=None, search_type=None, - check_fields=None, + select_fields=None, filter_fields=None, format="stdout", number_of_lines=10, @@ -84,18 +110,22 @@ def search( filepath = Path(file) with filepath.open() as f: query_str = f.read() - elif search_type == "dandisets": - if not check_fields and not filter_fields: + elif search_type in ["dandisets", "assets"]: + if not select_fields: raise Exception( - "check_fields or filter_fields is required if search type is dandisets" + f"select_fields is required if search type is {search_type}" ) - elif filter_fields: + if filter_fields: for el in filter_fields: - if el[0] not in DANDISETS_FIELDS: + if el[0] not in select_fields: raise Exception( - f"field {el[0]} used in filter_fields, but only {DANDISETS_FIELDS} allowed" + f"field {el[0]} used in filter_fields, " + f"but select fields contain {select_fields}" ) - query_str = create_dandiset_query(check_fields, filter_fields) + if search_type == "dandisets": + query_str = create_dandisets_query(select_fields, filter_fields) + elif search_type == "assets": + query_str = create_assets_query(select_fields, filter_fields) else: raise NotImplementedError @@ -108,14 +138,6 @@ def search( else: raise NotImplementedError("only stdout format implemented for now") - # errors = defaultdict(list) # problem: [] paths - # - # if errors: - # lgr.warning( - # "Failed to operate on some paths (empty records were listed):\n %s", - # "\n ".join("%s: %d paths" % (k, len(v)) for k, v in errors.items()), - # ) - def results2df(results, limit=10): res_lim = results["results"]["bindings"][:limit] @@ -123,19 +145,69 @@ def results2df(results, limit=10): return pd.DataFrame(res_val_l) -def create_dandiset_query(check_fields, filter_fields): +def filter_query(filter_fields, fields_dict): + """creating filter part for the queries""" + filter_str = "" + for (key, val) in filter_fields: + if val[0] == "(" and val[-1] == ")": + val = val[1:-1].split(",") + if len(val) != 2: + raise ValueError( + "If value for filter is a tuple, it has to have 2 elements " + ) + else: + min_val = val[0].strip() + max_val = val[1].strip() + if max_val and min_val: + filter_str += ( + f"FILTER (?{fields_dict[key][0]} > {min_val} " + f"&& ?{fields_dict[key][0]} < {max_val}) \n" + ) + elif max_val: + filter_str += f"FILTER (?{fields_dict[key][0]} < {max_val}) \n" + elif min_val: + filter_str += f"FILTER (?{fields_dict[key][0]} > {min_val}) \n" + else: + val = val.split(",") + cond_str = f'?{fields_dict[key][0]} = "{val[0]}"' + for el in val[1:]: + cond_str += f' || ?{fields_dict[key][0]} = "{el}"' + filter_str += f"FILTER ({cond_str}) \n" + return filter_str + + +def create_dandisets_query(select_fields, filter_fields): + """Creating a query for dandisets search""" var = "" - for el in check_fields: + for el in select_fields: var += f" ?{DANDISETS_FIELDS[el][0]}" query_str = ( f"SELECT DISTINCT ?d{var} WHERE \n" "{ \n" " ?d dandi:assetsSummary ?as . 
\n" ) - for el in check_fields: + for el in select_fields: query_str += f" {DANDISETS_FIELDS[el][1]} \n" + query_str += filter_query(filter_fields, DANDISETS_FIELDS) + query_str += "}" + return query_str - for (key, val) in filter_fields: - query_str += f'FILTER (?{DANDISETS_FIELDS[key][0]} = "{val}") \n' + +def create_assets_query(select_fields, filter_fields): + """Creating a query for assets search""" + var = "" + for el in select_fields: + var += f" ?{ASSETS_FIELDS[el][0]}" + + query_str = ( + f"SELECT DISTINCT ?asset ?d_id ?path{var} WHERE \n" + "{ \n" + " ?asset rdf:type dandi:Asset . \n" + " ?d prov:hasMember ?asset . \n" + " ?d schema:identifier ?d_id . \n" + " ?asset dandi:path ?path . \n" + ) + for el in select_fields: + query_str += f" {ASSETS_FIELDS[el][1]} \n" + query_str += filter_query(filter_fields, ASSETS_FIELDS) query_str += "}" - # print(query_str) return query_str diff --git a/dandi/cli/tests/test_search.py b/dandi/cli/tests/test_search.py new file mode 100644 index 000000000..bafc4847f --- /dev/null +++ b/dandi/cli/tests/test_search.py @@ -0,0 +1,192 @@ +from click.testing import CliRunner +import pytest + +from dandi.tests.skip import mark + +from ..command import search + +pytestmark = mark.skipif_no_network + + +@pytest.mark.parametrize( + "select_fields, print_fields", + [ + ("approach", ["app"]), + ("species_name", ["snm"]), + ], +) +def test_search_dandiset_select_fields(select_fields, print_fields): + """using select_fields option with single or multiple comma-separated values""" + runner = CliRunner() + r = runner.invoke(search, ["-t", "dandisets", "--select_fields", select_fields]) + assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}" + out = r.stdout + for fld in print_fields: + assert fld in out, f"{fld} is not in the output: {out}" + + +@pytest.mark.parametrize( + "select_fields_mult", + [ + (["--select_fields", "approach", "-s", "species_name"]), + (["--select_fields", "approach,species_name"]), + ], +) +def test_search_dandiset_select_fields_mult(select_fields_mult): + """using select_fields option multiple times""" + runner = CliRunner() + r = runner.invoke(search, ["-t", "dandisets"] + select_fields_mult) + assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}" + out = r.stdout + for fld in ["apr", "snm"]: + assert fld in out, f"field {fld} is not in the output" + + +@pytest.mark.parametrize( + "filter_fields", [["species_name", "Human"], ["approach", "behavioral approach"]] +) +def test_search_dandiset_check_filter(filter_fields): + """using select_fields option multiple times""" + runner = CliRunner() + r = runner.invoke( + search, + [ + "-t", + "dandisets", + "--select_fields", + "approach,species_name", + "--filter_fields", + ] + + filter_fields, + ) + assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}" + out = r.stdout + for ln in out.split("\n")[1:]: + if ln == "": + break + assert filter_fields[1] in ln, f"value {filter_fields[1]} is not in the output" + + +def test_search_dandiset_check_filter_mult(): + """using a filter option multiple times""" + runner = CliRunner() + r = runner.invoke( + search, + [ + "-t", + "dandisets", + "--select_fields", + "approach,species_name", + "-f", + "species_name", + "Human", + "-f", + "approach", + "behavioral approach", + ], + ) + assert r.exit_code == 0, f"Exited abnormally. 
out={r.stdout}"
    out = r.stdout
    for ln in out.split("\n")[1:-1]:
        assert "Human" in ln, "Human is not in the output"
        assert "behavioral approach" in ln, "behavioral approach is not in the output"


def test_search_dandiset_check_filter_list():
    """using a comma-separated list of values in a filter option"""
    runner = CliRunner()
    r = runner.invoke(
        search,
        [
            "-t",
            "dandisets",
            "--select_fields",
            "approach,species_name",
            "-f",
            "species_name",
            "Human,House mouse",
        ],
    )
    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
    out = r.stdout
    for ln in out.split("\n")[1:-1]:
        assert "Human" in ln or "House mouse" in ln, "expected Human or House mouse"


@pytest.mark.parametrize(
    "select_fields, print_fields",
    [("format", ["format"]), ("size", ["size"]), ("size,format", ["size", "format"])],
)
def test_search_assets_select_fields(select_fields, print_fields):
    """using select_fields option with single or multiple comma-separated values"""
    runner = CliRunner()
    r = runner.invoke(search, ["-t", "assets", "--select_fields", select_fields])
    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
    out = r.stdout
    for fld in print_fields:
        assert fld in out, f"{fld} is not in the output: {out}"


@pytest.mark.parametrize(
    "filter_fields",
    [
        ["format", "application/x-nwb"],
    ],
)
def test_search_asset_check_filter(filter_fields):
    """using select_fields and filter option in assets search"""
    runner = CliRunner()
    r = runner.invoke(
        search,
        ["-t", "assets", "--select_fields", "format,size", "--filter_fields"]
        + filter_fields,
    )
    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
    out = r.stdout
    for ln in out.split("\n")[1:]:
        if ln == "":
            break
        assert filter_fields[1] in ln, f"value {filter_fields[1]} is not in the output"


@pytest.mark.parametrize(
    "size_range", [(7 * 10e9, 9 * 10e9), "(, 9*10e9)", "(7*10e9, )"]
)
def test_search_asset_check_filter_range(size_range):
    """using range in a filter option for assets search"""
    runner = CliRunner()
    r = runner.invoke(
        search,
        [
            "-t",
            "assets",
            "--select_fields",
            "format,size",
            "--filter_fields",
            "size",
            f"{size_range}",
        ],
    )
    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
    out = r.stdout
    assert "size" in out.split("\n")[0]
    assert len(out.split("\n")) > 1


def test_search_from_file(tmpdir):
    """using search command with a file that contains an arbitrary SPARQL query"""
    query_file = tmpdir / "query.txt"
    query = """
    SELECT DISTINCT ?apr WHERE
    {
    ?as dandi:approach / schema:name ?apr
    }
    """
    with query_file.open("w") as f:
        f.write(query)
    runner = CliRunner()
    r = runner.invoke(search, ["-F", query_file])
    assert r.exit_code == 0, f"Exited abnormally. 
out={r.stdout}"
    out = r.stdout
    assert "apr" in out.split("\n")[0]
    assert len(out.split("\n")) > 1
diff --git a/dandi/support/pyout.py b/dandi/support/pyout.py
index 90b422428..ebc51d3a1 100644
--- a/dandi/support/pyout.py
+++ b/dandi/support/pyout.py
@@ -36,7 +36,7 @@ def naturalsize(v):
     return humanize.naturalsize(v)


-def datefmt(v, fmt=u"%Y-%m-%d/%H:%M:%S"):
+def datefmt(v, fmt="%Y-%m-%d/%H:%M:%S"):
     if isinstance(v, datetime.datetime):
         return v.strftime(fmt)
     else:
@@ -71,8 +71,8 @@ def counts(values):
         color=dict(
             interval=[
                 [0, 1024, "blue"],
-                [1024, 1024 ** 2, "green"],
-                [1024 ** 2, None, "red"],
+                [1024, 1024**2, "green"],
+                [1024**2, None, "red"],
             ]
         ),
         aggregate=lambda x: naturalsize(sum(x)),

From 2852e4208c147a1e15f33d0a8e94caaa58998c10 Mon Sep 17 00:00:00 2001
From: Dorota Jarecka
Date: Sun, 6 Nov 2022 23:21:18 -0500
Subject: [PATCH 3/4] removing stardog and adding SPARQLWrapper library to read from DB

---
 dandi/cli/cmd_search.py | 16 +++++++---------
 setup.cfg | 1 +
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py
index e7f25f18b..5a3811e36 100644
--- a/dandi/cli/cmd_search.py
+++ b/dandi/cli/cmd_search.py
@@ -1,8 +1,8 @@
 from pathlib import Path

+from SPARQLWrapper import JSON, SPARQLWrapper
 import click
 import pandas as pd
-import stardog

 from .base import map_to_click_exceptions

@@ -100,12 +100,6 @@ def search(
     if file and search_type:
         raise Exception("file and type are mutually exclusive options")

-    conn_details = {
-        "endpoint": "https://search.dandiarchive.org:5820",
-        "username": "anonymous",
-        "password": "anonymous",
-    }
-
     if file:
         filepath = Path(file)
         with filepath.open() as f:
@@ -129,8 +123,12 @@ def search(
     else:
         raise NotImplementedError

-    with stardog.Connection(database_name, **conn_details) as conn:
-        results = conn.select(query_str)
+    endpoint = "https://search.dandiarchive.org:5820/dandisets_new/query"
+    sparql = SPARQLWrapper(endpoint)
+    sparql.setCredentials("anonymous", "anonymous")
+    sparql.setReturnFormat(JSON)
+    sparql.setQuery(query_str)
+    results = sparql.queryAndConvert()
     res_df = results2df(results, number_of_lines)

     if format == "stdout":
diff --git a/setup.cfg b/setup.cfg
index 0de477174..b26ce0254 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -57,6 +57,7 @@ install_requires =
     requests ~= 2.20
     ruamel.yaml >=0.15, <1
     semantic-version
+    sparqlwrapper
     tenacity
     zarr ~= 2.10
 zip_safe = False

From 447c8050392231aa27c0563fe413646ef6115f94 Mon Sep 17 00:00:00 2001
From: Dorota Jarecka
Date: Mon, 28 Nov 2022 13:49:55 -0500
Subject: [PATCH 4/4] adding age to the assets fields

---
 dandi/cli/cmd_search.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py
index 5a3811e36..b3c4e6abc 100644
--- a/dandi/cli/cmd_search.py
+++ b/dandi/cli/cmd_search.py
@@ -17,6 +17,7 @@ ASSETS_FIELDS = {
     "size": ["size", "?asset schema:contentSize ?size ."],
     "format": ["format", "?asset schema:encodingFormat ?format ."],
+    "age": ["age", "?asset prov:wasAttributedTo / dandi:age / schema:value ?age ."],
 }
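
Usage sketch (added for illustration; not part of the patches above): going by the
options and the test values introduced in this series, searches would presumably be
invoked along these lines:

    dandi search -t dandisets -s approach,species_name -f species_name Human
    dandi search -t assets -s size,format -f size "(7*10e9, )"

Reading create_dandisets_query literally, the first call should build roughly the
following SPARQL (modulo whitespace) and send it to the
https://search.dandiarchive.org:5820 endpoint; the dandi:, schema:, and prov: prefixes
used in the generated queries are assumed to be resolved on the server side, since
nothing in these patches declares them:

    SELECT DISTINCT ?d ?apr ?snm WHERE
    {
     ?d dandi:assetsSummary ?as .
     ?as dandi:approach / schema:name ?apr .
     ?as dandi:species / schema:name ?snm .
    FILTER (?snm = "Human")
    }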