Skip to content

Commit

Permalink
Add work for supporting CURIE query support ...
Browse files Browse the repository at this point in the history
Specifically targets _getannotation and _getannotations methods at the
moment and only for the mygene instance
  • Loading branch information
Johnathan Schaff committed Feb 6, 2024
1 parent 925ffbe commit 8cc86e7
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 1 deletion.
8 changes: 7 additions & 1 deletion biothings_client/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import requests

from .utils import str_types, is_py27
from .utils.curie import generate_annotation_prefix_patterns, transform_query

try:
from collections.abc import Iterable
Expand All @@ -38,14 +39,15 @@
__version__ = "0.3.1"

logger = logging.getLogger("biothings.client")
logger.setLevel(logging.INFO)

if is_py27:
# we need to setup default log handler in Py 2.7
# Py 3.x does it by default
handler = logging.StreamHandler()
# formatter = logging.Formatter("%(levelname)s:%(name)s:%(message)s")
# handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Future work:
# Consider use "verbose" settings to control default logging output level
Expand Down Expand Up @@ -120,6 +122,7 @@ def __init__(self, url=None):
if self.url[-1] == "/":
self.url = self.url[:-1]
self.max_query = self._max_query

# delay and step attributes are for batch queries.
self.delay = self._delay # delay is ignored when requests made from cache.
self.step = self._step
Expand All @@ -139,6 +142,7 @@ def __init__(self, url=None):
}
)
self._cached = False
self.annotation_prefix_patterns = generate_annotation_prefix_patterns(self._biolink_model_prefix_mapping)

def use_http(self):
"""Use http instead of https for API calls."""
Expand Down Expand Up @@ -330,6 +334,7 @@ def _get_fields(self, search_term=None, verbose=True):
logger.info(self._from_cache_notification)
return ret

@transform_query
def _getannotation(self, _id, fields=None, **kwargs):
"""Return the object given id.
This is a wrapper for GET query of the biothings annotation service.
Expand Down Expand Up @@ -363,6 +368,7 @@ def _annotations_generator(self, query_fn, ids, verbose=True, **kwargs):
for hit in hits:
yield hit

@transform_query
def _getannotations(self, ids, fields=None, **kwargs):
"""Return the list of annotation objects for the given list of ids.
This is a wrapper for POST query of the biothings annotation service.
Expand Down
150 changes: 150 additions & 0 deletions biothings_client/utils/curie.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""
Methods that provide CURIE ID query support to the biothings client
"""

import functools
import logging
import re


logger = logging.getLogger("biothings.client")


def generate_annotation_prefix_patterns(prefix_mapping):
    """
    Takes the optionally provided BIOLINK_MODEL_PREFIX_BIOTHINGS_MAPPING
    configuration and generates the regex patterns for matching against our
    annotation queries

    Inputs Arguments:
        prefix_mapping: dictionary of the following structure:
            <biolink_prefix>: {"field": <matched_fields>, ...}
            None or an empty dictionary is accepted, in which case only the
            default pattern is generated

    Outputs:
        Returns a list of (<compiled_regex>, <matched_fields>) tuples. One
        case-insensitive entry is produced per biolink prefix, in the iteration
        order of the mapping, followed by a final generic <scope>:<term>
        pattern paired with an empty field list. Every pattern exposes the
        named group "term"; only the generic pattern exposes "scope"
    """
    biolink_curie_regex_list = []
    for biolink_prefix, mapping in (prefix_mapping or {}).items():
        # The prefix is literal text, not a regex: escape it so that prefixes
        # containing metacharacters (e.g. the "." in "CHEMBL.COMPOUND") only
        # match themselves
        escaped_prefix = re.escape(biolink_prefix)
        expression = re.compile(rf"({escaped_prefix}):(?P<term>[^:]+)", re.I)
        field_match = mapping["field"]
        biolink_curie_regex_list.append((expression, field_match))

    # The generic pattern must stay last so the more specific biolink prefixes
    # are tried before it when the list is scanned in order
    default_pattern = (
        re.compile(r"(?P<scope>\w+):(?P<term>[^:]+)"),
        [],
    )
    biolink_curie_regex_list.append(default_pattern)
    return biolink_curie_regex_list


def parse_query(query, regex_mapping):
    """
    Parsing method for handling the provided query

    Inputs Arguments:
        query: the id value to search against our indices
            Can be of the form:
                _id = <term>
                _id = <scope>:<term>
            Non-string values (e.g. integer ids) are returned untouched
        regex_mapping: ordered iterable of tuples of the following structure:
            (<regex_pattern>, <matched_fields>)
            where <matched_fields> is a single field name or a list of them

    Outputs:
        Returns a tuple of the modified or unmodified query and a list with
        any discovered fields. Only the first matching pattern is applied.
        The fields come from the "scope" named group when the pattern captured
        one, otherwise from the <matched_fields> paired with the pattern
    """
    # Only strings can carry a <scope>: prefix; re.fullmatch would raise a
    # TypeError on anything else
    if not isinstance(query, str):
        return (query, [])

    discovered_fields = []
    for regex, fields in regex_mapping:
        match = re.fullmatch(regex, query)
        if match is None:
            continue

        logger.debug("Discovered match: %s -> %s", regex, query)
        named_groups = match.groupdict()
        query = named_groups.get("term", query)

        # An explicit scope in the query wins over the configured mapping
        matched_fields = named_groups.get("scope") or fields or []
        if isinstance(matched_fields, str):
            discovered_fields = [matched_fields]
        else:
            # Copy so callers extending the result never alter the mapping
            discovered_fields = list(matched_fields)

        logger.debug("Transformed query: %s Discovered fields: %s", query, discovered_fields)
        break
    return (query, discovered_fields)


def _extract_call_arguments(func_name, id_keyword, args, kwargs, default_query):
    """
    Pull the id argument and the fields argument out of a wrapped call,
    whether they were supplied positionally or by keyword

    Returns a (query, fields) tuple; fields is None when it was not supplied
    """
    if len(args) > 2:
        # The wrapped methods only take the id(s) and fields positionally;
        # fail loudly instead of silently discarding the caller's arguments
        raise TypeError(f"{func_name}() takes at most 2 positional arguments ({len(args)} given)")
    query = args[0] if len(args) >= 1 else kwargs.get(id_keyword, default_query)
    fields = args[1] if len(args) == 2 else kwargs.get("fields")
    return query, fields


def _merge_fields(fields, discovered_fields):
    """
    Combine the caller supplied fields with the fields discovered while
    parsing the query, without mutating either input

    When nothing was discovered the caller's value is handed back unchanged
    (None becomes an empty list). Otherwise a new list is returned holding the
    caller's fields followed by the discovered ones, without duplicates
    """
    if not discovered_fields:
        return [] if fields is None else fields

    if fields is None:
        merged = []
    elif isinstance(fields, str):
        # Treat a string as a comma-separated list of field names
        merged = [entry.strip() for entry in fields.split(",") if entry.strip()]
    else:
        merged = list(fields)

    for field in discovered_fields:
        if field not in merged:
            merged.append(field)
    return merged


def transform_query(func):
    """
    Decorator for adding support to the CURIE ID querying without modifying the
    original signature API for the clients.

    Intended to support the _getannotation and _getannotations client methods;
    the wrapper is selected from the name of the decorated function and any
    other name raises a KeyError at decoration time. The instance the method
    is bound to must expose an `annotation_prefix_patterns` attribute
    """

    @functools.wraps(func)
    def _support_curie_id(self, *args, **kwargs):
        """
        Provides the regex pattern matching over the associated query to extract potentially
        embedded fields within the term

        In the case of supporting CURIE ID values, we leverage the biolink prefixes to map the
        biolink term to an equivalent biothings term.
        Otherwise we default to attempting to support the basic <scope>:<term> structure

        This method handles the GET request method _getannotation which expects a singular ID
        """
        query, fields = _extract_call_arguments(func.__name__, "_id", args, kwargs, "")
        logger.debug("Input prior to transformation <query: %s> <fields: %s>", query, fields)

        discovered_fields = []
        # Non-string ids (e.g. integers) cannot embed a prefix, pass them through
        if isinstance(query, str):
            query, discovered_fields = parse_query(query, self.annotation_prefix_patterns)

        # Everything is forwarded by keyword so the positional values cannot
        # collide with the transformed ones
        kwargs["_id"] = query
        kwargs["fields"] = _merge_fields(fields, discovered_fields)
        return func(self, **kwargs)

    @functools.wraps(func)
    def _support_multiple_curie_id(self, *args, **kwargs):
        """
        Provides the regex pattern matching over the associated query to extract potentially
        embedded fields within the term

        In the case of supporting CURIE ID values, we leverage the biolink prefixes to map the
        biolink term to an equivalent biothings term.
        Otherwise we default to attempting to support the basic <scope>:<term> structure

        This method handles the POST request method _getannotations which expects a collection of
        ID values
        """
        query_collection, fields = _extract_call_arguments(func.__name__, "ids", args, kwargs, [])
        logger.debug(
            "Input prior to transformation <query values: %s> <fields: %s>", query_collection, fields
        )

        if query_collection is None:
            query_collection = []
        elif isinstance(query_collection, str):
            # NOTE(review): assumes a bare string is a comma-separated list of
            # ids rather than a sequence of characters - confirm against the
            # wrapped method
            query_collection = query_collection.split(",") if query_collection else []

        query_aggregation = []
        discovered_fields = []
        for query_entry in query_collection:
            if isinstance(query_entry, str):
                query_entry, entry_fields = parse_query(query_entry, self.annotation_prefix_patterns)
                discovered_fields.extend(entry_fields)
            query_aggregation.append(query_entry)

        kwargs["ids"] = query_aggregation
        kwargs["fields"] = _merge_fields(fields, discovered_fields)
        return func(self, **kwargs)

    function_mapping = {"_getannotation": _support_curie_id, "_getannotations": _support_multiple_curie_id}
    try:
        return function_mapping[func.__name__]
    except KeyError:
        supported = ", ".join(sorted(function_mapping))
        raise KeyError(
            f"transform_query cannot decorate {func.__name__!r}; supported methods: {supported}"
        ) from None

0 comments on commit 8cc86e7

Please sign in to comment.