diff --git a/.gitignore b/.gitignore index ba74660..d520285 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ __pycache__/ # Distribution / packaging .Python env/ +.virtualenv/ build/ develop-eggs/ dist/ diff --git a/README.md b/README.md index 4069408..6f34bb5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,69 @@ -# reference-server -A simple illustrative reference server for the Matchmaker Exchange API +# Matchmaker Exchange Reference Server +A simple illustrative reference server for the Matchmaker Exchange API. + +The server is backed by elasticsearch, and creates local indexes of the Human Phenotype Ontology, Ensembl-Entrez-HGNC gene symbol mappings, and the MME API benchmark set of 50 rare disease patients. + +## Dependencies +- Python 3.X (not yet tested on 2.7 but should be easy to get working) +- elasticsearch 2.X + + +## Quickstart + +1. Start up a local elasticsearch cluster, for example: + + ```bash + $ wget https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/distribution/tar/elasticsearch/2.1.1/elasticsearch-2.1.1.tar.gz + $ tar -xzf elasticsearch-2.1.1.tar.gz + $ cd elasticsearch-2.1.1/ + $ ./bin/elasticsearch + ``` + +1. Set up your Python virtual environment and install necessary Python packages, for example: + + ```bash + $ virtualenv -p python3 --prompt="(mme-server)" .virtualenv + $ source .virtualenv/bin/activate + $ pip install -r requirements.txt + ``` + +1. Download and index vocabularies and sample data: + + ```bash + $ python datastore.py + ``` + +1. Run tests: + + ```bash + $ python test.py + ``` + +1. Start up MME reference server: + + ```bash + $ python server.py + ``` + + By default, the server listens globally (`--host 0.0.0.0`) on port 8000 (`--port 8000`). + +1. Try it out: + + ```bash + $ curl -XPOST -d '{"patient":{ \ + "id":"1", \ + "contact": {"name":"Jane Doe", "href":"mailto:jdoe@example.edu"}, \ + "features":[{"id":"HP:0000522"}], \ + "genomicFeatures":[{"gene":{"id":"NGLY1"}}] \ + }}' localhost:8000/match + ``` + + +## TODO +- Avoid costly/redundant parsing `api.Patient` objects when generating MatchResponse objects from patients in database +- Inspect `Accepts` header for API versioning +- Add `Content-Type` header to responses +- Handle errors with proper HTTP statuses and JSON message bodies +- Add tests for gene index +- Add end-to-end API query tests +- Add parser tests diff --git a/api.py b/api.py new file mode 100644 index 0000000..cf65101 --- /dev/null +++ b/api.py @@ -0,0 +1,178 @@ +""" +The API module: + +Contains API methods and classes for API objects. +Handles parsing of API requests into API objects, and serializing API objects into API responses. + +Also contains some code to help convert API objects to their database representations. 
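+
+Illustrative usage (a sketch, assuming a local elasticsearch cluster is running
+and the indexes have been built with `python datastore.py`), using the example
+patient from the README:
+
+    import api
+    request = api.MatchRequest({
+        "patient": {
+            "id": "1",
+            "contact": {"name": "Jane Doe", "href": "mailto:jdoe@example.edu"},
+            "features": [{"id": "HP:0000522"}],
+            "genomicFeatures": [{"gene": {"id": "NGLY1"}}],
+        }
+    })
+    response = api.match(request)  # returns an api.MatchResponse
+    print(response.to_json())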
+""" +from __future__ import with_statement, division, unicode_literals + +import json + +from datastore import DatastoreConnection + + +class Feature: + # Connection to backend to validate vocabulary terms + db = DatastoreConnection() + + def __init__(self, data): + self._observed = data.get('observed', 'yes') == 'yes' + # TODO: parse ageOfOnset + self.term = self.db.get_vocabulary_term(data['id']) + + def _get_implied_terms(self): + return self.term['term_category'] + + def _get_id(self): + return self.term['id'] + + @property + def observed(self): + return self._observed + + +class GenomicFeature: + # Connection to backend to validate vocabulary terms + db = DatastoreConnection() + + def __init__(self, data): + self.term = None + gene_id = data.get('gene', {}).get('id') + # TODO: parse additional genomicFeature fields + if gene_id: + self.term = self.db.get_vocabulary_term(gene_id) + + def _get_gene_id(self): + if self.term: + return self.term['id'] + + +class Patient: + def __init__(self, data): + self.id = data['id'] + self.contact = data['contact'] + assert self.contact['name'] and self.contact['href'] + + features_json = data.get('features', []) + genomic_features_json = data.get('genomicFeatures', []) + + assert features_json or genomic_features_json, "At least one of 'features' or 'genomicFeatures' must be provided" + + # Parse phenotype terms + features = [Feature(feature_json) for feature_json in features_json] + + # Parse genomic features + genomic_features = [GenomicFeature(gf_json) for gf_json in genomic_features_json] + + assert features or genomic_features, "Was unable to parse any phenotype or gene terms" + + disorders = data.get('disorders', []) + self.label = data.get('label') + self.age_of_onset = data.get('ageOfOnset') + self.features = features + self.genomic_features = genomic_features + self.disorders = disorders + self.test = data.get('test', False) + + def _get_genes(self): + genes = set() + for genomic_feature in self.genomic_features: + gene_id = genomic_feature._get_gene_id() + if gene_id: + genes.add(gene_id) + + return genes + + def _get_present_phenotypes(self): + terms = set() + for feature in self.features: + if feature.observed: + terms.add(feature._get_id()) + + return terms + + def _get_implied_present_phenotypes(self): + terms = set() + for feature in self.features: + if feature.observed: + terms.update(feature._get_implied_terms()) + + return terms + + def to_json(self): + data = { + 'id': self.id, + 'contact': { + 'name': self.contact['name'], + 'href': self.contact['href'], + } + } + + if self.label: + data['label'] = self.label + + if self.age_of_onset: + data['ageOfOnset'] = self.age_of_onset + + phenotype_ids = self._get_present_phenotypes() + if phenotype_ids: + data['features'] = [{'id': id} for id in phenotype_ids] + + gene_ids = self._get_genes() + if gene_ids: + data['genomicFeatures'] = [{'gene': {'id': gene_id}} for gene_id in gene_ids] + + if self.disorders: + data['disorders'] = self.disorders + + if self.test: + data['test'] = True + + return data + + +class MatchRequest: + def __init__(self, request): + self.patient = Patient(request['patient']) + self._data = request + + +class MatchResult: + def __init__(self, match, score): + self.match = match + self.score = score + + def to_json(self): + response = {} + response['score'] = {'patient': self.score} + response['patient'] = self.match.to_json() + return response + + +def match(request, backend=None): + assert isinstance(request, MatchRequest), "Argument to match must be MatchResponse 
object" + + if not backend: + backend = DatastoreConnection() + + matches = [] + # Unpack patient and query backend + patient = request.patient + for score, patient in backend.find_similar_patients(patient): + match = MatchResult(patient, score) + matches.append(match) + + response = MatchResponse(matches) + return response + + +class MatchResponse: + def __init__(self, response): + self._data = response + + def to_json(self): + response = {} + response['results'] = [match.to_json() for match in self._data] + return response diff --git a/datastore.py b/datastore.py new file mode 100644 index 0000000..69efff6 --- /dev/null +++ b/datastore.py @@ -0,0 +1,339 @@ +""" +Module for interfacing with the backend database (elasticsearch). + +Stores: +* Patient records (`patients` index) +* Human Phenotype Ontology (HPO) (`hpo` index) +* Ensembl-Entrez-HGNC-GeneSymbol mappings (`genes` index) +""" + +from __future__ import with_statement, division, unicode_literals + +import sys +import os +import json +import logging + +try: + from urllib import urlretrieve +except ImportError: + from urllib.request import urlretrieve + +from elasticsearch import Elasticsearch +from parsers import OBOParser, GeneParser + +logger = logging.getLogger(__name__) + +# Matchmaker Exchange benchmark dataset of 50 patients +TEST_DATA_URL = 'https://raw.githubusercontent.com/ga4gh/mme-apis/hotfix/v1.0b/testing/benchmark_patients.json' +DEFAULT_DATA_FILENAME = 'data.json' + +# Human Phenotype Ontology +HPO_URL = 'http://purl.obolibrary.org/obo/hp.obo' +DEFAULT_HPO_FILENAME = 'hp.obo' + +# Gene identifier mappings, from HGNC: www.genenames.org +# Ensembl Genes 83 (GRCh38.p5) +# Columns: +# HGNC ID +# Approved Symbol +# Approved Name +# Synonyms +# Entrez Gene ID (supplied by NCBI) +# Ensembl Gene ID (supplied by Ensembl) +GENE_URL = 'http://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_aliases&col=md_eg_id&col=md_ensembl_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' +DEFAULT_GENE_FILENAME = 'genes.tsv' + + +class PatientManager: + TYPE_NAME = 'patient' + INDEX_CONFIG = { + 'mappings': { + TYPE_NAME: { + '_all': { + 'enabled': False, + }, + 'properties': { + 'phenotype': { + 'type': 'string', + 'index': 'not_analyzed', + }, + 'gene': { + 'type': 'string', + 'index': 'not_analyzed', + }, + 'doc': { + 'type': 'object', + 'enabled': False, + 'include_in_all': False, + } + } + } + } + } + + def __init__(self, backend, index='patients'): + self._db = backend + self._index = index + + def index(self, filename): + """Populate the database with patient data from the given file""" + from api import Patient + + if self._db.indices.exists(index=self._index): + logging.warning("Patient index already exists: '{}'".format(self._index)) + return + else: + logging.info("Creating patient ElasticSearch index: '{}'".format(self._index)) + self._db.indices.create(index=self._index, body=self.INDEX_CONFIG) + + with open(filename) as ifp: + data = json.load(ifp) + + logging.info("Found data for {} patient records".format(len(data))) + for record in data: + patient = Patient(record) + self.add_patient(patient) + + # Update index before returning record count + self._db.indices.refresh(index=self._index) + n = self._db.count(index=self._index, doc_type=self.TYPE_NAME) + logger.info('Datastore now contains {} records'.format(n['count'])) + + def add_patient(self, patient): + """Add the provided api.Patient object to the datastore""" + id = 
patient.id + data = self._patient_to_index(patient) + self._db.index(index=self._index, doc_type=self.TYPE_NAME, id=id, body=data) + logging.info("Indexed patient: '{}'".format(id)) + + def _patient_to_index(self, patient): + genes = patient._get_genes() + phenotypes = patient._get_implied_present_phenotypes() + + return { + 'phenotype': list(phenotypes), + 'gene': list(genes), + 'doc': patient.to_json(), + } + + def find_similar_patients(self, patient, n=5): + """Return the n most similar patients to the given query api.Patient""" + from api import Patient + + query_parts = [] + for id in patient._get_implied_present_phenotypes(): + query_parts.append({'match': {'phenotype': id}}) + + for gene_id in patient._get_genes(): + query_parts.append({'match': {'gene': gene_id}}) + + query = { + 'query': { + 'bool': { + 'should': [ + query_parts + ] + } + } + } + + result = self._db.search(index=self._index, body=query) + + scored_patients = [] + for hit in result['hits']['hits'][:n]: + # Just use the ElasticSearch TF/IDF score, normalized to [0, 1] + score = 1 - 1 / (1 + hit['_score']) + scored_patients.append((score, Patient(hit['_source']['doc']))) + + return scored_patients + + +class VocabularyManager: + TERM_TYPE_NAME = 'term' + META_TYPE_NAME = 'meta' + INDEX_CONFIG = { + 'mappings': { + TERM_TYPE_NAME: { + '_all': { + 'enabled': False, + }, + 'properties': { + 'id': { + 'type': 'string', + 'index': 'not_analyzed', + }, + 'name': { + 'type': 'string', + }, + 'synonym': { + 'type': 'string', + }, + 'alt_id': { + 'type': 'string', + 'index': 'not_analyzed', + }, + 'is_a': { + 'type': 'string', + 'index': 'not_analyzed', + }, + 'term_category': { + 'type': 'string', + 'index': 'not_analyzed', + }, + } + } + } + } + + def __init__(self, backend): + self._db = backend + + def index(self, index, filename, Parser): + """Index terms from the given file + + :param index: the id of the index + :param filename: the path to the vocabulary file + :param Parser: the Parser class to use to parse the vocabulary file + """ + parser = Parser(filename) + + if self._db.indices.exists(index=index): + logging.warning('Vocabulary index already exists: {!r} ... 
skipping'.format(index)) + return + else: + logging.info("Creating index: {!r}".format(index)) + self._db.indices.create(index=index, body=self.INDEX_CONFIG) + + logging.info("Parsing vocabulary from: {!r}".format(filename)) + commands = [] + for term in parser: + id = term['id'] + command = [ + {'index': {'_id': id}}, + term, + ] + commands.extend(command) + + data = "".join([json.dumps(command) + "\n" for command in commands]) + self._db.bulk(data, index=index, doc_type=self.TERM_TYPE_NAME, refresh=True) + + n = self._db.count(index=index, doc_type=self.TERM_TYPE_NAME) + logger.info('Index now contains {} terms'.format(n['count'])) + + def get_term(self, id, index='_all'): + """Get vocabulary term by ID""" + query = { + 'query': { + 'filtered': { + 'filter': { + 'bool': { + 'should': [ + {'term': {'id': id}}, + {'term': {'alt_id': id}}, + ] + } + } + } + } + } + results = self._db.search(index=index, doc_type=self.TERM_TYPE_NAME, body=query) + if results['hits']['total'] == 1: + return results['hits']['hits'][0]['_source'] + else: + logger.error("Unable to uniquely resolve term: {!r}".format(id)) + + +class DatastoreConnection: + def __init__(self): + self._es = Elasticsearch() + self._patients = PatientManager(self) + self._vocabularies = VocabularyManager(self) + + def index_patients(self, filename): + return self._patients.index(filename) + + def index_hpo(self, filename): + return self._vocabularies.index(index='hpo', filename=filename, Parser=OBOParser) + + def index_genes(self, filename): + return self._vocabularies.index(index='genes', filename=filename, Parser=GeneParser) + + def get_vocabulary_term(self, id, index='_all'): + return self._vocabularies.get_term(id, index=index) + + def find_similar_patients(self, patient, n=5): + """Return the n most similar patients to the given query api.Patient""" + return self._patients.find_similar_patients(patient=patient, n=n) + + def search(self, *args, **kwargs): + """Expose ElasticSearch method""" + return self._es.search(*args, **kwargs) + + def bulk(self, *args, **kwargs): + """Expose ElasticSearch method""" + return self._es.bulk(*args, **kwargs) + + def index(self, *args, **kwargs): + """Expose ElasticSearch method""" + return self._es.index(*args, **kwargs) + + def count(self, *args, **kwargs): + """Expose ElasticSearch method""" + return self._es.count(*args, **kwargs) + + @property + def indices(self): + """Expose ElasticSearch property""" + return self._es.indices + + +def initialize_backend(data_filename, hpo_filename, gene_filename): + backend = DatastoreConnection() + backend.index_hpo(hpo_filename) + backend.index_genes(gene_filename) + # Patients must be indexed AFTER vocabularies + backend.index_patients(data_filename) + + +def fetch_resource(url, filename): + if os.path.isfile(filename): + logger.info('Found local resource: {}'.format(filename)) + else: + logger.info('Downloading file from: {}'.format(url)) + urlretrieve(url, filename) + logger.info('Saved file to: {}'.format(filename)) + + +def parse_args(args): + from argparse import ArgumentParser + + description = "Initialize datastore with example data and necessary vocabularies" + + parser = ArgumentParser(description=description) + parser.add_argument("--data-file", default=DEFAULT_DATA_FILENAME, + dest="data_filename", metavar="FILE", + help="Load data from the following file (will download test data if file does not exist)") + parser.add_argument("--hpo-file", default=DEFAULT_HPO_FILENAME, + dest="hpo_filename", metavar="FILE", + help="Load HPO from the following 
file (will download if file does not exist)") + parser.add_argument("--gene-file", default=DEFAULT_GENE_FILENAME, + dest="gene_filename", metavar="FILE", + help="Load gene mappings from the following file (will download if file does not exist)") + + return parser.parse_args(args) + + +def main(args=sys.argv[1:]): + args = parse_args(args) + logging.basicConfig(level='INFO') + + fetch_resource(TEST_DATA_URL, args.data_filename) + fetch_resource(HPO_URL, args.hpo_filename) + fetch_resource(GENE_URL, args.gene_filename) + + initialize_backend(args.data_filename, args.hpo_filename, args.gene_filename) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/obo.py b/obo.py new file mode 100644 index 0000000..ecb4276 --- /dev/null +++ b/obo.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +""" +Downloaded from: https://github.com/ntamas/gfam/blob/master/gfam/go/obo.py + +A very simple and not 100% compliant parser for the OBO file format. + +This parser is supplied "as is"; it is not an official parser, it +might puke on perfectly valid OBO files, it might parse perfectly +invalid OBO files, it might steal your kitten or set your garden shed +on fire. Apart from that, it should be working, or at least +it should be in a suitable condition to parse the Gene Ontology, +which is my only test case anyway. + +Usage example:: + + from obo import Parser + parser = Parser(open("hp.obo")) + hp_ontology = {} + for stanza in parser: + hp_ontology[stanza.tags["id"][0]] = stanza.tags +""" + +__author__ = "Tamas Nepusz" +__email__ = "tamas@cs.rhul.ac.uk" +__copyright__ = "Copyright (c) 2009, Tamas Nepusz" +__license__ = "MIT" +__version__ = "0.1" + +__all__ = ["ParseError", "Stanza", "Parser", "Value"] + +try: + from io import StringIO +except ImportError: + from cStringIO import StringIO + +import re +import tokenize +import logging + + +class ParseError(Exception): + pass + + +# pylint:disable-msg=R0903 +# R0903: too few public methods +class Value(object): + """Class representing a value and its modifiers in the OBO file + + This class has two member variables. `value` is the value itself, + `modifiers` are the corresponding modifiers in a tuple. Currently + the modifiers are not parsed in any way, but this might change in + the future. + """ + + __slots__ = ["value", "modifiers"] + + def __init__(self, value, modifiers=()): + self.value = str(value) + if modifiers: + self.modifiers = tuple(modifiers) + else: + self.modifiers = None + + def __str__(self): + """Returns the value itself (without modifiers)""" + return str(self.value) + + def __repr__(self): + """Returns a Python representation of this object""" + return "{:s}({!r}, {!r})".format(self.__class__.__name__, + self.value, self.modifiers) + + +# pylint:disable-msg=R0903 +# R0903: too few public methods +class Stanza(object): + """Class representing an OBO stanza. + + An OBO stanza looks like this:: + + [name] + tag: value + tag: value + tag: value + + Values may optionally have modifiers, see the OBO specification + for more details. This class stores the stanza name in the + `name` member variable and the tags and values in a Python + dict called `tags`. Given a valid stanza, you can do stuff like + this: + + >>> stanza.name + "Term" + >>> stanza.tags["id"] + ['GO:0015036'] + >>> stanza.tags["name"] + ['disulfide oxidoreductase activity'] + + Note that the `tags` dict contains lists associated to each + tag name. This is because theoretically there could be more than + a single value associated to a tag in the OBO file format. 
+ """ + + __slots__ = ["name", "tags"] + + def __init__(self, name, tags=None): + self.name = name + if tags: + self.tags = dict(tags) + else: + self.tags = dict() + + def __repr__(self): + """Returns a Python representation of this object""" + return "{:s}({!r}, {!r})".format(self.__class__.__name__, + self.name, self.tags) + + +# pylint:disable-msg=R0903 +# R0903: too few public methods +class Parser(object): + """The main attraction, the OBO parser.""" + + def __init__(self, file_handle): + """Creates an OBO parser that reads the given file-like object. + If you want to create a parser that reads an OBO file, do this: + + >>> import obo + >>> parser = obo.Parser(open("hp.obo")) + + Only the headers are read when creating the parser. You can + access these right after construction as follows: + + >>> parser.headers["format-version"] + ['1.2'] + + To read the stanzas in the file, you must iterate over the + parser as if it were a list. The iterator yields `Stanza` + objects. + """ + self.file_handle = file_handle + self.line_re = re.compile(r"\s*(?P[^:]+):\s*(?P.*)") + self.lineno = 0 + self.headers = {} + self._extra_line = None + self._read_headers() + + def _lines(self): + """Iterates over the lines of the file, removing + comments and trailing newlines and merging multi-line + tag-value pairs into a single line""" + while True: + self.lineno += 1 + line = self.file_handle.readline() + if not line: + break + + line = line.strip() + if not line: + yield line + continue + + if line[0] == '!': + continue + if line[-1] == '\\': + # This line is continued in the next line + lines = [line[:-1]] + finished = False + while not finished: + self.lineno += 1 + line = self.file_handle.readline() + if line[0] == '!': + continue + line = line.strip() + if line[-1] == '\\': + lines.append(line[:-1]) + else: + lines.append(line) + finished = True + line = " ".join(lines) + else: + in_quotes = False + escape = False + comment_char_index = None + for index, char in enumerate(line): + if escape: + escape = False + continue + if char == '"': + in_quotes = not in_quotes + elif char == '\\' and in_quotes: + escape = True + elif char == '!' and not in_quotes: + comment_char_index = index + break + if comment_char_index is not None: + line = line[0:comment_char_index].strip() + + yield line + + def _parse_line(self, line): + """Parses a single line consisting of a tag-value pair + and optional modifiers. 
Returns the tag name and the + value as a `Value` object.""" + match = self.line_re.match(line) + if not match: + return False + tag, value_and_mod = match.group("tag"), match.group("value") + + # If the value starts with a quotation mark, we parse it as a + # Python string -- luckily this is the same as an OBO string + if value_and_mod and value_and_mod[0] == '"': + gen = tokenize.generate_tokens(StringIO(value_and_mod).readline) + for toknum, tokval, _, (_, ecol), _ in gen: + if toknum == tokenize.STRING: + value = eval(tokval) + mod = (value_and_mod[ecol:].strip(),) + break + raise ParseError("cannot parse string literal", self.lineno) + else: + value = value_and_mod + mod = None + + value = Value(value, mod) + return tag, value + + def _read_headers(self): + """Reads the headers from the OBO file""" + for line in self._lines(): + if not line or line[0] == '[': + # We have reached the end of headers + self._extra_line = line + return + key, value = self._parse_line(line) + try: + self.headers[key].append(value.value) + except KeyError: + self.headers[key] = [value.value] + + def stanzas(self): + """Iterates over the stanzas in this OBO file, + yielding a `Stanza` object for each stanza.""" + stanza = None + if self._extra_line and self._extra_line[0] == '[': + stanza = Stanza(self._extra_line[1:-1]) + for line in self._lines(): + if not line: + continue + if line[0] == '[': + if stanza: + yield stanza + stanza = Stanza(line[1:-1]) + continue + tag, value = self._parse_line(line) + try: + stanza.tags[tag].append(value) + except KeyError: + stanza.tags[tag] = [value] + + def __iter__(self): + return self.stanzas() + + +def test(): + """Simple smoke testing for the OBO parser""" + logging.basicConfig(level='INFO') + parser = Parser("hp.obo") + for i, _ in enumerate(parser): + if i % 1000 == 0: + logging.info("{:d} stanzas processed".format(i)) + logging.info("Parsing successful, {:d} stanzas".format(i)) + + +if __name__ == "__main__": + import sys + + sys.exit(test()) diff --git a/parsers.py b/parsers.py new file mode 100644 index 0000000..c667269 --- /dev/null +++ b/parsers.py @@ -0,0 +1,138 @@ +""" +Module for providing file and dataset parsing functionality. 
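+
+Each parser yields one dict per vocabulary term, suitable for indexing by
+datastore.VocabularyManager. For example (a sketch, assuming hp.obo has
+already been downloaded to the working directory):
+
+    from parsers import OBOParser
+    for term in OBOParser('hp.obo'):
+        print(term['id'], term['name'], len(term['term_category']))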
+""" + +from __future__ import with_statement, division, unicode_literals + +from csv import DictReader +from collections import defaultdict + +from obo import Parser as BaseOBOParser + + +class BaseParser: + def __init__(self, filename): + self._filename = filename + + def documents(self): + raise NotImplementedError() + + def __iter__(self): + return self.documents() + + +class OBOParser(BaseParser): + def documents(self): + parser = BaseOBOParser(open(self._filename)) + + # Parse all terms first + terms = {} + + def get_tag_strings(stanza, tag): + return list(map(str, stanza.tags.get(tag, []))) + + for stanza in parser: + id = str(stanza.tags['id'][0]) + name = str(stanza.tags['name'][0]) + alt_id = get_tag_strings(stanza, 'alt_id') + synonym = get_tag_strings(stanza, 'synonym') + is_a = get_tag_strings(stanza, 'is_a') + is_obsolete = get_tag_strings(stanza, 'is_obsolete') + # Skip obsolete terms + if is_obsolete and 'true' in is_obsolete: + continue + + terms[id] = { + 'id': id, + 'name': name, + 'synonym': synonym, + 'alt_id': alt_id, + 'is_a': is_a, + 'term_category': [], # Added later + } + + # Then compute ancestor paths for each term + def get_ancestors(node_id, ancestors=None): + if ancestors is None: + ancestors = set() + ancestors.add(node_id) + for parent_id in terms[node_id]['is_a']: + get_ancestors(parent_id, ancestors) + return ancestors + + for id in terms: + term = terms[id] + term['term_category'] = list(get_ancestors(id)) + + yield term + + +class TSVParser(BaseParser): + def _documents(self, columns): + with open(self._filename) as ifp: + reader = DictReader(ifp, delimiter='\t') + for row in reader: + term = defaultdict(list) + for column in columns: + key = column['column'] + field = column['field'] + prefix = column.get('prefix') + delimiter = column.get('delimiter') + length = column.get('length') + + value = row[key] + # Split multivalued fields + if delimiter: + values = value.split(delimiter) + else: + values = [value] + + # Ensure all lengths are correct + if length: + for value in values: + assert len(value) == length or not value + + # Prepend prefix to all values + if prefix: + values = ['{}:{}'.format(prefix, value) for value in values] + + term[field].extend(values) + + # ElasticSearch does not allow id field to be singleton list + assert len(term['id']) == 1 + term['id'] = term['id'][0] + yield term + + +class GeneParser(TSVParser): + def documents(self): + columns = [ + { + 'column': 'Ensembl ID(supplied by Ensembl)', + 'field': 'id', + 'length': 15, + }, + { + 'column': 'Approved Name', + 'field': 'name', + }, + { + 'column': 'Approved Symbol', + 'field': 'alt_id', + }, + { + 'column': 'Synonyms', + 'field': 'alt_id', + 'delimiter': ', ', + }, + { + 'column': 'Entrez Gene ID(supplied by NCBI)', + 'field': 'alt_id', + 'prefix': 'NCBIGene', + }, + { + 'column': 'HGNC ID', + 'field': 'alt_id', + }, + ] + return TSVParser._documents(self, columns) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d513307 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +Flask==0.10.1 +elasticsearch>=2.0.0,<3.0.0 diff --git a/server.py b/server.py new file mode 100644 index 0000000..917e8a9 --- /dev/null +++ b/server.py @@ -0,0 +1,67 @@ +""" +Usage: python server.py + +This is a minimum working example of a Matchmaker Exchange server. +It is intended strictly as a useful reference, and should not be +used in a production setting. 
+""" + +from __future__ import with_statement, division, unicode_literals + +import sys +import logging +import json + +import api +from flask import Flask, request, make_response, after_this_request + +DEFAULT_HOST = '0.0.0.0' +DEFAULT_PORT = 8000 +API_MIME_TYPE = 'application/vnd.ga4gh.matchmaker.v1.0+json' + +# Global flask application +app = Flask(__name__) +# Logger +logger = logging.getLogger(__name__) + + +# def add_content_type_header(response): +# response.headers['Content-Type'] = API_MIME_TYPE + +@app.route('/match', methods=['POST']) +def match(): + """Return patients similar to the query patient""" + logging.info("Getting flask request data") + data = request.get_json(force=True) + logging.info("Parsing query") + query = api.MatchRequest(data) + logging.info("Finding similar patients") + matches = api.match(query) + logging.info("Serializing response") + return (json.dumps(matches.to_json()), 200, {}) + + +def parse_args(args): + from argparse import ArgumentParser + + description = __doc__.strip() + + parser = ArgumentParser(description=description) + parser.add_argument("-p", "--port", default=DEFAULT_PORT, + dest="port", type=int, metavar="PORT", + help="The port the server will listen on") + parser.add_argument("--host", default=DEFAULT_HOST, + dest="host", metavar="IP", + help="The host the server will listen to (0.0.0.0 to listen globally)") + + return parser.parse_args(args) + + +def main(args=sys.argv[1:]): + args = parse_args(args) + logging.basicConfig(level='INFO') + app.run(host=args.host, port=args.port) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/test.py b/test.py new file mode 100644 index 0000000..b98337d --- /dev/null +++ b/test.py @@ -0,0 +1,137 @@ +import unittest +from unittest import TestCase + +import api +from elasticsearch import Elasticsearch +from datastore import DatastoreConnection + +class ElasticSearchTests(TestCase): + @classmethod + def setUpClass(cls): + cls.es = Elasticsearch() + + # Unittest test backwards compatibility to Python 2.X + try: + assertCountEqual = TestCase.assertCountEqual + except AttributeError: + assertCountEqual = TestCase.assertItemsEqual + + def test_patient_indexed(self): + record = self.es.get(index='patients', id='P0001135') + self.assertTrue(record['found']) + self.assertCountEqual(record['_source']['gene'], ['ENSG00000151092']) # NGLY1 + + def test_hpo_indexed(self): + term = self.es.get(index='hpo', id='HP:0000252') + self.assertTrue(term['found']) + doc = term['_source'] + self.assertEqual(doc['name'], 'Microcephaly') + self.assertAlmostEqual(len(doc['alt_id']), 4, delta=1) + self.assertIn('small head', doc['synonym']) + self.assertCountEqual(doc['is_a'], ['HP:0040195', 'HP:0007364']) + self.assertAlmostEqual(len(doc['term_category']), 19, delta=2) + + def test_gene_filter(self): + query = { + 'query': { + 'filtered': { + 'filter': { + 'term': { + 'gene': 'ENSG00000151092', # NGLY1 + } + } + } + } + } + results = self.es.search(index='patients', body=query) + self.assertEqual(results['hits']['total'], 8, "Expected 8 cases with NGLY1 gene") + + def test_phenotype_filter(self): + query = { + 'query': { + 'filtered': { + 'filter': { + 'term': { + 'phenotype': 'HP:0000118' + } + } + } + } + } + results = self.es.search(index='patients', body=query) + self.assertEqual(results['hits']['total'], 50, "Expected 50 cases with some phenotypic abnormality") + + def test_fuzzy_search(self): + query = { + 'query': { + 'bool': { + 'should': [ + {'match': {'phenotype': 'HP:0000252'}}, # Microcephaly + {'match': 
{'phenotype': 'HP:0000522'}}, # Alacrima + {'match': {'gene': 'NGLY1'}}, + ] + } + } + } + results = self.es.search(index='patients', body=query) + self.assertEqual(results['hits']['hits'][0]['_id'], 'P0001070') + + +class DatastoreTests(TestCase): + @classmethod + def setUpClass(cls): + cls.backend = DatastoreConnection() + + def test_get_term(self): + # Lookup term using alias + term = self.backend.get_vocabulary_term('HP:0001366') + + self.assertEqual(term['id'], 'HP:0000252') + self.assertEqual(term['name'], 'Microcephaly') + self.assertEqual(len(term['is_a']), 2) + self.assertAlmostEqual(len(term['term_category']), 20, delta=5) + + +class MatchRequestTests(TestCase): + def setUp(self): + self._request_template = { + 'patient': { + 'id': '1', + 'contact': { + 'name': 'First Last', + 'institution': 'Contact Institution', + 'href': 'first.last@example.com', + }, + 'features': [], + 'genomicFeatures': [], + } + } + + def test_gene_symbol_match(self): + data = self._request_template + data['patient']['features'].extend([ + { + 'id': 'HP:0000252', + 'label': 'Microcephaly', + }, + { + 'id': 'HP:0000522', + 'label': 'Alacrima', + }, + ]) + data['patient']['genomicFeatures'].append({ + 'gene': { + 'id': 'NGLY1' + } + }) + + request = api.MatchRequest(data) + response = api.match(request) + + self.assertTrue(isinstance(response, api.MatchResponse)) + results = response.to_json()['results'] + self.assertTrue(results) + + +if __name__ == '__main__': + unittest.main()
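The README TODO list includes tests for the gene index; the following is a rough sketch of what such a test might look like, run against the indexes built by `python datastore.py` (the class and method names below are hypothetical and not part of this changeset):

```python
import unittest
from unittest import TestCase

from datastore import DatastoreConnection


class GeneIndexTests(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.backend = DatastoreConnection()

    def test_gene_symbol_lookup(self):
        # The gene index stores the Ensembl ID as the primary id and the HGNC
        # symbol and synonyms as alt_id values, so a symbol lookup should
        # resolve to the Ensembl record (NGLY1 -> ENSG00000151092, the same
        # pairing used in ElasticSearchTests.test_patient_indexed).
        term = self.backend.get_vocabulary_term('NGLY1')
        self.assertEqual(term['id'], 'ENSG00000151092')
        self.assertIn('NGLY1', term['alt_id'])


if __name__ == '__main__':
    unittest.main()
```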