Initial version with basic match functionality

MatchmakerExchange · Jan 5, 2016 · abc78a7 · abc78a7
1 parent ed074f8
commit abc78a7
Show file tree

Hide file tree

Showing 9 changed files with 1,209 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ __pycache__/
 # Distribution / packaging
 .Python
 env/
+.virtualenv/
 build/
 develop-eggs/
 dist/

diff --git a/README.md b/README.md
@@ -1,2 +1,69 @@
-# reference-server
-A simple illustrative reference server for the Matchmaker Exchange API
+# Matchmaker Exchange Reference Server
+A simple illustrative reference server for the Matchmaker Exchange API.
+
+The server is backed by elasticsearch, and creates local indexes of the Human Phenotype Ontology, Ensembl-Entrez-HGNC gene symbol mappings, and the MME API benchmark set of 50 rare disease patients.
+
+## Dependencies
+- Python 3.X (not yet tested on 2.7 but should be easy to get working)
+- elasticsearch 2.X
+
+
+## Quickstart
+
+1. Start up a local elasticsearch cluster, for example:
+
+    ```bash
+    $ wget https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/distribution/tar/elasticsearch/2.1.1/elasticsearch-2.1.1.tar.gz
+    $ tar -xzf elasticsearch-2.1.1.tar.gz
+    $ cd elasticsearch-2.1.1/
+    $ ./bin/elasticsearch
+    ```
+
+1. Set up your Python virtual environment and install necessary Python packages, for example:
+
+    ```bash
+    $ virtualenv -p python3 --prompt="(mme-server)" .virtualenv
+    $ source .virtualenv/bin/activate
+    $ pip install -r requirements.txt
+    ```
+
+1. Download and index vocabularies and sample data:
+
+    ```bash
+    $ python datastore.py
+    ```
+
+1. Run tests:
+
+    ```bash
+    $ python test.py
+    ```
+
+1. Start up MME reference server:
+
+    ```bash
+    $ python server.py
+    ```
+
+    By default, the server listens globally (`--host 0.0.0.0`) on port 8000 (`--port 8000`).
+
+1. Try it out:
+
+    ```bash
+    $ curl -XPOST -d '{"patient":{ \
+      "id":"1", \
+      "contact": {"name":"Jane Doe", "href":"mailto:jdoe@example.edu"}, \
+      "features":[{"id":"HP:0000522"}], \
+      "genomicFeatures":[{"gene":{"id":"NGLY1"}}] \
+      }}' localhost:8000/match
+    ```
+
+
+## TODO
+- Avoid costly/redundant parsing of `api.Patient` objects when generating MatchResponse objects from patients in the database
+- Inspect the `Accept` header for API versioning
+- Add `Content-Type` header to responses
+- Handle errors with proper HTTP statuses and JSON message bodies
+- Add tests for gene index
+- Add end-to-end API query tests
+- Add parser tests
diff --git a/api.py b/api.py
@@ -0,0 +1,178 @@
+"""
+The API module:
+
+Contains API methods and classes for API objects.
+Handles parsing of API requests into API objects, and serializing API objects into API responses.
+
+Also contains some code to help convert API objects to their database representations.
+"""
+from __future__ import with_statement, division, unicode_literals
+
+import json
+
+from datastore import DatastoreConnection
+
+
+class Feature:
+    """A single phenotype entry from a patient's ``features`` list.
+
+    The entry's ``id`` is resolved through ``db.get_vocabulary_term`` and the
+    result is kept on ``self.term``. The entry counts as observed only when
+    its ``observed`` value is the string ``'yes'``, which is also the default
+    when the key is absent.
+    """
+
+    # Connection to backend to validate vocabulary terms. It is created when
+    # the class body runs and is shared by all instances.
+    db = DatastoreConnection()
+
+    def __init__(self, data):
+        """Parse one feature entry.
+
+        ``data`` is indexed like a dict: ``data['id']`` is required, while
+        ``observed`` is optional.
+        """
+        # TODO: parse ageOfOnset
+        observed_value = data.get('observed', 'yes')
+        self._observed = (observed_value == 'yes')
+
+        term_id = data['id']
+        self.term = self.db.get_vocabulary_term(term_id)
+
+    def _get_implied_terms(self):
+        """Return the ``term_category`` entry of the resolved term.
+
+        NOTE(review): presumably the term together with its ancestors --
+        confirm against the vocabulary index.
+        """
+        implied = self.term['term_category']
+        return implied
+
+    def _get_id(self):
+        """Return the ``id`` entry of the resolved term."""
+        term_id = self.term['id']
+        return term_id
+
+    @property
+    def observed(self):
+        """True when the feature was reported as observed."""
+        return self._observed
+
+
+class GenomicFeature:
+    """A single entry from a patient's ``genomicFeatures`` list.
+
+    Only the gene identifier (``data['gene']['id']``) is read. When it is
+    present and truthy it is resolved through ``db.get_vocabulary_term``;
+    otherwise ``self.term`` stays ``None``.
+    """
+
+    # Connection to backend to validate vocabulary terms. It is created when
+    # the class body runs and is shared by all instances.
+    db = DatastoreConnection()
+
+    def __init__(self, data):
+        """Parse one genomic feature entry; a missing gene is tolerated."""
+        # TODO: parse additional genomicFeature fields
+        gene = data.get('gene', {})
+        gene_id = gene.get('id')
+        self.term = self.db.get_vocabulary_term(gene_id) if gene_id else None
+
+    def _get_gene_id(self):
+        """Return the resolved term's ``id``, or ``None`` without a term."""
+        if not self.term:
+            return None
+        return self.term['id']
+
+
+class Patient:
+    """A patient record parsed from the ``patient`` object of an API request.
+
+    Phenotype entries become ``Feature`` objects and genomic entries become
+    ``GenomicFeature`` objects. ``to_json`` converts the record back into a
+    plain dict.
+    """
+
+    def __init__(self, data):
+        """Parse and validate a patient dict.
+
+        Args:
+            data: dict-like patient object. ``id`` and ``contact`` (with
+                ``name`` and ``href``) are required, and at least one of
+                ``features`` / ``genomicFeatures`` must be non-empty.
+                ``disorders``, ``label``, ``ageOfOnset`` and ``test`` are
+                optional.
+
+        Raises:
+            KeyError: if ``id``, ``contact``, or the contact's ``name`` /
+                ``href`` key is missing.
+            AssertionError: if the contact name or href is empty, or if
+                neither features nor genomic features are provided.
+        """
+        self.id = data['id']
+        self.contact = data['contact']
+
+        # Validation raises AssertionError explicitly instead of using the
+        # `assert` statement: `assert` is stripped under `python -O`, which
+        # would silently disable these checks. The exception type is kept
+        # so existing callers keep working.
+        if not (self.contact['name'] and self.contact['href']):
+            raise AssertionError("Contact must include a non-empty 'name' and 'href'")
+
+        features_json = data.get('features', [])
+        genomic_features_json = data.get('genomicFeatures', [])
+
+        if not (features_json or genomic_features_json):
+            raise AssertionError("At least one of 'features' or 'genomicFeatures' must be provided")
+
+        # Parse phenotype terms
+        features = [Feature(feature_json) for feature_json in features_json]
+
+        # Parse genomic features
+        genomic_features = [GenomicFeature(gf_json) for gf_json in genomic_features_json]
+
+        if not (features or genomic_features):
+            raise AssertionError("Was unable to parse any phenotype or gene terms")
+
+        self.label = data.get('label')
+        self.age_of_onset = data.get('ageOfOnset')
+        self.features = features
+        self.genomic_features = genomic_features
+        self.disorders = data.get('disorders', [])
+        self.test = data.get('test', False)
+
+    def _get_genes(self):
+        """Return the set of gene ids of all genomic features that have one."""
+        gene_ids = (gf._get_gene_id() for gf in self.genomic_features)
+        return {gene_id for gene_id in gene_ids if gene_id}
+
+    def _get_present_phenotypes(self):
+        """Return the set of term ids of all observed features."""
+        return {feature._get_id() for feature in self.features if feature.observed}
+
+    def _get_implied_present_phenotypes(self):
+        """Return the union of the implied terms of all observed features."""
+        terms = set()
+        for feature in self.features:
+            if feature.observed:
+                terms.update(feature._get_implied_terms())
+
+        return terms
+
+    def to_json(self):
+        """Return the patient as a plain dict.
+
+        ``id`` and ``contact`` (name and href only) are always present.
+        Optional keys are added only when their value is truthy. Only
+        observed features are emitted, and genomic features are reduced to
+        their gene ids.
+        """
+        data = {
+            'id': self.id,
+            'contact': {
+                'name': self.contact['name'],
+                'href': self.contact['href'],
+            }
+        }
+
+        if self.label:
+            data['label'] = self.label
+
+        if self.age_of_onset:
+            data['ageOfOnset'] = self.age_of_onset
+
+        phenotype_ids = self._get_present_phenotypes()
+        if phenotype_ids:
+            # `term_id` rather than `id`, to avoid shadowing the builtin
+            data['features'] = [{'id': term_id} for term_id in phenotype_ids]
+
+        gene_ids = self._get_genes()
+        if gene_ids:
+            data['genomicFeatures'] = [{'gene': {'id': gene_id}} for gene_id in gene_ids]
+
+        if self.disorders:
+            data['disorders'] = self.disorders
+
+        if self.test:
+            data['test'] = True
+
+        return data
+
+
+class MatchRequest:
+    """A parsed match request: the query patient plus the raw request."""
+
+    def __init__(self, request):
+        """Parse ``request['patient']`` into a ``Patient``.
+
+        The original request object is kept on ``self._data``.
+        """
+        patient_data = request['patient']
+        self.patient = Patient(patient_data)
+        self._data = request
+
+
+class MatchResult:
+    """One matched patient together with its score."""
+
+    def __init__(self, match, score):
+        """Store the matched patient (``match``) and its ``score``."""
+        self.match = match
+        self.score = score
+
+    def to_json(self):
+        """Return ``{'score': {'patient': score}, 'patient': match.to_json()}``."""
+        return {
+            'score': {'patient': self.score},
+            'patient': self.match.to_json(),
+        }
+
+
+def match(request, backend=None):
+    """Find patients similar to the one in ``request``.
+
+    Args:
+        request: the ``MatchRequest`` whose ``patient`` is the query.
+        backend: object whose ``find_similar_patients(patient)`` yields
+            ``(score, patient)`` pairs. A new ``DatastoreConnection`` is
+            created when this is ``None``.
+
+    Returns:
+        A ``MatchResponse`` wrapping one ``MatchResult`` per pair yielded by
+        the backend, in the backend's order.
+
+    Raises:
+        AssertionError: if ``request`` is not a ``MatchRequest``.
+    """
+    # Raised explicitly (not via `assert`) so the check survives `python -O`.
+    if not isinstance(request, MatchRequest):
+        raise AssertionError("Argument to match must be MatchRequest object")
+
+    # Only substitute the default for a missing backend, so that a supplied
+    # backend is never discarded just because it evaluates as falsy.
+    if backend is None:
+        backend = DatastoreConnection()
+
+    # Distinct names keep the query patient and the matched patients apart,
+    # and avoid shadowing this function's own name.
+    query_patient = request.patient
+    results = [
+        MatchResult(matched_patient, score)
+        for score, matched_patient in backend.find_similar_patients(query_patient)
+    ]
+
+    return MatchResponse(results)
+
+
+class MatchResponse:
+    """The response to a match request: an iterable of match results."""
+
+    def __init__(self, response):
+        """Keep the iterable of results (objects with ``to_json``)."""
+        self._data = response
+
+    def to_json(self):
+        """Return ``{'results': [...]}`` with each result serialized in order."""
+        results = []
+        for result in self._data:
+            results.append(result.to_json())
+        return {'results': results}
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,7 @@ __pycache__/ @@
     # Distribution / packaging
     .Python
     env/
+    .virtualenv/
     build/
     develop-eggs/
     dist/
@@ Expand Down @@