Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mongoengine genome #71

Open
wants to merge 5 commits into
base: feature/mongoengine
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@
init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))"
max-line-length=120

disable=missing-class-docstring, missing-function-docstring, line-too-long
disable=missing-class-docstring, missing-function-docstring, line-too-long, too-few-public-methods

[TYPECHECK]
ignored-classes=Assembly,Genome,Species

87 changes: 87 additions & 0 deletions common/mongoengine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
.. See the NOTICE file distributed with this work for additional information

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think using """ for the notice is wrong (I know the SOP in Confluence might be wrong as well): wrapping the notice in """ will cause issues if we ever generate documentation automatically. It is best to write this text as inline # comments instead.

regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from configparser import NoOptionError

import pymongo
from mongoengine import connect

from scripts.mongoengine_documents.base import ThoasDocument


class MongoDbClient:
    '''
    A pymongo wrapper class to take care of configuration and collection
    management
    '''

    def __init__(self, config, collection_name=None):
        '''
        Connect to MongoDB and work out which collection to use.

        config: a configparser object; connection settings and, optionally,
            the collection name are read from its 'MONGO DB' section
        collection_name: fallback name, only used when the config has no
            'collection' option

        Raises IOError when neither the config nor collection_name provides
        a collection name.
        '''
        self.mongo_db = MongoDbClient.connect_mongo(config)
        try:
            self.collection_name = config.get('MONGO DB', 'collection')
            print(f'Using MongoDB collection with name {self.collection_name} from config file')
        except NoOptionError as no_option_error:
            if not collection_name:
                raise IOError("Unable to find a MongoDB collection name") from no_option_error
            self.collection_name = collection_name
            print(f'Using MongoDB collection name {self.collection_name}')

        # We need to monkey-patch _get_collection_name so that all subclasses of ThoasDocument get written to the same
        # collection. The replacement is installed as a classmethod so that it keeps working when it is looked up
        # through a document instance as well as through the class; a plain zero-argument function would receive
        # the instance as an unexpected positional argument in that case.
        def _get_collection_name(_cls):
            # Read the attribute at call time so later changes to collection_name are honoured
            return self.collection_name
        ThoasDocument._get_collection_name = classmethod(_get_collection_name)

    @staticmethod
    def connect_mongo(config):
        '''
        Get a MongoDB connection

        Reads host, port, user, password and db from the 'MONGO DB' section of
        config, connects through mongoengine (authenticating against the
        'admin' database, preferring secondaries for reads) and returns the
        database object named by 'db'.
        '''

        host = config.get('MONGO DB', 'host')
        port = config.getint('MONGO DB', 'port')
        user = config.get('MONGO DB', 'user')
        password = config.get('MONGO DB', 'password')
        dbname = config.get('MONGO DB', 'db')

        client = connect(db=dbname, username=user, password=password, host=host, authentication_source='admin',
                         port=port, read_preference=pymongo.ReadPreference.SECONDARY_PREFERRED)

        print(f'connected to MongoDB {host}')
        return client[dbname]

    def collection(self):
        '''
        Get the currently set default collection to run queries against
        '''
        return self.mongo_db[self.collection_name]


class FakeMongoDbClient:
    '''
    Sets up a mongomock collection for thoas code to test with
    '''

    def __init__(self):
        '''
        Override default setup: connect to a mongomock-backed 'test_db'
        database and use the fixed collection name 'test'
        '''
        conn = connect('test_db', host='mongomock://localhost')
        self.mongo_db = conn['test_db']
        self.collection_name = 'test'

        # Route every ThoasDocument subclass to this client's collection. Installed as a classmethod so the
        # lookup also works through a document instance, not only through the class.
        def _get_collection_name(_cls):
            return self.collection_name
        ThoasDocument._get_collection_name = classmethod(_get_collection_name)

    def collection(self):
        '''
        Get the collection that queries are run against
        '''
        return self.mongo_db[self.collection_name]
2 changes: 1 addition & 1 deletion common/tests/test_transcript_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
from common.transcript_metadata import TSL, APPRIS, MANE, GencodeBasic, Biotype, EnsemblCanonical

from common.transcript_metadata import *

def test_parse_input_tsl1():
'''
Expand Down
5 changes: 3 additions & 2 deletions graphql_service/ariadne_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from graphql_service.resolver.gene_model import (
QUERY_TYPE, GENE_TYPE, TRANSCRIPT_TYPE, PGC_TYPE,
PRODUCT_TYPE, SLICE_TYPE, REGION_TYPE, GENE_METADATA_TYPE
PRODUCT_TYPE, SLICE_TYPE, REGION_TYPE, GENE_METADATA_TYPE, ASSEMBLY_TYPE
)


Expand All @@ -37,7 +37,8 @@ def prepare_executable_schema() -> GraphQLSchema:
PRODUCT_TYPE,
GENE_METADATA_TYPE,
SLICE_TYPE,
REGION_TYPE
REGION_TYPE,
ASSEMBLY_TYPE
)


Expand Down
12 changes: 10 additions & 2 deletions graphql_service/resolver/gene_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
GENE_METADATA_TYPE = ObjectType('GeneMetadata')
SLICE_TYPE = ObjectType('Slice')
REGION_TYPE = ObjectType('Region')
ASSEMBLY_TYPE = ObjectType('Assembly')


@QUERY_TYPE.field('gene')
Expand Down Expand Up @@ -219,6 +220,13 @@ def overlap_region(context: Dict, genome_id: str, region_id: str, start: int, en
return results


# We cannot use 'id' as the name of a field in mongoengine documents, because it will clash with the auto-generated
# 'id' field. This converts the mongoengine id fields to the 'id' field used in the GraphQL schema
@ASSEMBLY_TYPE.field('id')
def resolve_assembly_id(assembly: Dict, _: GraphQLResolveInfo) -> str:
    'Serve the stored assembly_id value as the id field of an Assembly'
    stored_id = assembly['assembly_id']
    return stored_id


@PGC_TYPE.field('three_prime_utr')
def resolve_three_prime_utr(pgc: Dict, _: GraphQLResolveInfo) -> Optional[Dict]:
'Convert stored 3` UTR to GraphQL compatible form'
Expand Down Expand Up @@ -247,7 +255,7 @@ def resolve_product_by_id(_, info: GraphQLResolveInfo, genome_id: str, stable_id
result = collection.find_one(query)

if not result:
raise ProductNotFoundError(genome_id, stable_id)
raise ProductNotFoundError(stable_id, genome_id)
return result


Expand Down Expand Up @@ -298,7 +306,7 @@ async def resolve_assembly(region: Dict, info: GraphQLResolveInfo) -> Optional[D

query = {
'type': 'Assembly',
'id': region['assembly_id']
'assembly_id': region['assembly_id']
}

collection = info.context['mongo_db']
Expand Down
2 changes: 1 addition & 1 deletion graphql_service/tests/fixtures/human_brca2.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ def build_assembly():
return {
"type": "Assembly",
"default": True,
"id": "GRCh38.p13",
"assembly_id": "GRCh38.p13",
"name": "GRCh38",
"accession_id": "GCA_000001405.28",
"accessioning_body": "EGA",
Expand Down
14 changes: 14 additions & 0 deletions graphql_service/tests/snapshots/snap_test_slice_retrieval.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
"""
.. See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

# -*- coding: utf-8 -*-
# snapshottest: v1 - https://goo.gl/zC4yUc

Expand Down
10 changes: 5 additions & 5 deletions scripts/load_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,13 @@ def load_gene_info(mongo_client, json_file, cds_info, assembly, genome, phase_in
strand=int(gene['strand']),
start=int(gene['start']),
end=int(gene['end']),
genome_id=genome['id']
genome_id=genome['genome_id']
),
'transcripts': [
[common.utils.get_stable_id(transcript["id"], transcript["version"]) \
for transcript in gene['transcripts']]
],
'genome_id': genome['id'],
'genome_id': genome['genome_id'],
'external_references': gene_xrefs,
'metadata' : gene_metadata
}
Expand All @@ -150,7 +150,7 @@ def load_gene_info(mongo_client, json_file, cds_info, assembly, genome, phase_in
transcript=transcript,
gene=gene,
region_name=gene['seq_region_name'],
genome_id=genome['id'],
genome_id=genome['genome_id'],
cds_info=cds_info,
phase_info=phase_info,
tr_metadata_info=tr_metadata_info
Expand Down Expand Up @@ -477,13 +477,13 @@ def preload_classifiers(classifier_path):
ASSEMBLY, GENOME = get_genome_assembly(ASSEMBLY_NAME, MONGO_CLIENT)

if ARGS.log_faulty_urls:
URL_LOGGER = ThoasLogging(logging_file=f'url_log_{GENOME["id"]}', logger_name=f'url_logger_{GENOME["id"]}')
URL_LOGGER = ThoasLogging(logging_file=f'url_log_{GENOME["genome_id"]}', logger_name=f'url_logger_{GENOME["genome_id"]}')

print("Loading gene info into Mongo")

load_gene_info(MONGO_CLIENT, JSON_FILE, CDS_INFO, ASSEMBLY, GENOME, PHASE_INFO, TRANSCRIPT_METADATA, METADATA_CLASSIFIER, GENE_NAME_METADATA, XREF_RESOLVER, URL_LOGGER)

TRANSLATIONS_FILE = f'{ARGS.species}_{ARGS.assembly}_translations.json'
load_product_info(MONGO_CLIENT, TRANSLATIONS_FILE, CDS_INFO, GENOME['id'])
load_product_info(MONGO_CLIENT, TRANSLATIONS_FILE, CDS_INFO, GENOME['genome_id'])

create_index(MONGO_CLIENT)
50 changes: 22 additions & 28 deletions scripts/load_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@
import pymongo

import common.utils
from common.mongo import MongoDbClient
from common.mongoengine import MongoDbClient
from scripts.mongoengine_documents.genome import Assembly, Species, Genome


def load_genome_info(mongo_client, source_file):
def load_genome_info(source_file):
'''
Load assembly, species and organism information from a JSON file
and create a new collection to put them in. Run before load_genes.py
Expand All @@ -31,33 +32,27 @@ def load_genome_info(mongo_client, source_file):
except json.decoder.JSONDecodeError as error:
raise IOError(f'Failed to parse genome file at {source_file} with error {error}') from error

mongo_client.collection().insert_one({
'type': 'Assembly',
'default': True,
'id': doc['assembly']['name'],
'name': doc['assembly']['default'],
'accession_id': doc['assembly']['accession'],
'accessioning_body': 'EGA',
'species': doc['organism']['name']
})
Assembly(default=True,
assembly_id=doc['assembly']['name'],
name=doc['assembly']['default'],
accession_id=doc['assembly']['accession'],
accessioning_body='EGA',
species=doc['organism']['name']
).save()

mongo_client.collection().insert_one({
'type': 'Species',
'id': doc['organism']['name'],
'scientific_name': doc['organism']['scientific_name'],
'taxon_id': doc['organism']['species_taxonomy_id']
})
Species(species_id=doc['organism']['name'],
scientific_name=doc['organism']['scientific_name'],
taxon_id=doc['organism']['species_taxonomy_id']
).save()

# "Genome" (name to be used quietly), represents the sum of related
# information that people will want to use together. It allows users
# to remember less between interactions, and to ask shorter queries
mongo_client.collection().insert_one({
'type': 'Genome',
'id': common.utils.get_genome_id(doc['organism']['name'], doc['assembly']['accession']),
'name': doc['assembly']['default'],
'assembly': doc['assembly']['name'],
'species': doc['organism']['name'],
})
Genome(genome_id=common.utils.get_genome_id(doc['organism']['name'], doc['assembly']['accession']),
name=doc['assembly']['default'],
assembly=doc['assembly']['name'],
species=doc['organism']['name']
).save()


def create_index(mongo_client):
Expand All @@ -79,14 +74,13 @@ def create_index(mongo_client):

ARGS = common.utils.parse_args()
CONFIG = common.utils.load_config(ARGS.config_file)
MONGO_COLLECTION = ARGS.mongo_collection
MONGO_CLIENT = MongoDbClient(CONFIG, MONGO_COLLECTION)
MONGO_CLIENT = MongoDbClient(CONFIG, ARGS.mongo_collection)

# Combine arguments to give the path to the relevant $species_genome.json file
# Directory structure differs if a collection is involved
if ARGS.collection:
JSON_FILE = f'{ARGS.data_path}/{ARGS.collection}/{ARGS.species}/{ARGS.species}_genome.json'
else:
JSON_FILE = f'{ARGS.data_path}/{ARGS.species}/{ARGS.species}_genome.json'

load_genome_info(MONGO_CLIENT, JSON_FILE)
load_genome_info(JSON_FILE)
create_index(MONGO_CLIENT)
2 changes: 1 addition & 1 deletion scripts/load_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def load_regions(config, section_name, chr_checksums_path, mongo_client):
'type': 'Assembly',
'name': config.get(section_name, 'assembly')
})
assembly_id = assembly["id"]
assembly_id = assembly["assembly_id"]

mysql_client = MySQLClient(config, section_name)

Expand Down
71 changes: 71 additions & 0 deletions scripts/mongoengine_documents/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
""" See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License."""

from mongoengine import Document, StringField, EmbeddedDocumentField, EmbeddedDocument, IntField, BooleanField


class ThoasDocument(Document):
    '''
    Base mongoengine Document for thoas document types.

    Inheritance is switched on through meta so that concrete document types
    can subclass this one. MongoDbClient / FakeMongoDbClient in
    common/mongoengine.py replace _get_collection_name on this class so that
    all subclasses are written to the same collection.
    '''
    meta = {'allow_inheritance': True}


class ExternalDB(EmbeddedDocument):
    '''
    Embedded document with the string attributes of an external database:
    name, external_db_id, description, url and release. Used as the 'source'
    of an ExternalReference.
    '''
    name = StringField()
    external_db_id = StringField()
    description = StringField()
    url = StringField()
    release = StringField()


class ExternalMethod(EmbeddedDocument):
    '''
    Embedded document with a type and a description, both strings. Used as
    the 'assignment_method' of an ExternalReference.
    '''
    # NOTE(review): 'type' shadows the builtin inside the class body only; presumably the name is dictated by
    # the stored/GraphQL schema - confirm before renaming
    type = StringField()
    description = StringField()


class ExternalReference(EmbeddedDocument):
    '''
    Embedded document for a cross-reference: string accession_id, name,
    description and url, plus two nested embedded documents.
    '''
    accession_id = StringField()
    name = StringField()
    description = StringField()
    # Nested ExternalMethod document
    assignment_method = EmbeddedDocumentField(ExternalMethod)
    url = StringField()
    # Nested ExternalDB document
    source = EmbeddedDocumentField(ExternalDB)


class Alphabet(EmbeddedDocument):
    '''
    Embedded document with five string fields: accession_id, label, value,
    definition and description. Embedded in Sequence as its 'alphabet'.
    '''
    accession_id = StringField()
    label = StringField()
    value = StringField()
    definition = StringField()
    description = StringField()


class Sequence(EmbeddedDocument):
    '''
    Embedded document pairing an Alphabet with a checksum string.
    '''
    alphabet = EmbeddedDocumentField(Alphabet)
    # NOTE(review): the checksum algorithm is not visible here - confirm against the loader
    checksum = StringField()


class Location(EmbeddedDocument):
    '''
    Embedded document with integer start, end and length fields. Embedded in
    Slice as its 'location'.
    '''
    # NOTE(review): coordinate convention (0- or 1-based, inclusive or not) is not defined here - confirm
    start = IntField()
    end = IntField()
    length = IntField()


class Strand(EmbeddedDocument):
    '''
    Embedded document with a string code and an integer value. Embedded in
    Slice as its 'strand'.
    '''
    # NOTE(review): presumably code is a label for the strand and value its numeric form - confirm
    code = StringField()
    value = IntField()

class Slice(EmbeddedDocument):
    '''
    Embedded document combining a region_id, a Location, a Strand, a boolean
    'default' flag and a genome_id.
    '''
    region_id = StringField()
    # Nested Location document (start / end / length)
    location = EmbeddedDocumentField(Location)
    # Nested Strand document (code / value)
    strand = EmbeddedDocumentField(Strand)
    default = BooleanField()
    genome_id = StringField()
Loading