diff --git a/.pylintrc b/.pylintrc index a73e7eea..7c1909aa 100644 --- a/.pylintrc +++ b/.pylintrc @@ -2,4 +2,8 @@ init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))" max-line-length=120 -disable=missing-class-docstring, missing-function-docstring, line-too-long +disable=missing-class-docstring, missing-function-docstring, line-too-long, too-few-public-methods + +[TYPECHECK] +ignored-classes=Assembly,Genome,Species + diff --git a/common/mongoengine.py b/common/mongoengine.py new file mode 100644 index 00000000..bc07f516 --- /dev/null +++ b/common/mongoengine.py @@ -0,0 +1,87 @@ +""" +.. See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from configparser import NoOptionError + +import pymongo +from mongoengine import connect + +from scripts.mongoengine_documents.base import ThoasDocument + + +class MongoDbClient: + ''' + A pymongo wrapper class to take care of configuration and collection + management + ''' + + def __init__(self, config, collection_name=None): + ''' + Note that config here is a configparser object + ''' + self.mongo_db = MongoDbClient.connect_mongo(config) + try: + self.collection_name = config.get('MONGO DB', 'collection') + print(f'Using MongoDB collection with name {self.collection_name} from config file') + except NoOptionError as no_option_error: + if not collection_name: + raise IOError("Unable to find a MongoDB collection name") from no_option_error + self.collection_name = collection_name + print(f'Using MongoDB collection name {self.collection_name}') + + # We need to monkey-patch _get_collection_name so that all subclasses of ThoasDocument get written to the same + # collection + def _get_collection_name(): + return self.collection_name + ThoasDocument._get_collection_name = _get_collection_name + + @staticmethod + def connect_mongo(config): + 'Get a MongoDB connection' + + host = config.get('MONGO DB', 'host') + port = config.getint('MONGO DB', 'port') + user = config.get('MONGO DB', 'user') + password = config.get('MONGO DB', 'password') + dbname = config.get('MONGO DB', 'db') + + client = connect(db=dbname, username=user, password=password, host=host, authentication_source='admin', + port=port, read_preference=pymongo.ReadPreference.SECONDARY_PREFERRED) + + print('connected to MongoDB ' + host) + return client[dbname] + + def collection(self): + ''' + Get the currently set default collection to run queries against + ''' + return self.mongo_db[self.collection_name] + + +class FakeMongoDbClient: + ''' + Sets up a mongomock collection for thoas code to test with + ''' + + def __init__(self): + 'Override default setup' + conn = connect('test_db', 
host='mongomock://localhost') + self.mongo_db = conn['test_db'] + self.collection_name = 'test' + + def _get_collection_name(): + return self.collection_name + ThoasDocument._get_collection_name = _get_collection_name + + def collection(self): + return self.mongo_db[self.collection_name] diff --git a/common/tests/test_transcript_metadata.py b/common/tests/test_transcript_metadata.py index e04d1114..78d0be5c 100644 --- a/common/tests/test_transcript_metadata.py +++ b/common/tests/test_transcript_metadata.py @@ -11,8 +11,8 @@ See the License for the specific language governing permissions and limitations under the License. """ +from common.transcript_metadata import TSL, APPRIS, MANE, GencodeBasic, Biotype, EnsemblCanonical -from common.transcript_metadata import * def test_parse_input_tsl1(): ''' diff --git a/graphql_service/ariadne_app.py b/graphql_service/ariadne_app.py index e0b75ffc..3ffd95c5 100644 --- a/graphql_service/ariadne_app.py +++ b/graphql_service/ariadne_app.py @@ -19,7 +19,7 @@ from graphql_service.resolver.gene_model import ( QUERY_TYPE, GENE_TYPE, TRANSCRIPT_TYPE, PGC_TYPE, - PRODUCT_TYPE, SLICE_TYPE, REGION_TYPE, GENE_METADATA_TYPE + PRODUCT_TYPE, SLICE_TYPE, REGION_TYPE, GENE_METADATA_TYPE, ASSEMBLY_TYPE ) @@ -37,7 +37,8 @@ def prepare_executable_schema() -> GraphQLSchema: PRODUCT_TYPE, GENE_METADATA_TYPE, SLICE_TYPE, - REGION_TYPE + REGION_TYPE, + ASSEMBLY_TYPE ) diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index f3019262..edb8fd4e 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -26,6 +26,7 @@ GENE_METADATA_TYPE = ObjectType('GeneMetadata') SLICE_TYPE = ObjectType('Slice') REGION_TYPE = ObjectType('Region') +ASSEMBLY_TYPE = ObjectType('Assembly') @QUERY_TYPE.field('gene') @@ -219,6 +220,13 @@ def overlap_region(context: Dict, genome_id: str, region_id: str, start: int, en return results +# We cannot use 'id' as the name of a field in mongoengine 
documents, because it will clash with the auto-generated +# 'id' field. This converts the mongoengine id fields to the 'id' field used in the GraphQL schema +@ASSEMBLY_TYPE.field('id') +def resolve_assembly_id(assembly: Dict, _: GraphQLResolveInfo) -> str: + return assembly['assembly_id'] + + @PGC_TYPE.field('three_prime_utr') def resolve_three_prime_utr(pgc: Dict, _: GraphQLResolveInfo) -> Optional[Dict]: 'Convert stored 3` UTR to GraphQL compatible form' @@ -247,7 +255,7 @@ def resolve_product_by_id(_, info: GraphQLResolveInfo, genome_id: str, stable_id result = collection.find_one(query) if not result: - raise ProductNotFoundError(genome_id, stable_id) + raise ProductNotFoundError(stable_id, genome_id) return result @@ -298,7 +306,7 @@ async def resolve_assembly(region: Dict, info: GraphQLResolveInfo) -> Optional[D query = { 'type': 'Assembly', - 'id': region['assembly_id'] + 'assembly_id': region['assembly_id'] } collection = info.context['mongo_db'] diff --git a/graphql_service/tests/fixtures/human_brca2.py b/graphql_service/tests/fixtures/human_brca2.py index c2fdf9e3..49955e96 100644 --- a/graphql_service/tests/fixtures/human_brca2.py +++ b/graphql_service/tests/fixtures/human_brca2.py @@ -385,7 +385,7 @@ def build_assembly(): return { "type": "Assembly", "default": True, - "id": "GRCh38.p13", + "assembly_id": "GRCh38.p13", "name": "GRCh38", "accession_id": "GCA_000001405.28", "accessioning_body": "EGA", diff --git a/graphql_service/tests/snapshots/snap_test_slice_retrieval.py b/graphql_service/tests/snapshots/snap_test_slice_retrieval.py index caf724a7..34106fce 100644 --- a/graphql_service/tests/snapshots/snap_test_slice_retrieval.py +++ b/graphql_service/tests/snapshots/snap_test_slice_retrieval.py @@ -1,3 +1,17 @@ +""" +.. See the NOTICE file distributed with this work for additional information + regarding copyright ownership. 
+ Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + # -*- coding: utf-8 -*- # snapshottest: v1 - https://goo.gl/zC4yUc diff --git a/scripts/load_genes.py b/scripts/load_genes.py index 65998cca..b347d4e6 100644 --- a/scripts/load_genes.py +++ b/scripts/load_genes.py @@ -132,13 +132,13 @@ def load_gene_info(mongo_client, json_file, cds_info, assembly, genome, phase_in strand=int(gene['strand']), start=int(gene['start']), end=int(gene['end']), - genome_id=genome['id'] + genome_id=genome['genome_id'] ), 'transcripts': [ [common.utils.get_stable_id(transcript["id"], transcript["version"]) \ for transcript in gene['transcripts']] ], - 'genome_id': genome['id'], + 'genome_id': genome['genome_id'], 'external_references': gene_xrefs, 'metadata' : gene_metadata } @@ -150,7 +150,7 @@ def load_gene_info(mongo_client, json_file, cds_info, assembly, genome, phase_in transcript=transcript, gene=gene, region_name=gene['seq_region_name'], - genome_id=genome['id'], + genome_id=genome['genome_id'], cds_info=cds_info, phase_info=phase_info, tr_metadata_info=tr_metadata_info @@ -477,13 +477,13 @@ def preload_classifiers(classifier_path): ASSEMBLY, GENOME = get_genome_assembly(ASSEMBLY_NAME, MONGO_CLIENT) if ARGS.log_faulty_urls: - URL_LOGGER = ThoasLogging(logging_file=f'url_log_{GENOME["id"]}', logger_name=f'url_logger_{GENOME["id"]}') + URL_LOGGER = ThoasLogging(logging_file=f'url_log_{GENOME["genome_id"]}', logger_name=f'url_logger_{GENOME["genome_id"]}') print("Loading gene info into Mongo") 
load_gene_info(MONGO_CLIENT, JSON_FILE, CDS_INFO, ASSEMBLY, GENOME, PHASE_INFO, TRANSCRIPT_METADATA, METADATA_CLASSIFIER, GENE_NAME_METADATA, XREF_RESOLVER, URL_LOGGER) TRANSLATIONS_FILE = f'{ARGS.species}_{ARGS.assembly}_translations.json' - load_product_info(MONGO_CLIENT, TRANSLATIONS_FILE, CDS_INFO, GENOME['id']) + load_product_info(MONGO_CLIENT, TRANSLATIONS_FILE, CDS_INFO, GENOME['genome_id']) create_index(MONGO_CLIENT) diff --git a/scripts/load_genome.py b/scripts/load_genome.py index 37bd285d..9e62fd30 100644 --- a/scripts/load_genome.py +++ b/scripts/load_genome.py @@ -16,10 +16,11 @@ import pymongo import common.utils -from common.mongo import MongoDbClient +from common.mongoengine import MongoDbClient +from scripts.mongoengine_documents.genome import Assembly, Species, Genome -def load_genome_info(mongo_client, source_file): +def load_genome_info(source_file): ''' Load assembly, species and organism information from a JSON file and create a new collection to put them in. Run before load_genes.py @@ -31,33 +32,27 @@ def load_genome_info(mongo_client, source_file): except json.decoder.JSONDecodeError as error: raise IOError(f'Failed to parse genome file at {source_file} with error {error}') from error - mongo_client.collection().insert_one({ - 'type': 'Assembly', - 'default': True, - 'id': doc['assembly']['name'], - 'name': doc['assembly']['default'], - 'accession_id': doc['assembly']['accession'], - 'accessioning_body': 'EGA', - 'species': doc['organism']['name'] - }) + Assembly(default=True, + assembly_id=doc['assembly']['name'], + name=doc['assembly']['default'], + accession_id=doc['assembly']['accession'], + accessioning_body='EGA', + species=doc['organism']['name'] + ).save() - mongo_client.collection().insert_one({ - 'type': 'Species', - 'id': doc['organism']['name'], - 'scientific_name': doc['organism']['scientific_name'], - 'taxon_id': doc['organism']['species_taxonomy_id'] - }) + Species(species_id=doc['organism']['name'], + 
scientific_name=doc['organism']['scientific_name'], + taxon_id=doc['organism']['species_taxonomy_id'] + ).save() # "Genome" (name to be used quietly), represents the sum of related # information that people will want to use together. It allows users # remember less between interactions, and ask shorter queries - mongo_client.collection().insert_one({ - 'type': 'Genome', - 'id': common.utils.get_genome_id(doc['organism']['name'], doc['assembly']['accession']), - 'name': doc['assembly']['default'], - 'assembly': doc['assembly']['name'], - 'species': doc['organism']['name'], - }) + Genome(genome_id=common.utils.get_genome_id(doc['organism']['name'], doc['assembly']['accession']), + name=doc['assembly']['default'], + assembly=doc['assembly']['name'], + species=doc['organism']['name'] + ).save() def create_index(mongo_client): @@ -79,14 +74,13 @@ def create_index(mongo_client): ARGS = common.utils.parse_args() CONFIG = common.utils.load_config(ARGS.config_file) - MONGO_COLLECTION = ARGS.mongo_collection - MONGO_CLIENT = MongoDbClient(CONFIG, MONGO_COLLECTION) + MONGO_CLIENT = MongoDbClient(CONFIG, ARGS.mongo_collection) + # Combine arguments to give the path to the relevant $species_genome.json file # Directory structure differs if a collection is involved if ARGS.collection: JSON_FILE = f'{ARGS.data_path}/{ARGS.collection}/{ARGS.species}/{ARGS.species}_genome.json' else: JSON_FILE = f'{ARGS.data_path}/{ARGS.species}/{ARGS.species}_genome.json' - - load_genome_info(MONGO_CLIENT, JSON_FILE) + load_genome_info(JSON_FILE) create_index(MONGO_CLIENT) diff --git a/scripts/load_regions.py b/scripts/load_regions.py index 5f97e638..c4e1099a 100644 --- a/scripts/load_regions.py +++ b/scripts/load_regions.py @@ -27,7 +27,7 @@ def load_regions(config, section_name, chr_checksums_path, mongo_client): 'type': 'Assembly', 'name': config.get(section_name, 'assembly') }) - assembly_id = assembly["id"] + assembly_id = assembly["assembly_id"] mysql_client = MySQLClient(config, 
section_name) diff --git a/scripts/mongoengine_documents/base.py b/scripts/mongoengine_documents/base.py new file mode 100644 index 00000000..96fa5018 --- /dev/null +++ b/scripts/mongoengine_documents/base.py @@ -0,0 +1,71 @@ +""" See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.""" + +from mongoengine import Document, StringField, EmbeddedDocumentField, EmbeddedDocument, IntField, BooleanField + + +class ThoasDocument(Document): + meta = {'allow_inheritance': True} + + +class ExternalDB(EmbeddedDocument): + name = StringField() + external_db_id = StringField() + description = StringField() + url = StringField() + release = StringField() + + +class ExternalMethod(EmbeddedDocument): + type = StringField() + description = StringField() + + +class ExternalReference(EmbeddedDocument): + accession_id = StringField() + name = StringField() + description = StringField() + assignment_method = EmbeddedDocumentField(ExternalMethod) + url = StringField() + source = EmbeddedDocumentField(ExternalDB) + + +class Alphabet(EmbeddedDocument): + accession_id = StringField() + label = StringField() + value = StringField() + definition = StringField() + description = StringField() + + +class Sequence(EmbeddedDocument): + alphabet = EmbeddedDocumentField(Alphabet) + checksum = StringField() + + +class Location(EmbeddedDocument): + start = IntField() + end = IntField() + length = IntField() + + +class 
Strand(EmbeddedDocument): + code = StringField() + value = IntField() + + +class Slice(EmbeddedDocument): + region_id = StringField() + location = EmbeddedDocumentField(Location) + strand = EmbeddedDocumentField(Strand) + default = BooleanField() + genome_id = StringField() diff --git a/scripts/mongoengine_documents/genome.py b/scripts/mongoengine_documents/genome.py new file mode 100644 index 00000000..97cfd1cf --- /dev/null +++ b/scripts/mongoengine_documents/genome.py @@ -0,0 +1,40 @@ +""" See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License.""" + +from mongoengine import IntField, StringField, BooleanField + +from scripts.mongoengine_documents.base import ThoasDocument + + +class Assembly(ThoasDocument): + type = StringField(default="Assembly") + default = BooleanField() + assembly_id = StringField() + name = StringField() + accession_id = StringField() + accessioning_body = StringField() + species = StringField() + + +class Species(ThoasDocument): + type = StringField(default="Species") + species_id = StringField() + scientific_name = StringField() + taxon_id = IntField() + + +class Genome(ThoasDocument): + type = StringField(default="Genome") + genome_id = StringField() + name = StringField() + assembly = StringField() + species = StringField() diff --git a/scripts/prepare_gene_name_metadata.py b/scripts/prepare_gene_name_metadata.py index c1c381b8..ebf1c90f 100644 --- a/scripts/prepare_gene_name_metadata.py +++ b/scripts/prepare_gene_name_metadata.py @@ -49,7 +49,8 @@ def retrieve_gene_name_metadata(sp_production_name, sp_assembly_name, mysql_curs } gene_name_info = extractor(gene, gene_name_info) - if gene_name_info.get('external_db_name') is None: gene_name_info = backup_extractor(gene, gene_name_info) + if gene_name_info.get('external_db_name') is None: + gene_name_info = backup_extractor(gene, gene_name_info) # Add all the possible information retrieved. Also add if no 'display_xref_id' and 'description'. 
gene_names.append(gene_name_info) diff --git a/scripts/tests/plasmodium_falciparum_genome.json b/scripts/tests/plasmodium_falciparum_genome.json new file mode 100644 index 00000000..d2c6ba24 --- /dev/null +++ b/scripts/tests/plasmodium_falciparum_genome.json @@ -0,0 +1,48 @@ +{ + "has_variations": "false", + "division": "EnsemblProtists", + "has_peptide_compara": "false", + "has_other_alignments": "false", + "has_synteny": "false", + "species_id": 1, + "dbname": "plasmodium_falciparum_core_53_106_1", + "reference": null, + "has_genome_alignments": "false", + "genebuild": "2017-10-ENA", + "id": "plasmodium_falciparum", + "has_pan_compara": "false", + "organism": { + "name": "plasmodium_falciparum", + "url_name": "Plasmodium_falciparum", + "display_name": "Plasmodium falciparum 3D7", + "taxonomy_id": 36329, + "lineage": [ + "Plasmodium falciparum", + "Plasmodium (Laverania)", + "Plasmodium", + "Plasmodiidae", + "Haemosporida", + "Aconoidasida", + "Apicomplexa", + "Alveolata", + "Sar", + "Eukaryota", + "cellular organisms" + ], + "scientific_name": "Plasmodium falciparum 3D7", + "strain": null, + "aliases": [ + "Plasmodium falciparum 3D7", + "plasmodium_falciparum_3d7" + ], + "serotype": null, + "species_taxonomy_id": 5833 + }, + "assembly": { + "level": "chromosome", + "name": "ASM276v2", + "ucsc": null, + "default": "ASM276v2", + "accession": "GCA_000002765.2" + } +} \ No newline at end of file diff --git a/scripts/tests/test_load_genome.py b/scripts/tests/test_load_genome.py new file mode 100644 index 00000000..41114ab5 --- /dev/null +++ b/scripts/tests/test_load_genome.py @@ -0,0 +1,60 @@ +""" +.. See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import os + +from mongoengine import connect, disconnect + +from scripts.load_genome import load_genome_info +from scripts.mongoengine_documents.genome import Assembly, Species, Genome + + +def test_load_genome_info(): + connect('mongoenginetest', host='mongomock://localhost') + path_to_parent_dir = os.path.dirname(os.path.realpath(__file__)) + load_genome_info(os.path.join(path_to_parent_dir, "plasmodium_falciparum_genome.json")) + + assert Assembly.objects.count() == Species.objects.count() == Genome.objects.count() == 1 + assembly_json = Assembly.objects[0].to_mongo() + species_json = Species.objects[0].to_mongo() + genome_json = Genome.objects[0].to_mongo() + + # id assignment is handled by mongoengine + del assembly_json['_id'] + del species_json['_id'] + del genome_json['_id'] + + assert assembly_json == {"_cls": "ThoasDocument.Assembly", + "type": "Assembly", + "default": True, + "assembly_id": "ASM276v2", + "name": "ASM276v2", + "accession_id": "GCA_000002765.2", + "accessioning_body": "EGA", + "species": "plasmodium_falciparum"} + + assert species_json == {'_cls': 'ThoasDocument.Species', + 'scientific_name': 'Plasmodium falciparum 3D7', + 'species_id': 'plasmodium_falciparum', + 'taxon_id': 5833, + 'type': 'Species'} + + assert genome_json == {'_cls': 'ThoasDocument.Genome', + 'assembly': 'ASM276v2', + 'genome_id': 'plasmodium_falciparum_GCA_000002765_2', + 'name': 'ASM276v2', + 'species': 'plasmodium_falciparum', + 'type': 'Genome'} + + disconnect() diff --git a/scripts/tests/test_scripts.py b/scripts/tests/test_scripts.py index 9e452235..d05656c5 
100644 --- a/scripts/tests/test_scripts.py +++ b/scripts/tests/test_scripts.py @@ -1,14 +1,14 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +""" See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.""" from scripts.prepare_gene_name_metadata import extract_info_from_description_column diff --git a/setup.py b/setup.py index a7826499..9c7b5097 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ from setuptools import setup, find_packages -with open(Path(__file__).parent / 'LICENSE') as f: +with open(Path(__file__).parent / 'LICENSE', encoding="UTF-8") as f: LICENSE_CT = f.read()