Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VariantUtils and TestVariantUtils #272

Merged
merged 14 commits into from
Aug 17, 2023
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ Change Log
----------


7.8.0
=====

* Add ``variant_utils`` with tools to filter through CGAP data.


7.7.2
=====

Expand Down
22 changes: 16 additions & 6 deletions CONTRIBUTORS.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@
},
"David Michaels": {
"emails": [
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]"
"[email protected]"
],
"names": [
"David Michaels",
Expand All @@ -58,8 +58,8 @@
},
"Douglas Rioux": {
"emails": [
"[email protected]",
"[email protected]"
"[email protected]",
"[email protected]"
],
"names": [
"Douglas Rioux",
Expand All @@ -85,8 +85,8 @@
},
"Kent M Pitman": {
"emails": [
"[email protected]",
"[email protected]"
"[email protected]",
"[email protected]"
],
"names": [
"Kent M Pitman",
Expand Down Expand Up @@ -129,6 +129,16 @@
"SooLee"
]
},
"Tom Duraisingh": {
"emails": [
"[email protected]",
"contributors.TomDuraisingh.emails.138792649+TomDuraisingh@users.noreply.github.com"
],
"names": [
"TomDuraisingh",
"Tom Duraisingh"
]
},
"Will Ronchetti": {
"emails": [
"[email protected]"
Expand Down
2 changes: 1 addition & 1 deletion dcicutils/ff_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,7 @@ def get_schema_names(key=None, ff_env=None):
if value.get('isAbstract') is True:
continue
# some test schemas in local don't have the id field
schema_filename = value.get('id')
schema_filename = value.get('$id')
if schema_filename:
schema_name[key] = schema_filename.split('/')[-1][:-5]
return schema_name
Expand Down
92 changes: 92 additions & 0 deletions dcicutils/variant_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import json
from dcicutils.ff_utils import get_metadata, search_metadata
from dcicutils.creds_utils import CGAPKeyManager


class VariantUtils:
    """Helpers for searching rare-variant data on a CGAP portal and summarizing it per gene.

    An instance looks up the credentials for one environment through ``CGAPKeyManager``
    and passes them as ``key`` to every portal request it makes.
    """

    # Search paths; callers append the gene symbol directly after the trailing '='.
    SEARCH_VARIANTS_BY_GENE = ('/search/?type=VariantSample&limit=1'
                               '&variant.genes.genes_most_severe_gene.display_title=')
    SEARCH_RARE_VARIANTS_BY_GENE = ('/search/?samplegeno.samplegeno_role=proband&type=VariantSample'
                                    '&variant.csq_gnomadg_af_popmax.from=0&variant.csq_gnomadg_af_popmax.to=0.001'
                                    '&variant.genes.genes_most_severe_gene.display_title=')

    # Default input files; being bare names, they are opened relative to the working directory.
    DEFAULT_GENE_FILE = 'gene.json'
    DEFAULT_MUTATIONS_FILE = '10+sorted_msa_genes_and_mutations.json'

    def __init__(self, *, env_name: str) -> None:
        """Fetch the key dict for ``env_name`` and remember its ``server`` entry.

        :param env_name: environment name handed to ``CGAPKeyManager.get_keydict_for_env``
        :raises KeyError: if the returned key dict has no ``'server'`` entry
        """
        self._key_manager = CGAPKeyManager()
        self.creds = self._key_manager.get_keydict_for_env(env=env_name)
        self.base_url = self.creds['server']

    def get_creds(self):
        """Return the key dict obtained for this instance's environment."""
        return self.creds

    def get_rare_variants_by_gene(self, *, gene: str, sort: str, addon: str = ''):
        """Search for rare variants on a particular gene.

        :param gene: gene symbol appended to ``SEARCH_RARE_VARIANTS_BY_GENE``
        :param sort: field name to sort on; it is sent with a leading ``-``
        :param addon: extra query-string text appended verbatim (e.g. ``'&x=y'``)
        :return: whatever ``search_metadata`` returns for the composed URL
        """
        # Adjacent literals rather than a backslash continuation inside one literal:
        # a continuation makes the next line's indentation part of the URL.
        # NOTE(review): the constant already starts with '/', so this yields
        # '<server>//search/...'; left unchanged - confirm search_metadata accepts it.
        search_url = (f'{self.base_url}/{self.SEARCH_RARE_VARIANTS_BY_GENE}{gene}'
                      f'&sort=-{sort}{addon}')
        return search_metadata(search_url, key=self.creds)

    def find_number_of_sample_ids(self, gene: str) -> int:
        """Return the number of distinct ``CALL_INFO`` values among the rare variants of ``gene``.

        Results without a ``CALL_INFO`` key all count together as a single ``None`` entry.
        """
        variants = self.get_rare_variants_by_gene(gene=gene, sort='variant.ID')
        return len({variant.get('CALL_INFO') for variant in variants})

    def get_total_result_count_from_search(self, gene: str):
        """Return the ``total`` field of the variant search response for ``gene``."""
        res = get_metadata(self.SEARCH_VARIANTS_BY_GENE + gene, key=self.creds)
        return res['total']

    @staticmethod
    def sort_dict_in_descending_order(unsorted_dict: dict) -> dict:
        """Return a new dict with the same items ordered by descending value.

        The sort is stable, so items with equal values keep their original relative order.
        """
        return dict(sorted(unsorted_dict.items(), key=lambda item: item[1], reverse=True))

    def create_dict_of_mutations(self, gene: str, min_count: int = 10) -> dict:
        """Count rare variants of ``gene`` per position and keep the frequent positions.

        :param gene: gene symbol to search for
        :param min_count: minimum number of variants a position needs in order to be kept
        :return: ``{gene: {position: count, ...}}`` ordered by descending count
        """
        counts = {}
        for variant in self.get_rare_variants_by_gene(gene=gene, sort='variant.ID'):
            pos = variant['variant']['POS']
            counts[pos] = counts.get(pos, 0) + 1
        frequent = {pos: count for pos, count in counts.items() if count >= min_count}
        return {gene: self.sort_dict_in_descending_order(frequent)}

    @staticmethod
    def return_json(file_name: str):
        """Parse the JSON file ``file_name`` and return its top-level value."""
        with open(file_name, 'r', encoding='utf-8') as f:
            # json.load reads from a file object; json.loads only accepts str/bytes.
            return json.load(f)

    @staticmethod
    def create_dict_from_json_file(file_name: str):
        """Creates dictionary object from specified json file"""
        return VariantUtils.return_json(file_name)

    def _gene_symbols_with_summary_keywords(self, keywords, file_name: str) -> list:
        """Return ``gene_symbol`` of every gene whose ``gene_summary`` contains any keyword.

        Matching is a case-sensitive substring test; genes without a summary never match.
        """
        genes = self.return_json(file_name)
        return [gene['gene_symbol'] for gene in genes
                if any(keyword in gene.get('gene_summary', '') for keyword in keywords)]

    def create_list_of_msa_genes(self, file_name: str = DEFAULT_GENE_FILE) -> list:
        """Creates list of genes relating to the brain or nervous system
        (determined by whether keywords 'neur' or 'nerv' in summary)

        :param file_name: JSON file holding a list of gene records
        """
        return self._gene_symbols_with_summary_keywords(('nerv', 'neur'), file_name)

    def create_url(self, gene: str, file_name: str = DEFAULT_MUTATIONS_FILE) -> str:
        """Return the search path for the variants at the first listed position of ``gene``.

        The first key stored for the gene is used; that is the most commonly mutated
        position only if the file was written in descending-count order
        (presumably from ``create_dict_of_mutations`` output - verify for other files).

        :param gene: gene symbol to look up in the file
        :param file_name: JSON file mapping gene -> {position: count}
        :return: search path (no server prefix) restricted to that position
        :raises KeyError: if ``gene`` is not present in the file
        :raises IndexError: if ``gene`` has no positions recorded
        """
        positions = self.create_dict_from_json_file(file_name)[gene]
        if not positions:
            raise IndexError(f'No mutation positions recorded for gene {gene!r} in {file_name!r}')
        pos = next(iter(positions))
        return self.SEARCH_RARE_VARIANTS_BY_GENE + gene + f'&variant.POS.from={pos}&variant.POS.to={pos}&sort=-DP'

    def create_list_of_als_park_genes(self, file_name: str = DEFAULT_GENE_FILE) -> list:
        """Creates list of genes relating to Parkinson's or ALS
        (determined by whether keywords 'Parkinson' or 'ALS' in summary)

        :param file_name: JSON file holding a list of gene records
        """
        return self._gene_symbols_with_summary_keywords(('Parkinson', 'ALS'), file_name)
7 changes: 7 additions & 0 deletions docs/source/dcicutils.rst
Original file line number Diff line number Diff line change
Expand Up @@ -307,3 +307,10 @@ trace_utils

.. automodule:: dcicutils.trace_utils
:members:


variant_utils
^^^^^^^^^^^^^

.. automodule:: dcicutils.variant_utils
:members:
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicutils"
version = "7.7.2"
version = "7.8.0"
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
1 change: 1 addition & 0 deletions test/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def test_license_compatibility():
C4PythonInfrastructureLicenseChecker.validate()


# Static check that runs ContributionsChecker.validate().
# NOTE(review): the xfail marker carries no reason, so a failing validate() is reported
# as an expected failure instead of failing the run. Presumably a temporary measure -
# confirm, and consider recording the motivation through xfail's `reason` argument.
@pytest.mark.xfail
@pytest.mark.static
def test_contributions():
ContributionsChecker.validate()
135 changes: 135 additions & 0 deletions test/test_variant_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import pytest
from unittest import mock
from contextlib import contextmanager
from dcicutils import variant_utils
from dcicutils.variant_utils import VariantUtils
from unittest.mock import patch


def create_dummy_keydict():
    """Build a fresh key dict containing a single fake 'cgap-dummy' environment entry."""
    env_name = 'cgap-dummy'
    credentials = {'key': 'dummy', 'secret': 'dummy', 'server': 'cgap-test.com'}
    return {env_name: credentials}


class TestVariantUtils:
    """Unit tests for ``VariantUtils``; the key manager, portal calls and file loaders are all patched."""

    # One spelling of the dummy environment name, shared by every test.
    ENV_NAME = 'cgap-dummy'
    GENE = 'GENE'

    class CGAPKeyManager:
        """Stand-in key manager that returns the dummy key dict whatever ``env`` is requested."""

        def get_keydict_for_env(self, *, env):
            return create_dummy_keydict()['cgap-dummy']

    @contextmanager
    def mock_key_manager(self):
        """Replace ``variant_utils.CGAPKeyManager`` with the stand-in for the duration of the block."""
        with mock.patch.object(variant_utils, 'CGAPKeyManager', new=self.CGAPKeyManager):
            yield

    def _make_variant_utils(self):
        """Build a ``VariantUtils`` against the stand-in key manager (it is only used in ``__init__``)."""
        with self.mock_key_manager():
            return VariantUtils(env_name=self.ENV_NAME)

    def test_variant_utils_basic(self):
        """Tests the instantiation of a VariantUtils object """
        vu = self._make_variant_utils()
        assert isinstance(vu, VariantUtils)

    @pytest.mark.parametrize('total_value', [
        100,
        200,
        300,
        400
    ])
    @patch('dcicutils.variant_utils.get_metadata')
    def test_get_total_result_count_from_search(self, mock_get_metadata, total_value):
        """The ``total`` field of the search response is returned unchanged."""
        vu = self._make_variant_utils()
        mock_get_metadata.return_value = {'total': total_value}
        result = vu.get_total_result_count_from_search(self.GENE)
        assert result == total_value
        mock_get_metadata.assert_called_once_with(f'/search/?type=VariantSample&limit=1'
                                                  f'&variant.genes.genes_most_severe_gene.display_title='
                                                  f'{self.GENE}', key=vu.creds)

    @pytest.mark.parametrize('occurrences', [8, 9, 10, 11])
    @patch('dcicutils.variant_utils.VariantUtils.get_rare_variants_by_gene')
    def test_create_dict_of_mutations(self, mock_get_rare_variants_by_gene, occurrences):
        """A position is reported only once it has been seen at least 10 times."""
        vu = self._make_variant_utils()
        mock_get_rare_variants_by_gene.return_value = [{'variant': {'POS': 100000}}] * occurrences
        result = vu.create_dict_of_mutations(self.GENE)
        expected_positions = {100000: occurrences} if occurrences >= 10 else {}
        assert result == {self.GENE: expected_positions}
        mock_get_rare_variants_by_gene.assert_called_once_with(gene=self.GENE, sort='variant.ID')

    @pytest.mark.parametrize('method_name, matching_summaries', [
        ('create_list_of_msa_genes', ('...nerv...', '...neur...')),
        ('create_list_of_als_park_genes', ('...Parkinson...', '.....ALS.......')),
    ])
    @patch('dcicutils.variant_utils.VariantUtils.return_json')
    def test_create_list_of_genes_by_keyword(self, mock_return_json, method_name, matching_summaries):
        """Each keyword-based gene list keeps exactly the genes whose summary holds a keyword."""
        vu = self._make_variant_utils()
        first_match, second_match = matching_summaries
        mock_return_json.return_value = [
            {'gene_symbol': 'GENE1', 'gene_summary': first_match},
            {'gene_symbol': 'GENE2', 'gene_summary': '..........'},
            {'gene_symbol': 'GENE3', 'gene_summary': second_match},
        ]
        result = getattr(vu, method_name)()
        assert result == ['GENE1', 'GENE3']
        mock_return_json.assert_called_once_with('gene.json')

    @patch('dcicutils.variant_utils.VariantUtils.get_rare_variants_by_gene')
    def test_find_number_of_sample_ids(self, mock_get_rare_variants_by_gene):
        """Duplicate ``CALL_INFO`` values are counted once."""
        vu = self._make_variant_utils()
        mock_get_rare_variants_by_gene.return_value = [
            {'CALL_INFO': 'ABC123'},
            {'CALL_INFO': 'ABC123'},
            {'CALL_INFO': 'BCD234'},
            {'CALL_INFO': 'CDE345'}
        ]
        result = vu.find_number_of_sample_ids(self.GENE)
        assert result == 3
        mock_get_rare_variants_by_gene.assert_called_once_with(gene=self.GENE, sort='variant.ID')

    @pytest.mark.parametrize('pos', [
        '100000',
        '200000',
        '300000',
        '400000'
    ])
    @patch('dcicutils.variant_utils.VariantUtils.create_dict_from_json_file')
    def test_create_url(self, mock_create_dict_from_json_file, pos):
        """The URL is restricted to the first position listed for the gene."""
        vu = self._make_variant_utils()
        mock_create_dict_from_json_file.return_value = {
            'GENE': {pos: 20, '123456': 10},
            'OTHER_GENE': {pos: 10}
        }
        result = vu.create_url(gene=self.GENE)
        expected_result = vu.SEARCH_RARE_VARIANTS_BY_GENE + self.GENE + (f'&variant.POS.from={pos}'
                                                                       f'&variant.POS.to={pos}&sort=-DP')
        assert result == expected_result
        # The loader is patched, so no file is opened; this only pins the file name requested.
        mock_create_dict_from_json_file.assert_called_once_with('10+sorted_msa_genes_and_mutations.json')
Loading