From 246e4241e5425df1115c6d825ddd78da71c0938d Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Tue, 9 Oct 2018 17:15:17 -0500 Subject: [PATCH 01/17] WIP: refactor validation, test for variant type --- civicpy/exports.py | 66 ++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/civicpy/exports.py b/civicpy/exports.py index 191c001..947df22 100644 --- a/civicpy/exports.py +++ b/civicpy/exports.py @@ -232,30 +232,29 @@ def _validate_evidence_record(self, record): variant = record.variant valid = self.VALID_VARIANTS.get(variant, None) if valid is None: - # valid = self._validate_structural_variant(variant) - valid = self._validate_coordinates(variant) + valid = self._validate_sequence_variant(variant) and self._validate_coordinates(variant) if not valid: logging.info(f'{record} has invalid VCF variant {variant}.') return valid - def _validate_structural_variant(self, variant): + def _validate_sequence_variant(self, variant): # Requires all types to have SO_IDs types = variant.types for variant_type in types: if not variant_type.so_id.startswith('SO:'): return self._cache_variant_validation(variant, False) - # Filter types if multiple direct lineage to most specific, remove non-structural types + # Filter types if multiple direct lineage to most specific, remove non-variant types type_len = len(types) simplified_types = list() for i in range(type_len): remove = False try: - structural = self.SO_READER.same_or_has_ancestor(types[i].so_id, 'SO:0001537') + sequence_variant = self.SO_READER.same_or_has_ancestor(types[i].so_id, 'SO:0001060') except networkx.NetworkXError as e: logging.warning(f'Error for variant {variant}: {e.args[0]}') return self._cache_variant_validation(variant, False) - if not structural: + if not sequence_variant: continue for j in range(type_len): if i == j: @@ -270,25 +269,48 @@ def _validate_structural_variant(self, variant): simplified_types.append(types[i]) types = simplified_types + valid = self._validate_coordinates(variant, types) + # Requires at least one variant type (other than filtered types above) to be specified by CIViC - return self._cache_variant_validation(variant, bool(types)) - - def _validate_coordinates(self, variant): - # Requires exactly one coordinate set with ref and alt - coordinates = variant.coordinates - valid = all([ - coordinates.chromosome, - coordinates.start, - coordinates.stop, - coordinates.reference_bases, - coordinates.variant_bases, - not coordinates.chromosome2, - not coordinates.start2, - not coordinates.stop2 - ]) and all([c.upper() in ['A', 'C', 'G', 'T', 'N', '*'] for c in coordinates.variant_bases]) \ - and all([c.upper() in ['A', 'C', 'G', 'T', 'N'] for c in coordinates.reference_bases]) return self._cache_variant_validation(variant, valid) + def _validate_coordinates(self, variant, types): + # If multiple types, requires exactly one to be structural type. + if not types: + return False + + if len(types) > 1: + structural_types = [t for t in types if self.SO_READER.same_or_has_ancestor('SO:0001537')] + if len(structural_types) == 1: + types = structural_types + elif len(structural_types > 1): + logging.warning(f'Variant {variant} has multiple structural types. Skipping.') + return False + else: + logging.warning(f'Variant {variant} has multiple types, none structural. Skipping.') + return False + + variant_type = types[0] + + # If type is a transcript variant, requires exactly one coordinate set with ref and alt + if self.SO_READER.same_or_has_ancestor('SO:0001576'): + coordinates = variant.coordinates + valid = all([ + coordinates.chromosome, + coordinates.start, + coordinates.stop, + coordinates.reference_bases, + coordinates.variant_bases, + not coordinates.chromosome2, + not coordinates.start2, + not coordinates.stop2 + ]) and all([c.upper() in ['A', 'C', 'G', 'T', 'N', '*'] for c in coordinates.variant_bases]) \ + and all([c.upper() in ['A', 'C', 'G', 'T', 'N'] for c in coordinates.reference_bases]) + return self._cache_variant_validation(variant, valid) + else: + raise NotImplementedError + # TODO: handle non-transcript variants here + def _cache_variant_validation(self, variant, result): self.VALID_VARIANTS[variant] = result return result From 1c88552e4af58cef7be2b8081606e549305f091d Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Mon, 25 Mar 2019 10:30:24 -0500 Subject: [PATCH 02/17] WIP --- civicpy/__init__.py | 5 +++++ civicpy/__version__.py | 2 +- civicpy/civic.py | 3 ++- civicpy/exports.py | 41 ++++++++++++++++++++++++++--------- civicpy/tests/fixtures.py | 7 ++++++ civicpy/tests/test_exports.py | 24 ++++++++++++++++++++ docs/user/civic.rst | 4 ++++ 7 files changed, 74 insertions(+), 12 deletions(-) create mode 100644 civicpy/tests/fixtures.py create mode 100644 civicpy/tests/test_exports.py diff --git a/civicpy/__init__.py b/civicpy/__init__.py index e69de29..141d0b5 100644 --- a/civicpy/__init__.py +++ b/civicpy/__init__.py @@ -0,0 +1,5 @@ +from .__version__ import __version__ + + +def version(): + return __version__ \ No newline at end of file diff --git a/civicpy/__version__.py b/civicpy/__version__.py index f7b9932..7e42eca 100644 --- a/civicpy/__version__.py +++ b/civicpy/__version__.py @@ -1,7 +1,7 @@ __title__ = 'civicpy' __description__ = 'CIViC variant knowledgebase analysis toolkit.' __url__ = 'http://civicpy.org' -__version__ = '0.0.2' +__version__ = '0.0.3a1' # __build__ = 0x021901 __author__ = 'Alex H. Wagner' __author_email__ = 'ahwagner22@gmail.com' diff --git a/civicpy/civic.py b/civicpy/civic.py index e4f150c..508d9a5 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -2,6 +2,7 @@ import importlib import logging + CACHE = dict() HPO_TERMS = dict() @@ -293,7 +294,7 @@ def hpo_ids(self): return [x.hpo_id for x in self.phenotypes if x.hpo_id] -class CivicAttribute(CivicRecord): +class CivicAttribute(CivicRecord, dict): _SIMPLE_FIELDS = {'type'} _COMPLEX_FIELDS = set() diff --git a/civicpy/exports.py b/civicpy/exports.py index 947df22..da2a004 100644 --- a/civicpy/exports.py +++ b/civicpy/exports.py @@ -232,7 +232,7 @@ def _validate_evidence_record(self, record): variant = record.variant valid = self.VALID_VARIANTS.get(variant, None) if valid is None: - valid = self._validate_sequence_variant(variant) and self._validate_coordinates(variant) + valid = self._validate_sequence_variant(variant) if not valid: logging.info(f'{record} has invalid VCF variant {variant}.') return valid @@ -259,7 +259,7 @@ def _validate_sequence_variant(self, variant): for j in range(type_len): if i == j: continue - if types[i] == types[j] and i > j: + if types[i].id == types[j].id and i > j: remove = True elif self.SO_READER.same_or_has_descendant(types[i].so_id, types[j].so_id): remove = True @@ -280,10 +280,10 @@ def _validate_coordinates(self, variant, types): return False if len(types) > 1: - structural_types = [t for t in types if self.SO_READER.same_or_has_ancestor('SO:0001537')] + structural_types = [t for t in types if self.SO_READER.same_or_has_ancestor(t.so_id, 'SO:0001537')] if len(structural_types) == 1: types = structural_types - elif len(structural_types > 1): + elif len(structural_types) > 1: logging.warning(f'Variant {variant} has multiple structural types. Skipping.') return False else: @@ -293,9 +293,9 @@ def _validate_coordinates(self, variant, types): variant_type = types[0] # If type is a transcript variant, requires exactly one coordinate set with ref and alt - if self.SO_READER.same_or_has_ancestor('SO:0001576'): + if self.SO_READER.same_or_has_ancestor(variant_type.so_id, 'SO:0001576'): coordinates = variant.coordinates - valid = all([ + valid_array = [bool(x) for x in [ coordinates.chromosome, coordinates.start, coordinates.stop, @@ -304,12 +304,33 @@ def _validate_coordinates(self, variant, types): not coordinates.chromosome2, not coordinates.start2, not coordinates.stop2 - ]) and all([c.upper() in ['A', 'C', 'G', 'T', 'N', '*'] for c in coordinates.variant_bases]) \ - and all([c.upper() in ['A', 'C', 'G', 'T', 'N'] for c in coordinates.reference_bases]) + ]] + valid = all(valid_array) \ + and all([c.upper() in ['A', 'C', 'G', 'T'] for c in coordinates.variant_bases]) \ + and all([c.upper() in ['A', 'C', 'G', 'T'] for c in coordinates.reference_bases]) + if not valid: + if sum(valid_array[:5]) == 0: + # Nothing to do here. No inference is to be performed, and no coordinates are provided. + logging.warning(f'Variant {variant} has a structural type but no coordinates. Skipping.') + elif sum(valid_array[:3]) + sum(valid_array[-3:]) == 6: + if sum(valid_array[3:5]) == 0: + # Here, neither ref nor alt is specified, as in ambiguous mutations for an amino acid. + logging.warning(f'Variant {variant} has a structural type but no ref or alt. Skipping.') + elif self.SO_READER.same_or_has_ancestor(variant_type.so_id, 'SO:0001589') or \ + self.SO_READER.same_or_has_ancestor(variant_type.so_id, 'SO:0001820') or \ + self.SO_READER.same_or_has_ancestor(variant_type.so_id, 'SO:0001587') or \ + self.SO_READER.same_or_has_ancestor(variant_type.so_id, 'SO:0002012'): + # Here, one of ref or alt is specified, and is of a compatible variant type for an indel + # These are allowed. + valid = True + else: + raise ValueError(f'Unexpected type ({variant_type.name}) for variant ( {variant.site_link} ).') + else: + raise ValueError(f'Unexpected coordinates for ( {variant.site_link} ).') return self._cache_variant_validation(variant, valid) else: - raise NotImplementedError - # TODO: handle non-transcript variants here + raise NotImplementedError(f'No logic to handle {variant_type.name} {variant}') + # TODO: handle non-transcript variants here. Currently aren't any that meet other criteria. def _cache_variant_validation(self, variant, result): self.VALID_VARIANTS[variant] = result diff --git a/civicpy/tests/fixtures.py b/civicpy/tests/fixtures.py new file mode 100644 index 0000000..9cf6445 --- /dev/null +++ b/civicpy/tests/fixtures.py @@ -0,0 +1,7 @@ +import pytest +from civicpy import civic + + +@pytest.fixture() +def v600e(): + return civic.get_variant_by_id(12) \ No newline at end of file diff --git a/civicpy/tests/test_exports.py b/civicpy/tests/test_exports.py new file mode 100644 index 0000000..6d81698 --- /dev/null +++ b/civicpy/tests/test_exports.py @@ -0,0 +1,24 @@ +import pytest +from civicpy import exports +from civicpy.tests.fixtures import * +import io + + +@pytest.fixture(scope='module') +def vcf_stream(): + return io.StringIO() + + +@pytest.fixture(scope='module') +def vcf_writer(vcf_stream): + return exports.VCFWriter(vcf_stream) + + +class TestVcfExport(object): + + def test_protein_altering(self, vcf_writer, v600e, caplog): + vcf_writer.addrecord(v600e) + assert not caplog.records + state1 = len(vcf_writer.evidence_records) + assert state1 == len(v600e.evidence) + vcf_writer.addrecord() \ No newline at end of file diff --git a/docs/user/civic.rst b/docs/user/civic.rst index 210d1ad..78148a0 100644 --- a/docs/user/civic.rst +++ b/docs/user/civic.rst @@ -197,18 +197,22 @@ Records can be obtained by ID through a collection of functions provided in the objects can be queried by the following methods: .. function:: get_genes_by_ids(gene_id_list) + A list of CIViC gene IDs are provided as `gene_id_list` and queried against the cache and (as needed) CIViC. Returns a list of :class:`Gene` objects. .. function:: get_gene_by_id(gene_id) + Similar to :func:`get_genes_by_ids`, but only one ID is passed (not a list) and only one :class:`Gene` returned. .. function:: get_all_genes() + Queries CIViC for all genes and returns as list of :class:`Gene` objects. The cache is not considered by this function. .. function:: get_all_gene_ids() + Queries CIViC for a list of all gene IDs. Useful for passing to :func:`get_genes_by_id` to first check cache for any previously queried genes. From 7185076efbaee88e54f2fd2b9cfe33ec0092a68c Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Mon, 25 Mar 2019 14:10:38 -0500 Subject: [PATCH 03/17] consistent spacing --- docs/user/civic.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/civic.rst b/docs/user/civic.rst index 78148a0..6f566bd 100644 --- a/docs/user/civic.rst +++ b/docs/user/civic.rst @@ -145,7 +145,7 @@ The primary CIViC records are found on the CIViC advanced search page, and are f A list of :class:`Source` objects associated with the variant description. .. attribute:: variant_aliases - aliases + aliases A curated list of aliases by which this variant is referenced. From 84973443e62c59616c7aa5a262fd1fc27b188592 Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Mon, 25 Mar 2019 14:46:08 -0500 Subject: [PATCH 04/17] newline --- civicpy/tests/test_exports.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/civicpy/tests/test_exports.py b/civicpy/tests/test_exports.py index 6d81698..c49de3e 100644 --- a/civicpy/tests/test_exports.py +++ b/civicpy/tests/test_exports.py @@ -21,4 +21,4 @@ def test_protein_altering(self, vcf_writer, v600e, caplog): assert not caplog.records state1 = len(vcf_writer.evidence_records) assert state1 == len(v600e.evidence) - vcf_writer.addrecord() \ No newline at end of file + vcf_writer.addrecord() From e4c1f2225e09d6790444874f80a36ae2fd72f2b8 Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Mon, 25 Mar 2019 14:46:47 -0500 Subject: [PATCH 05/17] smarter attributes do more if id is provided --- civicpy/civic.py | 21 +++++++++++++++------ civicpy/tests/test_civic.py | 11 +++++++++++ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/civicpy/civic.py b/civicpy/civic.py index 508d9a5..867375b 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -184,7 +184,7 @@ def __init__(self, **kwargs): def evidence_sources(self): sources = set() for evidence in self.evidence_items: - if evidence.source: + if evidence.source is not None: sources.add(evidence.source) return sources @@ -300,7 +300,12 @@ class CivicAttribute(CivicRecord, dict): _COMPLEX_FIELDS = set() def __repr__(self): - return f'' + try: + _id = self.id + except AttributeError: + return f'' + else: + return f'' def __init__(self, **kwargs): kwargs['partial'] = False @@ -309,10 +314,14 @@ def __init__(self, **kwargs): super().__init__(**kwargs) def __hash__(self): - raise NotImplementedError - - def __eq__(self, other): - raise NotImplementedError + try: + _id = self.id + except AttributeError: + raise NotImplementedError + if _id is not None: + return CivicRecord.__hash__(self) + else: + raise ValueError @property def site_link(self): diff --git a/civicpy/tests/test_civic.py b/civicpy/tests/test_civic.py index 185bf0a..4ed31f5 100644 --- a/civicpy/tests/test_civic.py +++ b/civicpy/tests/test_civic.py @@ -1,5 +1,6 @@ import pytest from civicpy import civic +from civicpy.tests.fixtures import * ELEMENTS = [ 'Assertion' @@ -41,3 +42,13 @@ def test_completeness(self, element): complex_value = complex_value[0] if isinstance(complex_value, civic.CivicAttribute): assert not complex_value._partial + + +class TestEvidence(object): + + def test_get_source_ids(self, v600e): + assert len(v600e.evidence) + assert len(v600e.evidence) / 2 <= len(v600e.evidence_sources) + for source in v600e.evidence_sources: + assert source.citation_id + assert source.source_type From 1b8f2d99314ea46c55658bfba10d4f6bcaf219ec Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Mon, 25 Mar 2019 23:34:38 -0500 Subject: [PATCH 06/17] timestamp when full cache updates are performed and allow for reloading full cache --- civicpy/civic.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/civicpy/civic.py b/civicpy/civic.py index 867375b..4d9845f 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -1,12 +1,15 @@ import requests import importlib import logging +import datetime CACHE = dict() HPO_TERMS = dict() +FRESH_DELTA = datetime.timedelta(days=7) + MODULE = importlib.import_module('civicpy.civic') API_URL = 'https://civicdb.org/api' @@ -345,12 +348,23 @@ def get_cached(element_type, element_id): return CACHE.get(hash(r), False) +def _has_all_cached_fresh(element): + s = '{}_all_cached'.format(element) + if CACHE.get(s, False): + return CACHE[s] + FRESH_DELTA < datetime.datetime.now() + return False + + def _get_elements_by_ids(element, id_list=[], allow_cached=True, get_all=False): if allow_cached and not get_all: cached = [get_cached(element, element_id) for element_id in id_list] if all(cached): logging.info(f'Loading {pluralize(element)} from cache') return cached + elif allow_cached and _has_all_cached_fresh(element): + cached = [get_cached(element, element_id) for element_id in CACHE['{}_all_ids'.format(element)]] + logging.info(f'Loading {pluralize(element)} from cache') + return cached if id_list and get_all: raise ValueError('Please pass list of ids or use the get_all flag, not both.') if get_all: @@ -362,6 +376,8 @@ def _get_elements_by_ids(element, id_list=[], allow_cached=True, get_all=False): response.raise_for_status() cls = get_class(element) elements = [cls(**x) for x in response.json()['results']] + CACHE['{}_all_cached'.format(element)] = datetime.datetime.now() + CACHE['{}_all_ids'.format(element)] = [x['id'] for x in response.json()['results']] return elements From f96156e5ef3b23ab87c731a8b25a3bb84b098d57 Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Tue, 26 Mar 2019 17:29:07 -0500 Subject: [PATCH 07/17] add cache save/load --- .gitignore | 2 ++ civicpy/__init__.py | 7 ++++++ civicpy/civic.py | 43 ++++++++++++++++++++++++++++++++----- civicpy/tests/test_civic.py | 12 +++++++++++ requirements.txt | 3 ++- setup.py | 2 +- 6 files changed, 62 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 389a9d3..a41cc80 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,5 @@ ENV/ # notebooks *.ipynb + +civicpy/data/ \ No newline at end of file diff --git a/civicpy/__init__.py b/civicpy/__init__.py index 141d0b5..9656a97 100644 --- a/civicpy/__init__.py +++ b/civicpy/__init__.py @@ -1,4 +1,11 @@ from .__version__ import __version__ +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parent +DATA_ROOT = PROJECT_ROOT / 'data' + +if not DATA_ROOT.exists(): + DATA_ROOT.mkdir() def version(): diff --git a/civicpy/civic.py b/civicpy/civic.py index 4d9845f..26a1d97 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -2,10 +2,17 @@ import importlib import logging import datetime +import pandas as pd +import pickle +from civicpy import DATA_ROOT CACHE = dict() +CACHE_FILE = DATA_ROOT / 'CACHE.pkl' + +COORDINATE_TABLE = None + HPO_TERMS = dict() FRESH_DELTA = datetime.timedelta(days=7) @@ -22,6 +29,7 @@ 'evidence_items': 'evidence' } + def pluralize(string): if string in UNMARKED_PLURALS: return f'{string}_items' @@ -64,6 +72,28 @@ def get_class(element_type): return cls +def save_cache(): + with open(CACHE_FILE, 'wb') as pf: + pickle.dump(CACHE, pf) + + +def load_cache(): + with open(CACHE_FILE, 'rb') as pf: + old_cache = pickle.load(pf) + c = dict() + variants = set() + for k, v in old_cache.items(): + if isinstance(k, str): + c[k] = v + elif isinstance(k, int): + c[hash(v)] = v + if v.type == 'variant': + variants.add(v) + else: + raise ValueError + MODULE.CACHE = c + _build_coordinate_table(variants) + class CivicRecord: _SIMPLE_FIELDS = {'id', 'type'} @@ -129,6 +159,9 @@ def __hash__(self): def __eq__(self, other): return hash(self) == hash(other) + def __setstate__(self, state): + self.__dict__ = state + def update(self, allow_partial=True, force=False, **kwargs): """Updates record and returns True if record is complete after update, else False.""" if kwargs: @@ -349,9 +382,9 @@ def get_cached(element_type, element_id): def _has_all_cached_fresh(element): - s = '{}_all_cached'.format(element) + s = '{}_all_cached'.format(pluralize(element)) if CACHE.get(s, False): - return CACHE[s] + FRESH_DELTA < datetime.datetime.now() + return CACHE[s] + FRESH_DELTA > datetime.datetime.now() return False @@ -362,7 +395,7 @@ def _get_elements_by_ids(element, id_list=[], allow_cached=True, get_all=False): logging.info(f'Loading {pluralize(element)} from cache') return cached elif allow_cached and _has_all_cached_fresh(element): - cached = [get_cached(element, element_id) for element_id in CACHE['{}_all_ids'.format(element)]] + cached = [get_cached(element, element_id) for element_id in CACHE['{}_all_ids'.format(pluralize(element))]] logging.info(f'Loading {pluralize(element)} from cache') return cached if id_list and get_all: @@ -376,8 +409,8 @@ def _get_elements_by_ids(element, id_list=[], allow_cached=True, get_all=False): response.raise_for_status() cls = get_class(element) elements = [cls(**x) for x in response.json()['results']] - CACHE['{}_all_cached'.format(element)] = datetime.datetime.now() - CACHE['{}_all_ids'.format(element)] = [x['id'] for x in response.json()['results']] + CACHE['{}_all_cached'.format(pluralize(element))] = datetime.datetime.now() + CACHE['{}_all_ids'.format(pluralize(element))] = [x['id'] for x in response.json()['results']] return elements diff --git a/civicpy/tests/test_civic.py b/civicpy/tests/test_civic.py index 4ed31f5..92fe66f 100644 --- a/civicpy/tests/test_civic.py +++ b/civicpy/tests/test_civic.py @@ -7,6 +7,18 @@ ] +def setup_module(): + try: + civic.load_cache() + except FileNotFoundError: + pass + civic.get_all_variants() + + +def teardown_module(): + civic.save_cache() + + @pytest.fixture(scope="module", params=ELEMENTS) def element(request): element_type = request.param diff --git a/requirements.txt b/requirements.txt index e57e95e..33f872d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ pytest>=3.5 requests~=2.18 obonet==0.2.3 -networkx~=2.1 \ No newline at end of file +networkx~=2.1 +pandas==0.24.1 diff --git a/setup.py b/setup.py index 907dbe1..45259ed 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ 'pytest', 'requests' ], - python_requires='~=3.6', + python_requires='>=3.6', entry_points={}, ) From 214ebd8df6df816dd515e33988db9ab2fbb7c599 Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Wed, 27 Mar 2019 03:53:24 -0500 Subject: [PATCH 08/17] enable cache override --- civicpy/civic.py | 17 +++++++++++------ civicpy/tests/fixtures.py | 7 ------- civicpy/tests/test_civic.py | 10 +++++----- 3 files changed, 16 insertions(+), 18 deletions(-) delete mode 100644 civicpy/tests/fixtures.py diff --git a/civicpy/civic.py b/civicpy/civic.py index 26a1d97..0a85d4b 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -402,6 +402,7 @@ def _get_elements_by_ids(element, id_list=[], allow_cached=True, get_all=False): raise ValueError('Please pass list of ids or use the get_all flag, not both.') if get_all: payload = _construct_get_all_payload() + logging.warning('Getting all {}. This may take a couple of minutes...'.format(pluralize(element))) else: payload = _construct_query_payload(id_list) url = search_url(element) @@ -500,18 +501,22 @@ def get_variant_by_id(variant_id): return get_variants_by_ids([variant_id])[0] -def get_all_variants(): - return _get_all_genes_and_variants()['variants'] +def get_all_variants(allow_cached=True): + precached = _has_all_cached_fresh('variants') + variants = _get_all_genes_and_variants(allow_cached)['variants'] + if not (precached and allow_cached): + _build_coordinate_table(variants) + return variants + def get_all_variant_ids(): return _get_all_element_ids('variants') -def _get_all_genes_and_variants(): - logging.warning('Getting all genes or variants. This may take a couple of minutes...') - variants = _get_elements_by_ids('variants', get_all=True) - genes = _get_elements_by_ids('gene', get_all=True) +def _get_all_genes_and_variants(allow_cached=True): + variants = _get_elements_by_ids('variants', get_all=True, allow_cached=allow_cached) + genes = _get_elements_by_ids('gene', get_all=True, allow_cached=allow_cached) for variant in variants: variant.gene.update() return {'genes': genes, 'variants': variants} diff --git a/civicpy/tests/fixtures.py b/civicpy/tests/fixtures.py deleted file mode 100644 index 9cf6445..0000000 --- a/civicpy/tests/fixtures.py +++ /dev/null @@ -1,7 +0,0 @@ -import pytest -from civicpy import civic - - -@pytest.fixture() -def v600e(): - return civic.get_variant_by_id(12) \ No newline at end of file diff --git a/civicpy/tests/test_civic.py b/civicpy/tests/test_civic.py index 92fe66f..6512788 100644 --- a/civicpy/tests/test_civic.py +++ b/civicpy/tests/test_civic.py @@ -1,6 +1,5 @@ import pytest from civicpy import civic -from civicpy.tests.fixtures import * ELEMENTS = [ 'Assertion' @@ -15,16 +14,17 @@ def setup_module(): civic.get_all_variants() -def teardown_module(): - civic.save_cache() - - @pytest.fixture(scope="module", params=ELEMENTS) def element(request): element_type = request.param return civic._get_elements_by_ids(element_type, [1])[0] +@pytest.fixture(scope="module") +def v600e(): + return civic.get_variant_by_id(12) + + class TestGetFunctions(object): def test_get_assertions(self): From 7deb6ada72f1b3ca524b854960e512568eeac1bb Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Wed, 27 Mar 2019 03:53:45 -0500 Subject: [PATCH 09/17] add coordinate search --- civicpy/civic.py | 60 +++++++++++++++++++++++++++++++++++++ civicpy/tests/test_civic.py | 19 ++++++++++++ 2 files changed, 79 insertions(+) diff --git a/civicpy/civic.py b/civicpy/civic.py index 0a85d4b..02a1fe0 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -485,6 +485,15 @@ def get_all_assertions(): return get_assertions_by_ids(get_all=True) +def search_assertions_by_coordinates(coordinates, search_mode='any'): + variants = search_variants_by_coordinates(coordinates, search_mode=search_mode) + assertions = set() + for v in variants: + if v.assertions: + assertions.update(v.assertions) + return list(assertions) + + def get_variants_by_ids(variant_id_list): logging.info('Getting variants...') variants = _get_elements_by_ids('variant', variant_id_list) @@ -501,6 +510,29 @@ def get_variant_by_id(variant_id): return get_variants_by_ids([variant_id])[0] +def _build_coordinate_table(variants): + variant_records = list() + for v in variants: + c = v.coordinates + start = getattr(c, 'start', None) + stop = getattr(c, 'stop', None) + chr = getattr(c, 'chromosome', None) + alt = getattr(c, 'variant_bases', None) + if all([start, stop, chr]): + variant_records.append([chr, start, stop, alt, hash(v)]) + else: + continue + start = getattr(c, 'start2', None) + stop = getattr(c, 'stop2', None) + chr = getattr(c, 'chromosome2', None) + if all([start, stop, chr]): + variant_records.append([chr, start, stop, None, hash(v)]) + MODULE.COORDINATE_TABLE = pd.DataFrame.from_records( + variant_records, + columns=['chr', 'start', 'stop', 'alt', 'v_hash'] + ) + + def get_all_variants(allow_cached=True): precached = _has_all_cached_fresh('variants') variants = _get_all_genes_and_variants(allow_cached)['variants'] @@ -509,6 +541,34 @@ def get_all_variants(allow_cached=True): return variants +def search_variants_by_coordinates(coordinates, search_mode='any'): + """ + Search the cache for variants matching provided coordinates using the corresponding search strategy. + + :param coordinates: A dictionary comprised of 'start', 'stop', 'chr', and optional 'alt' keys + start: the genomic start coordinate of the query + stop: the genomic end coordinate of the query + chr: the GRCh37 chromosome of the query (e.g. "7", "X") + alt: the alternate allele at the coordinate [optional] + + :param search_mode: ['any', 'include_smaller', 'include_larger', 'exact'] + any: any overlap between a query and a variant is a match + include_smaller: variants must fit within the coordinates of the query + include_larger: variants must encompass the coordinates of the query + exact: variants must match coordinates precisely, as well as alternate + allele, if provided + search_mode is 'exact' by default + + :return: Returns a list of variants matching the coordinates and search_mode + """ + get_all_variants() + ct = COORDINATE_TABLE + overlapping = (coordinates['start'] <= ct.stop) & (coordinates['stop'] >= ct.start) + if search_mode == 'any': + var_ids = ct[overlapping].v_hash + else: + raise NotImplementedError # TODO: Implement other search modes + return [CACHE[v] for v in var_ids] def get_all_variant_ids(): return _get_all_element_ids('variants') diff --git a/civicpy/tests/test_civic.py b/civicpy/tests/test_civic.py index 6512788..ffe96f4 100644 --- a/civicpy/tests/test_civic.py +++ b/civicpy/tests/test_civic.py @@ -64,3 +64,22 @@ def test_get_source_ids(self, v600e): for source in v600e.evidence_sources: assert source.citation_id assert source.source_type + + +class TestCoordinateSearch(object): + + def test_search_assertions(self): + coordinates = { + 'chr': 7, + 'start': 140453136, + 'stop': 140453136, + 'alt': 'T' + } + assertions = civic.search_assertions_by_coordinates(coordinates) + assertion_ids = [x.id for x in assertions] + v600e_assertion_ids = (7, 10, 12, 20) + v600k_assertion_ids = (11, 13) + assert set(assertion_ids) == set(v600e_assertion_ids + v600k_assertion_ids) + assertions = civic.search_assertions_by_coordinates(coordinates, search_mode='exact') + assertion_ids = [x.id for x in assertions] + assert set(assertion_ids) == set(v600e_assertion_ids) From 593e2514dc99fa06dc108300ff918a865fe4203e Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Wed, 27 Mar 2019 04:05:29 -0500 Subject: [PATCH 10/17] implement search strategies --- civicpy/civic.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/civicpy/civic.py b/civicpy/civic.py index 02a1fe0..2b361a0 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -565,9 +565,18 @@ def search_variants_by_coordinates(coordinates, search_mode='any'): ct = COORDINATE_TABLE overlapping = (coordinates['start'] <= ct.stop) & (coordinates['stop'] >= ct.start) if search_mode == 'any': - var_ids = ct[overlapping].v_hash + match = overlapping + elif search_mode == 'include_smaller': + match = overlapping & (coordinates['start'] <= ct.start) & (coordinates['stop'] >= ct.stop) + elif search_mode == 'include_larger': + match = overlapping & (coordinates['start'] >= ct.start) & (coordinates['stop'] <= ct.stop) + elif search_mode == 'exact': + match = (coordinates['start'] == ct.stop) & (coordinates['stop'] == ct.start) + if coordinates.get('alt', False): + match = match & (coordinates['alt'] == ct.alt) else: - raise NotImplementedError # TODO: Implement other search modes + raise ValueError("unexpected search mode") + var_ids = ct[match].v_hash return [CACHE[v] for v in var_ids] def get_all_variant_ids(): From c8d23702b84420d1ba5af650e8193ffd86dd54ee Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Wed, 27 Mar 2019 22:08:30 -0500 Subject: [PATCH 11/17] cache and coordinate search improvements --- .gitignore | 6 ++---- civicpy/civic.py | 48 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index a41cc80..e94e44b 100644 --- a/.gitignore +++ b/.gitignore @@ -101,7 +101,5 @@ ENV/ # mypy .mypy_cache/ -# notebooks -*.ipynb - -civicpy/data/ \ No newline at end of file +# data folder +civicpy/data/ diff --git a/civicpy/civic.py b/civicpy/civic.py index 2b361a0..d194d32 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -12,6 +12,9 @@ CACHE_FILE = DATA_ROOT / 'CACHE.pkl' COORDINATE_TABLE = None +COORDINATE_TABLE_START = None +COORDINATE_TABLE_STOP = None +COORDINATE_TABLE_CHR = None HPO_TERMS = dict() @@ -92,6 +95,10 @@ def load_cache(): else: raise ValueError MODULE.CACHE = c + for k, v in MODULE.CACHE.items(): + if isinstance(k, str): + continue + v.update() _build_coordinate_table(variants) class CivicRecord: @@ -527,10 +534,14 @@ def _build_coordinate_table(variants): chr = getattr(c, 'chromosome2', None) if all([start, stop, chr]): variant_records.append([chr, start, stop, None, hash(v)]) - MODULE.COORDINATE_TABLE = pd.DataFrame.from_records( + df = pd.DataFrame.from_records( variant_records, columns=['chr', 'start', 'stop', 'alt', 'v_hash'] ) + MODULE.COORDINATE_TABLE = df + MODULE.COORDINATE_TABLE_START = df.start.sort_values() + MODULE.COORDINATE_TABLE_STOP = df.stop.sort_values() + MODULE.COORDINATE_TABLE_CHR = df.chr.sort_values() def get_all_variants(allow_cached=True): @@ -559,25 +570,42 @@ def search_variants_by_coordinates(coordinates, search_mode='any'): allele, if provided search_mode is 'exact' by default - :return: Returns a list of variants matching the coordinates and search_mode + :return: Returns a list of variant hashes matching the coordinates and search_mode """ get_all_variants() ct = COORDINATE_TABLE - overlapping = (coordinates['start'] <= ct.stop) & (coordinates['stop'] >= ct.start) + start_idx = COORDINATE_TABLE_START + stop_idx = COORDINATE_TABLE_STOP + chr_idx = COORDINATE_TABLE_CHR + start = int(coordinates['start']) + stop = int(coordinates['stop']) + chromosome = str(coordinates['chr']) + # overlapping = (start <= ct.stop) & (stop >= ct.start) + left_idx = chr_idx.searchsorted(chromosome) + right_idx = chr_idx.searchsorted(chromosome, side='right') + chr_ct_idx = chr_idx[left_idx:right_idx].index + right_idx = start_idx.searchsorted(stop, side='right') + start_ct_idx = start_idx[:right_idx].index + left_idx = stop_idx.searchsorted(start) + stop_ct_idx = stop_idx[left_idx:].index + match_idx = chr_ct_idx & start_ct_idx & stop_ct_idx + m_df = ct.loc[match_idx, ] if search_mode == 'any': - match = overlapping + var_digests = m_df.v_hash.to_list() + return [CACHE[v] for v in var_digests] elif search_mode == 'include_smaller': - match = overlapping & (coordinates['start'] <= ct.start) & (coordinates['stop'] >= ct.stop) + match_idx = (start <= m_df.start) & (stop >= m_df.stop) elif search_mode == 'include_larger': - match = overlapping & (coordinates['start'] >= ct.start) & (coordinates['stop'] <= ct.stop) + match_idx = (start >= m_df.start) & (stop <= m_df.stop) elif search_mode == 'exact': - match = (coordinates['start'] == ct.stop) & (coordinates['stop'] == ct.start) + match_idx = (start == m_df.stop) & (stop == m_df.start) if coordinates.get('alt', False): - match = match & (coordinates['alt'] == ct.alt) + match_idx = match_idx & (coordinates['alt'] == m_df.alt) else: raise ValueError("unexpected search mode") - var_ids = ct[match].v_hash - return [CACHE[v] for v in var_ids] + var_digests = m_df.loc[match_idx,].v_hash.to_list() + return [CACHE[v] for v in var_digests] + def get_all_variant_ids(): return _get_all_element_ids('variants') From 971104a7fba3edbce148dd1114644ee74376aab5 Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Thu, 28 Mar 2019 01:05:49 -0500 Subject: [PATCH 12/17] add bulk search --- civicpy/civic.py | 95 ++++++++++++++++++++++++++++++++++++- civicpy/tests/test_civic.py | 28 +++++++++++ 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/civicpy/civic.py b/civicpy/civic.py index d194d32..2c2cd82 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -537,7 +537,7 @@ def _build_coordinate_table(variants): df = pd.DataFrame.from_records( variant_records, columns=['chr', 'start', 'stop', 'alt', 'v_hash'] - ) + ).sort_values(by=['chr', 'start', 'stop', 'alt']) MODULE.COORDINATE_TABLE = df MODULE.COORDINATE_TABLE_START = df.start.sort_values() MODULE.COORDINATE_TABLE_STOP = df.stop.sort_values() @@ -554,7 +554,7 @@ def get_all_variants(allow_cached=True): def search_variants_by_coordinates(coordinates, search_mode='any'): """ - Search the cache for variants matching provided coordinates using the corresponding search strategy. + Search the cache for variants matching provided coordinates using the corresponding search mode. :param coordinates: A dictionary comprised of 'start', 'stop', 'chr', and optional 'alt' keys start: the genomic start coordinate of the query @@ -607,6 +607,97 @@ def search_variants_by_coordinates(coordinates, search_mode='any'): return [CACHE[v] for v in var_digests] +# TODO: Refactor this method +def bulk_search_variants_by_coordinates(sorted_query_generator, search_mode='any'): + """ + An interator to search the cache for variants matching the set of sorted coordinates and yield + matches corresponding to the search mode. + + :param coordinates: A dictionary comprised of 'start', 'stop', 'chr', and optional 'alt' keys + start: the genomic start coordinate of the query + stop: the genomic end coordinate of the query + chr: the GRCh37 chromosome of the query (e.g. "7", "X") + alt: the alternate allele at the coordinate [optional] + + :param search_mode: ['any', 'include_smaller', 'include_larger', 'exact'] + any: any overlap between a query and a variant is a match + include_smaller: variants must fit within the coordinates of the query + include_larger: variants must encompass the coordinates of the query + exact: variants must match coordinates precisely, as well as alternate + allele, if provided + search_mode is 'exact' by default + + :yield: Yields (query, match) tuples for each identified match + """ + def iter_sorted_cache(): + generator = MODULE.COORDINATE_TABLE.iterrows() + for row in generator: + yield row[1].to_dict() + + sorted_cache_generator = iter_sorted_cache() + current_query_coords = next(sorted_query_generator) + current_cache_coords = next(sorted_cache_generator) + new_query = False + review = [] + while True: + if new_query and review: + review.append(current_cache_coords) + current_cache_coords = review.pop(0) + q_chr = str(current_query_coords['chr']) + c_chr = current_cache_coords['chr'] + if q_chr < c_chr: + current_query_coords = next(sorted_query_generator) + continue + if q_chr > c_chr: + current_cache_coords = next(sorted_cache_generator) + continue + q_start = int(current_query_coords['start']) + c_start = current_cache_coords['start'] + q_stop = int(current_query_coords['stop']) + c_stop = current_cache_coords['stop'] + if q_start > c_stop: + if review: + current_cache_coords = review.pop(0) + else: + current_cache_coords = next(sorted_cache_generator) + continue + if q_stop < c_start: + current_query_coords = next(sorted_query_generator) + new_query = True + continue + if search_mode == 'any': + yield (current_query_coords, current_cache_coords) + review.append(current_cache_coords) + current_cache_coords = next(sorted_cache_generator) + continue + q_alt = current_query_coords.get('alt', None) + c_alt = current_cache_coords.get('alt', None) + if q_start == c_start and q_stop == c_stop: + if search_mode == 'exact' and q_alt: + if q_alt > c_alt: + if review: + current_cache_coords = review.pop(0) + else: + current_cache_coords = next(sorted_cache_generator) + continue + elif q_alt < c_alt: + current_query_coords = next(sorted_query_generator) + new_query = True + continue + yield (current_query_coords, current_cache_coords) + review.append(current_cache_coords) + current_cache_coords = next(sorted_cache_generator) + continue + if search_mode == 'include_smaller': + raise NotImplementedError + if search_mode == 'include_larger': + raise NotImplementedError + if review: + current_cache_coords = review.pop(0) + else: + current_cache_coords = next(sorted_cache_generator) + + def get_all_variant_ids(): return _get_all_element_ids('variants') diff --git a/civicpy/tests/test_civic.py b/civicpy/tests/test_civic.py index ffe96f4..beb59a2 100644 --- a/civicpy/tests/test_civic.py +++ b/civicpy/tests/test_civic.py @@ -1,5 +1,6 @@ import pytest from civicpy import civic +from collections import defaultdict ELEMENTS = [ 'Assertion' @@ -83,3 +84,30 @@ def test_search_assertions(self): assertions = civic.search_assertions_by_coordinates(coordinates, search_mode='exact') assertion_ids = [x.id for x in assertions] assert set(assertion_ids) == set(v600e_assertion_ids) + + def test_bulk_search_variants(self): + def coord_gen(): + coordinate_sets = [ + { + 'chr': '7', + 'start': 140453136, + 'stop': 140453136, + 'alt': 'T' + }, + { + 'chr': '7', + 'start': 140453136, + 'stop': 140453137, + 'alt': 'TT' + }, + ] + for c in coordinate_sets: + yield c + gen = coord_gen() + search_results = list(civic.bulk_search_variants_by_coordinates(gen)) + results_dict = defaultdict(list) + for q, r in search_results: + k = (q['chr'], q['start'], q['stop'], q['alt']) + results_dict[k].append(civic.CACHE[r['v_hash']]) + active_variants = [v for v in results_dict[('7', 140453136, 140453136, 'T')] if len(v.evidence_items)] + assert len(active_variants) >= 12 From 001eaa1a49c010fcda27516544a80c6e59179b10 Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Thu, 28 Mar 2019 06:40:58 -0500 Subject: [PATCH 13/17] WIP: bulk search --- civicpy/civic.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/civicpy/civic.py b/civicpy/civic.py index 2c2cd82..b89e475 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -634,19 +634,40 @@ def iter_sorted_cache(): for row in generator: yield row[1].to_dict() + def is_sorted(prev_q, current_q): + if prev_q['chr'] < current_q['chr']: + return True + if prev_q['chr'] > current_q['chr']: + return False + if prev_q['start'] < current_q['start']: + return True + if prev_q['start'] > current_q['start']: + return False + if prev_q['stop'] < current_q['stop']: + return True + if prev_q['stop'] > current_q['stop']: + return False + return True + sorted_cache_generator = iter_sorted_cache() current_query_coords = next(sorted_query_generator) current_cache_coords = next(sorted_cache_generator) + previous_query = None new_query = False review = [] while True: if new_query and review: review.append(current_cache_coords) current_cache_coords = review.pop(0) + if new_query: + assert is_sorted(previous_query, current_query_coords), (previous_query, current_query_coords) + new_query = False q_chr = str(current_query_coords['chr']) c_chr = current_cache_coords['chr'] if q_chr < c_chr: + previous_query = current_query_coords current_query_coords = next(sorted_query_generator) + new_query = True continue if q_chr > c_chr: current_cache_coords = next(sorted_cache_generator) @@ -662,6 +683,7 @@ def iter_sorted_cache(): current_cache_coords = next(sorted_cache_generator) continue if q_stop < c_start: + previous_query = current_query_coords current_query_coords = next(sorted_query_generator) new_query = True continue @@ -674,13 +696,14 @@ def iter_sorted_cache(): c_alt = current_cache_coords.get('alt', None) if q_start == c_start and q_stop == c_stop: if search_mode == 'exact' and q_alt: - if q_alt > c_alt: + if c_alt is None or q_alt > c_alt: if review: current_cache_coords = review.pop(0) else: current_cache_coords = next(sorted_cache_generator) continue elif q_alt < c_alt: + previous_query = current_query_coords current_query_coords = next(sorted_query_generator) new_query = True continue From 342e9049177d81f179d49bd8e1faa7c6547cd0c1 Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Sat, 30 Mar 2019 16:55:17 -0400 Subject: [PATCH 14/17] update python version --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 45259ed..f853c48 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,6 @@ 'pytest', 'requests' ], - python_requires='>=3.6', + python_requires='>=3.7', entry_points={}, - ) From e1dc23d009a81caef835597f50c7cfb8308dd00d Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Sat, 30 Mar 2019 16:55:32 -0400 Subject: [PATCH 15/17] fix bulk coordinate search --- civicpy/civic.py | 130 +++++++++++++++--------------------- civicpy/tests/test_civic.py | 45 +++---------- 2 files changed, 65 insertions(+), 110 deletions(-) diff --git a/civicpy/civic.py b/civicpy/civic.py index b89e475..7936346 100644 --- a/civicpy/civic.py +++ b/civicpy/civic.py @@ -4,6 +4,7 @@ import datetime import pandas as pd import pickle +from collections import defaultdict, namedtuple from civicpy import DATA_ROOT @@ -33,6 +34,9 @@ } +CoordinateQuery = namedtuple('CoordinateQuery', ['chr', 'start', 'stop', 'alt', 'key'], defaults=(None, None)) + + def pluralize(string): if string in UNMARKED_PLURALS: return f'{string}_items' @@ -552,11 +556,11 @@ def get_all_variants(allow_cached=True): return variants -def search_variants_by_coordinates(coordinates, search_mode='any'): +def search_variants_by_coordinates(coordinate_query, search_mode='any'): """ Search the cache for variants matching provided coordinates using the corresponding search mode. - :param coordinates: A dictionary comprised of 'start', 'stop', 'chr', and optional 'alt' keys + :param coordinate_query: A civic CoordinateQuery object start: the genomic start coordinate of the query stop: the genomic end coordinate of the query chr: the GRCh37 chromosome of the query (e.g. "7", "X") @@ -577,9 +581,9 @@ def search_variants_by_coordinates(coordinates, search_mode='any'): start_idx = COORDINATE_TABLE_START stop_idx = COORDINATE_TABLE_STOP chr_idx = COORDINATE_TABLE_CHR - start = int(coordinates['start']) - stop = int(coordinates['stop']) - chromosome = str(coordinates['chr']) + start = int(coordinate_query.start) + stop = int(coordinate_query.stop) + chromosome = str(coordinate_query.chr) # overlapping = (start <= ct.stop) & (stop >= ct.start) left_idx = chr_idx.searchsorted(chromosome) right_idx = chr_idx.searchsorted(chromosome, side='right') @@ -599,8 +603,8 @@ def search_variants_by_coordinates(coordinates, search_mode='any'): match_idx = (start >= m_df.start) & (stop <= m_df.stop) elif search_mode == 'exact': match_idx = (start == m_df.stop) & (stop == m_df.start) - if coordinates.get('alt', False): - match_idx = match_idx & (coordinates['alt'] == m_df.alt) + if coordinate_query.alt: + match_idx = match_idx & (coordinate_query.alt == m_df.alt) else: raise ValueError("unexpected search mode") var_digests = m_df.loc[match_idx,].v_hash.to_list() @@ -608,16 +612,16 @@ def search_variants_by_coordinates(coordinates, search_mode='any'): # TODO: Refactor this method -def bulk_search_variants_by_coordinates(sorted_query_generator, search_mode='any'): +def bulk_search_variants_by_coordinates(sorted_queries, search_mode='any'): """ An interator to search the cache for variants matching the set of sorted coordinates and yield matches corresponding to the search mode. - :param coordinates: A dictionary comprised of 'start', 'stop', 'chr', and optional 'alt' keys - start: the genomic start coordinate of the query - stop: the genomic end coordinate of the query - chr: the GRCh37 chromosome of the query (e.g. "7", "X") - alt: the alternate allele at the coordinate [optional] + :param sorted_queries: A list of civic CoordinateQuery objects, sorted by coordinate. + start: the genomic start coordinate of the query + stop: the genomic end coordinate of the query + chr: the GRCh37 chromosome of the query (e.g. "7", "X") + alt: the alternate allele at the coordinate [optional] :param search_mode: ['any', 'include_smaller', 'include_larger', 'exact'] any: any overlap between a query and a variant is a match @@ -629,10 +633,6 @@ def bulk_search_variants_by_coordinates(sorted_query_generator, search_mode='any :yield: Yields (query, match) tuples for each identified match """ - def iter_sorted_cache(): - generator = MODULE.COORDINATE_TABLE.iterrows() - for row in generator: - yield row[1].to_dict() def is_sorted(prev_q, current_q): if prev_q['chr'] < current_q['chr']: @@ -649,76 +649,54 @@ def is_sorted(prev_q, current_q): return False return True - sorted_cache_generator = iter_sorted_cache() - current_query_coords = next(sorted_query_generator) - current_cache_coords = next(sorted_cache_generator) - previous_query = None - new_query = False - review = [] - while True: - if new_query and review: - review.append(current_cache_coords) - current_cache_coords = review.pop(0) - if new_query: - assert is_sorted(previous_query, current_query_coords), (previous_query, current_query_coords) - new_query = False - q_chr = str(current_query_coords['chr']) - c_chr = current_cache_coords['chr'] + ct_pointer = 0 + query_pointer = 0 + last_query_pointer = -1 + match_start = None + ct = MODULE.COORDINATE_TABLE + matches = defaultdict(list) + Match = namedtuple('Match', ct.columns) + while query_pointer < len(sorted_queries) and ct_pointer < len(ct): + if last_query_pointer != query_pointer: + q = sorted_queries[query_pointer] + if match_start is not None: + ct_pointer = match_start + match_start = None + last_query_pointer = query_pointer + c = ct.iloc[ct_pointer] + q_chr = str(q.chr) + c_chr = c.chr if q_chr < c_chr: - previous_query = current_query_coords - current_query_coords = next(sorted_query_generator) - new_query = True + query_pointer += 1 continue if q_chr > c_chr: - current_cache_coords = next(sorted_cache_generator) + ct_pointer += 1 continue - q_start = int(current_query_coords['start']) - c_start = current_cache_coords['start'] - q_stop = int(current_query_coords['stop']) - c_stop = current_cache_coords['stop'] + q_start = int(q.start) + c_start = c.start + q_stop = int(q.stop) + c_stop = c.stop if q_start > c_stop: - if review: - current_cache_coords = review.pop(0) - else: - current_cache_coords = next(sorted_cache_generator) + ct_pointer += 1 continue if q_stop < c_start: - previous_query = current_query_coords - current_query_coords = next(sorted_query_generator) - new_query = True + query_pointer += 1 continue if search_mode == 'any': - yield (current_query_coords, current_cache_coords) - review.append(current_cache_coords) - current_cache_coords = next(sorted_cache_generator) - continue - q_alt = current_query_coords.get('alt', None) - c_alt = current_cache_coords.get('alt', None) - if q_start == c_start and q_stop == c_stop: - if search_mode == 'exact' and q_alt: - if c_alt is None or q_alt > c_alt: - if review: - current_cache_coords = review.pop(0) - else: - current_cache_coords = next(sorted_cache_generator) - continue - elif q_alt < c_alt: - previous_query = current_query_coords - current_query_coords = next(sorted_query_generator) - new_query = True - continue - yield (current_query_coords, current_cache_coords) - review.append(current_cache_coords) - current_cache_coords = next(sorted_cache_generator) - continue - if search_mode == 'include_smaller': + matches[q].append(c.to_dict()) + elif search_mode == 'exact' and q_start == c_start and q_stop == c_stop: + q_alt = q.alt + c_alt = c.alt + if not (q_alt and c_alt and q_alt != c_alt): + matches[q].append(Match(**c.to_dict())) + elif search_mode == 'include_smaller': raise NotImplementedError - if search_mode == 'include_larger': + elif search_mode == 'include_larger': raise NotImplementedError - if review: - current_cache_coords = review.pop(0) - else: - current_cache_coords = next(sorted_cache_generator) + if match_start is None: + match_start = ct_pointer + ct_pointer += 1 + return dict(matches) def get_all_variant_ids(): diff --git a/civicpy/tests/test_civic.py b/civicpy/tests/test_civic.py index beb59a2..e8e6ea0 100644 --- a/civicpy/tests/test_civic.py +++ b/civicpy/tests/test_civic.py @@ -1,6 +1,6 @@ import pytest from civicpy import civic -from collections import defaultdict +from civicpy.civic import CoordinateQuery ELEMENTS = [ 'Assertion' @@ -70,44 +70,21 @@ def test_get_source_ids(self, v600e): class TestCoordinateSearch(object): def test_search_assertions(self): - coordinates = { - 'chr': 7, - 'start': 140453136, - 'stop': 140453136, - 'alt': 'T' - } - assertions = civic.search_assertions_by_coordinates(coordinates) + query = CoordinateQuery('7', 140453136, 140453136, 'T') + assertions = civic.search_assertions_by_coordinates(query) assertion_ids = [x.id for x in assertions] v600e_assertion_ids = (7, 10, 12, 20) v600k_assertion_ids = (11, 13) assert set(assertion_ids) == set(v600e_assertion_ids + v600k_assertion_ids) - assertions = civic.search_assertions_by_coordinates(coordinates, search_mode='exact') + assertions = civic.search_assertions_by_coordinates(query, search_mode='exact') assertion_ids = [x.id for x in assertions] assert set(assertion_ids) == set(v600e_assertion_ids) def test_bulk_search_variants(self): - def coord_gen(): - coordinate_sets = [ - { - 'chr': '7', - 'start': 140453136, - 'stop': 140453136, - 'alt': 'T' - }, - { - 'chr': '7', - 'start': 140453136, - 'stop': 140453137, - 'alt': 'TT' - }, - ] - for c in coordinate_sets: - yield c - gen = coord_gen() - search_results = list(civic.bulk_search_variants_by_coordinates(gen)) - results_dict = defaultdict(list) - for q, r in search_results: - k = (q['chr'], q['start'], q['stop'], q['alt']) - results_dict[k].append(civic.CACHE[r['v_hash']]) - active_variants = [v for v in results_dict[('7', 140453136, 140453136, 'T')] if len(v.evidence_items)] - assert len(active_variants) >= 12 + sorted_queries = [ + CoordinateQuery('7', 140453136, 140453136, 'T'), + CoordinateQuery('7', 140453136, 140453137, 'TT') + ] + search_results = civic.bulk_search_variants_by_coordinates(sorted_queries) + assert len(search_results[sorted_queries[0]]) >= 12 + assert len(search_results[sorted_queries[1]]) >= len(search_results[sorted_queries[0]]) From 64061b366d0078dfe04945fa73bcaef2e72a87fd Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Sat, 30 Mar 2019 22:22:43 -0400 Subject: [PATCH 16/17] add project GENIE analysis --- .gitignore | 3 +- analysis/Project GENIE.ipynb | 419 +++++++++++++++++++++++++++++++++++ requirements_dev.txt | 2 + 3 files changed, 423 insertions(+), 1 deletion(-) create mode 100644 analysis/Project GENIE.ipynb create mode 100644 requirements_dev.txt diff --git a/.gitignore b/.gitignore index e94e44b..269eac5 100644 --- a/.gitignore +++ b/.gitignore @@ -101,5 +101,6 @@ ENV/ # mypy .mypy_cache/ -# data folder +# data folders civicpy/data/ +analysis/data/ diff --git a/analysis/Project GENIE.ipynb b/analysis/Project GENIE.ipynb new file mode 100644 index 0000000..8498ab5 --- /dev/null +++ b/analysis/Project GENIE.ipynb @@ -0,0 +1,419 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading the Project GENIE Cohort" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "from collections import defaultdict, OrderedDict\n", + "from timeit import default_timer\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def get_coordinates_from_genie_record(record):\n", + " assert record['NCBI_Build'] == 'GRCh37'\n", + " chromosome = str(record['Chromosome'])\n", + " start = int(record['Start_Position'])\n", + " stop = int(record['End_Position'])\n", + " if record['Reference_Allele'] != record['Tumor_Seq_Allele1']:\n", + " alt = record['Tumor_Seq_Allele1']\n", + " else:\n", + " alt = record['Tumor_Seq_Allele2']\n", + " if alt == '-':\n", + " alt = None\n", + " d = OrderedDict([\n", + " ('chr', chromosome),\n", + " ('start', start),\n", + " ('stop', stop),\n", + " ('alt', alt),\n", + " ('barcode', record['Tumor_Sample_Barcode'])\n", + " ])\n", + " return d\n", + "\n", + "def genie_record_generator(records):\n", + " for r in records:\n", + " yield get_coordinates_from_genie_record(r)\n", + " \n", + "def genie_caster(records):\n", + " for r in records:\n", + " d = OrderedDict([\n", + " ('chr', r['chr']),\n", + " ('start', int(r['start'])),\n", + " ('stop', int(r['stop'])),\n", + " ('alt', r['alt']),\n", + " ('barcode', r['barcode'])\n", + " ])\n", + " yield d" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# GENIE data downloaded from https://www.synapse.org/#!Synapse:syn17394041\n", + "\n", + "with open('data/data_mutations_extended_5.0-public.txt', 'r') as f:\n", + " genie_samples = next(f).split()[1:]\n", + " genie_file_reader = csv.DictReader(f, delimiter='\\t')\n", + " df = pd.DataFrame.from_dict(genie_record_generator(genie_file_reader))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# sort and save the GENIE data in a compatible format\n", + "\n", + "df.columns = ['chr', 'start', 'stop', 'alt', 'key']\n", + "df.sort_values(by=['chr', 'start', 'stop', 'alt', 'key'], inplace=True)\n", + "df.to_csv('data/genie_5.0_sorted.txt', sep='\\t', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Searching CIViC" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "from civicpy import civic\n", + "from collections import Counter\n", + "civic.load_cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "80.9 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "coords = civic.CoordinateQuery(chr='7', start=140453136, stop=140453136)\n", + "civic.search_variants_by_coordinates(coords, search_mode='any')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "coords = civic.CoordinateQuery(chr='7', start=140453136, stop=140453136)\n", + "x = civic.search_variants_by_coordinates(coords, search_mode='include_larger')\n", + "len(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['AGK-BRAF', 'TRIM24-BRAF', 'AKAP9-BRAF']\n", + "['PAPSS1-BRAF', 'BRAF-CUL1', 'AMPLIFICATION']\n", + "['V600D', 'WILD TYPE', 'V600_K601DELINSD']\n", + "['KIAA1549-BRAF', 'V600E+V600M', 'V600E AMPLIFICATION']\n", + "['ZKSCAN1-BRAF', 'V600E', 'PPFIBP2-BRAF']\n", + "['V600R', 'V600', 'V600K']\n", + "['MUTATION']\n" + ] + } + ], + "source": [ + "match_names = [v.name for v in x]\n", + "for _ in range(0, len(match_names), 3):\n", + " print(match_names[_:_+3])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "394.33967355799996\n" + ] + } + ], + "source": [ + "tick = default_timer()\n", + "records = [civic.CoordinateQuery(**x) for x in df.to_dict('records')]\n", + "exact_results = civic.bulk_search_variants_by_coordinates(records, search_mode='exact')\n", + "tock = default_timer()\n", + "print(tock-tick)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "418.74014411200017\n" + ] + } + ], + "source": [ + "tick = default_timer()\n", + "records = [civic.CoordinateQuery(**x) for x in df.to_dict('records')]\n", + "permissive_results = civic.bulk_search_variants_by_coordinates(records, search_mode='any')\n", + "tock = default_timer()\n", + "print(tock-tick)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DELETE EVERYTHING BELOW THIS" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "from importlib import reload\n", + "reload(civic)\n", + "civic.load_cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Int64Index([103], dtype='int64')" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.start[df.start.sort_values() <= 533873].index" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "x = df.start.sort_values()\n", + "a = x[x <= 36932096].index\n", + "b = x[x >= 36932096].index" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Int64Index([16], dtype='int64')" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a & b" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "v600e = civic.get_variant_by_id(12)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "missed_results = list()\n", + "with open('data/genie_5.0_sorted.txt', 'r') as f:\n", + " reader = csv.DictReader(f, delimiter='\\t')\n", + " for r in reader:\n", + " v = r.values()\n", + " if v not in exact_results:\n", + " missed_results.append(v)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "59437" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(genie_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2696" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(civic.CACHE)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1520" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(civic.COORDINATE_TABLE)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1520, 5)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "civic.COORDINATE_TABLE.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..9ba7a5e --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,2 @@ +jupyter==1.0.0 +notebook==5.7.7 \ No newline at end of file From 1753292e939331fe93d4653e1c226fe210ac0462 Mon Sep 17 00:00:00 2001 From: "Alex H. Wagner, PhD" Date: Mon, 1 Apr 2019 21:30:44 -0400 Subject: [PATCH 17/17] add analysis from AACR --- analysis/Project GENIE.ipynb | 289 +++++++++++++++++++++++------------ civicpy/__version__.py | 2 +- requirements_dev.txt | 3 +- 3 files changed, 192 insertions(+), 102 deletions(-) diff --git a/analysis/Project GENIE.ipynb b/analysis/Project GENIE.ipynb index 8498ab5..5e845fc 100644 --- a/analysis/Project GENIE.ipynb +++ b/analysis/Project GENIE.ipynb @@ -14,7 +14,7 @@ "outputs": [], "source": [ "import csv\n", - "from collections import defaultdict, OrderedDict\n", + "from collections import defaultdict, OrderedDict, Counter\n", "from timeit import default_timer\n", "import pandas as pd" ] @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -97,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -108,14 +108,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "80.9 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "82.9 ms ± 968 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -127,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -136,7 +136,7 @@ "19" ] }, - "execution_count": 18, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -149,17 +149,17 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['AGK-BRAF', 'TRIM24-BRAF', 'AKAP9-BRAF']\n", - "['PAPSS1-BRAF', 'BRAF-CUL1', 'AMPLIFICATION']\n", + "['AGK-BRAF', 'KIAA1549-BRAF', 'TRIM24-BRAF']\n", + "['PAPSS1-BRAF', 'AKAP9-BRAF', 'AMPLIFICATION']\n", "['V600D', 'WILD TYPE', 'V600_K601DELINSD']\n", - "['KIAA1549-BRAF', 'V600E+V600M', 'V600E AMPLIFICATION']\n", + "['BRAF-CUL1', 'V600E+V600M', 'V600E AMPLIFICATION']\n", "['ZKSCAN1-BRAF', 'V600E', 'PPFIBP2-BRAF']\n", "['V600R', 'V600', 'V600K']\n", "['MUTATION']\n" @@ -174,35 +174,44 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 9, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "394.33967355799996\n" - ] - } - ], + "outputs": [], "source": [ - "tick = default_timer()\n", - "records = [civic.CoordinateQuery(**x) for x in df.to_dict('records')]\n", - "exact_results = civic.bulk_search_variants_by_coordinates(records, search_mode='exact')\n", - "tock = default_timer()\n", - "print(tock-tick)" + "def time_search(df, mode='any', subset=None):\n", + " if subset:\n", + " records = [civic.CoordinateQuery(**x) for x in df.sample(subset).sort_values(by=['chr', 'start', 'stop', 'alt', 'key']).to_dict('records')]\n", + " else:\n", + " records = [civic.CoordinateQuery(**x) for x in df.to_dict('records')]\n", + " tick = default_timer()\n", + " results = civic.bulk_search_variants_by_coordinates(records, search_mode=mode)\n", + " tock = default_timer()\n", + " return tock-tick, results" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "subsets = [1, 3, 10, 30, 100, 300, 1000, 3000, 10000, 30000, 100000, 300000]\n", + "timings = dict()\n", + "for subset in subsets:\n", + " timings[subset], _ = time_search(df, mode='exact', subset=subset)\n", + "timings[len(df)], exact_results = time_search(df, mode='exact')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "418.74014411200017\n" + "436.1503577210001\n" ] } ], @@ -218,181 +227,261 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# DELETE EVERYTHING BELOW THIS" + "# Results" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "from importlib import reload\n", - "reload(civic)\n", - "civic.load_cache()" + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from numpy.polynomial.polynomial import polyfit" ] }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 14, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Int64Index([103], dtype='int64')" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], + "source": [ + "exact_ct = len(exact_results)\n", + "permissive_ct = len(permissive_results)\n", + "full_ct = len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "pmsv_xs = permissive_ct - exact_ct\n", + "rmdr = full_ct - permissive_ct" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], "source": [ - "df.start[df.start.sort_values() <= 533873].index" + "exact_tumor = defaultdict(list)\n", + "for q in exact_results.keys():\n", + " exact_tumor[q.key].extend(exact_results[q])\n", + "permissive_tumor = defaultdict(list)\n", + "for q in permissive_results.keys():\n", + " permissive_tumor[q.key].extend(permissive_results[q])" ] }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "x = df.start.sort_values()\n", - "a = x[x <= 36932096].index\n", - "b = x[x >= 36932096].index" + "genie_samples_ct = len(genie_samples)\n", + "no_variant_ct = len(set(genie_samples) - set(df.key))\n", + "perm_tum_ct = len(permissive_tumor)\n", + "exct_tum_ct = len(exact_tumor)\n", + "no_match_ct = genie_samples_ct - no_variant_ct - perm_tum_ct\n", + "perm_tum_xs = perm_tum_ct - exct_tum_ct" ] }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAO4AAADuCAYAAAA+7jsiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xd8FGX+B/DP88zMzs62JKQREkoILSH0IlUMgoAoqFgQFeyevXGi2MB61kPsir2DSLOAiNJFei+hhQ4hfevMzszz+wO9n96BBMjM7Cbzfr146evO7PebZT877SmEMQabzRZfqNUN2Gy2U2cH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0N2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0N2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0N2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0N2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0O81Q3YTokEIAdAcwDN5KhWX9H0ZE1HPQaWSAAfIcTHUeLmKZF0Bk1nTNYZkxlDmDEWBhBmQJAjpEpycLsEju4BsA/AXgA7APit+/Vs1WUHNza5AJwFoENAVtuomp4vcDRb5GliaVAJ7SkNsR3FQeloQBaCsoagrCIoqwjIGoKKipCsIaRo4CiBKFA4eQqnwEHkKUSBg1OgcAkcUr0iy0ySIlmJktIg0UnSfKKk6iwsR/W9HCXrvU5+AYBfAWwGoFn6jtj+gtgbW8eEegB6RqJaX1nVz3M5uOZFJaHQmn0V0u6jQcfesjD2loVwuFKGZvDfV4rHgUb1XGia6kanxonBDo0SWaIk8CFFW+8WuZ8Eji4BsAxAmaGN2P6WHVxr8ADOCcnqcI2xfg6eZmw95A8v3VnmXb23gm46UIWIqlvd438kSgLyM31o1zBB65qdFGiR7pEUVd/vFLjPBI5OAbARgP1BMpEdXPPwAPoEZfUajpJLDlZE2Kz1hzzLd5fTwsMBw4+kNYkSoG1WAvrnpSnntU6POnnq5yj50ilwXwFYDiB2vnVqKTu4xuIAnB2U1ZEcJcMOVUYwc90h99zNxfRgRcTq3mpMy/oe9MtNUwflp4eTXA5NZ+xrt8i/BmCd1b3VVnZwjeFTdXajompjjvoVacbaQ+65m4/QA7UorCfSONmFga3T1eFdshRCsN0nCc8CmAZAsbq32sQObs1qGla00ZTg2qU7y9iHS/e4NhyosronS3CEoE/LFFzbo5E/J9WjUUpeF3n6BoCDVvdWG9jBPXMEwNn+SPRhjpLeX686QL9Yvt9xpEq2uq+Y0TTVjRFdsyKD2tSHqrG5Xif/KOzT6DNiB/fM9ApE1DcDstrk/SV73N+uP0QiUfu+zIl4RA5D2zfQbz67iUxAZnuc/D8B7LS6r3hkB/f0tAxE1ImKpvd6ee521w8bjtjPQk6BJHC4ultDdVSPxlEw9rlL5B8BcNjqvuKJHdxTkxaS1WdAMGLSoiLH57/t5xTNPsKerkRJwA29m8jDOjbQAbzmFLhnAFRY3Vc8sINbPS5F1UfrjI2ZsfYQ99aCXWJlWLW6p1oj3Sfi9oKm4X65aZrA0Xs4St6HPaDjb9nBPbmCkKJ+vqKo3PfSjztc+8vDVvdTazVP9+DZi1sH07ziOo+TvwpAkdU9xSo7uCfmC8rqxKimX/b4zC2uRdtLre6nTuAIwcgejdQbezdReEoeFjg6EfZIrP9hB/f4CsKKNnnu5mLPiz8WOgOyPTHGbI2TXXjm4rxgw3quHR6RHw5gq9U9xRI7uH8lhhXtBUXTb3x42iZp6U57AoyVCIDLu2Tpd/XNkSnFeJHnnod97QvADu6fNQ/K6g+r91ZkPD5ji6siHLW6H9vvMhKcmHBF22CDROdCt8hfAXuyvx3c3/UPK9rUCT/tcE9ZdcBezicGCRzB2PNbRvrnpRW7HPwA1PFT57oeXCKr+n2Kqj1571cbpNV77UeIse7iDg300QOahyWBuwbHJi/USXU5uGJQVj8sCSgX3vbZWvehyto/c6e2yMvwYuKV7UKSwL0pObgxqIPL6tTV4NYPyuqclXsqmj/0zUbJHl8cfxJdAv59eZtgTppnlUfkzwcQtLonM9XF4LYOK9r8j37dk/DuwiKhzv32tQhHCMYNyY2c0zJlq1vkC1CHhkvWteC2DSvagqe+25rww8YjxOpmbGeOABgzsIU8uG39IrfI9wZw1OqezFCXgts+rGjzx83a4pu7udgObS1zR0FTZXiXrEMuke8J4IDV/RitrgS3Uziq/fLY9M2eeVuP2qGtpa7t0Vi9sXeTEpeD6wlgl9X9GKkuBPessKLNHTttk3dBYYnVvdQICiA/y4fOjZOQk+pGfZ+oJzqY7uF1ODlCeI4QSo59PzHGoDNA0xmLaGBBjaI4zLjDlRFSVBrCyj0V2HywCnot+Rhc1ilTv6dfs3LJwfUGsMXqfoxS24PbI6xoc8ZM3ehZvCM+JwnwhKAgNxX9clPRNlVQk5yUCpKL6uEA1IPbNe3QTmhlB6leeZToVSXQq0rA5CBA/jixOPZPIrrAJaSC+lJAk+ozmpylcenZRGjQjCOCA3I4rB8K6mzdYZn7YeNhrCiK3/s8F7Stzx4a1LJMcnCdAOyxuh8j1Obg5kai2vLRUzZ44m3McYeGCRhxVkPWOZ3XfV43p5UfYfLG+Sy67Teq7t8Kdf9WsHDNjfqj3mTwDXPBN8yDkNdTE9ucwxHegdKAoi7cE+I//W0f9pSGaqyeGUac1VC7rU/2AZfIdwJQO061/qS2Bjc1pGjrn5u9LX3WusNxcU2bl+HFjb0as+6ZIhM4QiMrf9DkVd9z8sYFYH7zv3i4jGYQ88+G2Ol8zdG6FxcOy9r8vRFu0qIi7CmLjznJd52bo1zWKbPQLfLdUMue89bG4DqDsvrrlFUH8ibO2+mwupm/IwkU9/RvhsHNPLrT6aCR5d9q4fmfcsrGhQCLoUEhgghnh/MgnTtKE1ufzVUEwtqX68q595YUxfy18dMX5UV6t0hZ5BH5QahFI6xqW3BJUFa/XlFUPuj+yRukWP3NGiVJeGhgM71LlptGi9bpwVmvUnnNj4Aa+2uGE8kH51kXwn3BnYykNsKC3QH8a84OUhqIzd55SjBpVMdgszT3+y4Hf5fV/dSUWhXcSFR7cl9Z+N6R7690yzG0adYf8hv48OTgplrDJImLLJ+lBae/zKn7Nlvd1mkTmnWG55LRmqNtAbf+QEAfO7OQHo7B9aR9Th5f3tw1lOxx3C9w9C2r+6kJtSa4ms5GVISi7w5/Z7mrNBhb3/71fQ68cHErLTfdxYVmv6MHv32N6pXFVrdVY7j0pvAMe0B3dr+YLtxdyR6ZsZWElNj64mxYT8IXN3UJuxx8bwCrrO7nTNWW4LYKK9qqUR+sdO0ojp17EE6e4smhrVhBjo9Efpul+T97jNPLD1ndlmH4zBbw3fCSzud0JlM2lOP5OYUxdWNwQOs09ugFuYdcDq4V4nwyfm0IriMoq+snztvZPJYmwQ/r2ABjCrKYtmc9q5p0H43nU+JT5cjvA99NE3TFnYJ7Z+ykK2PomfATQ3MjBS1TZ7lF/nKrezkTcR/csKK9tG5/5T9u+2yty+peACBBEvDOiNZqTiLPV751ByLLplvdkjUIhWvQLcx75eNk/s5K9sDULSQWTp6dAsXX/zgrWD/BeSsl5BOr+zld8R7cPlXh6A8Xv7FMKg9Zv0bURe0zMPbchkzdvEivfPM2TvfH52itmsRlNEPiPR/qakoT3DN9R0wcfZune/DhtZ2CkoPrAGC71f2cjngObmJY0bY/MHVjypIYGM746hX5evcsJ63TR9kTIRTuC+/QPZeNpR+vLMbEn60f/39Flyz9joKm290i3w5A7N0KP4mYuSY8VQFZ/eD7jYe9Voc2wclj9m0dta7uclIyursd2uNhOoIzJ9LSxwbgmnwPPrymjc5b/Mn7asV+um5/ZcNIVHvC2k5OT7wGd1hVONr/pTnbRSubaJflw5zbOjLPtp9R8kBvopXss7KdmKfuXoeS+7uilboXc+/orDdIsPSvD+NnbXExhjsBNLe0kdMQj8F1hRTtrcdnbnFHLBxkMbR9Bt4bkYfw5CdZ5YRrOUTtxeaqQ68qQemj/Sn/2xQ2/aZ2rH3DBMt6OepX8PbC3Y5ARH0Pf0yjihNxF9xIVHtwRVGZa9Ue625yXN2tIR7t3xgV/74WoR/ejLv30HKaiqpJ93LBL8azd4fnolvTJMta+fy3fVxVJNoRwEWWNXEa4u3mVMNIVNs27M3fJKuWU72pdxPc0i0N5c9dAWXTIkt6qE2kc65mvutfJGNm7cC8rdbMvuvSJAn/vqLtUZeDawIgLuYvxtXRIhBRX/t02T7eqtDeWZCNW7qmoOyJIXZoa0h4/qek8s1b2XNDmuHCtvUt6WFFUTl+3VnqjkS1xy1p4DTEU3B7q7re7/0lRYIVxW/u3QSjOqWg9LEBiO5YaUULtVbk12mkYsK1eHxgE5zTIsWSHp6fUxhXN6riJbhcQFYnPTd7u8uKxcsHt0nHzd3ro+yZYVD3bjK9fl0gr5qNqnfvYS8MzUF+ptf0+kf9CiYtLnL4I+qLphc/DfES3MsPlIcbzNl0xPTC7RsmYPzAbFRMvBHRbctMr1+XhBd+SYLTXtLeG57L0n3mPyr6asV+jhL0B9DS9OKnKB6CSwIR9cnXftnlMbtwhk/E21e0Yv5PH2Xyim/NLl8nBae9yCm/fq1Pua6N7jR5lEZI0fDpsn1CIKKON7XwaYiH4A4oDynpZo+QEijBF6PydXn+Jyz047tx9Ywv3lW9cw8n7F2Dj0a2MX2pmc+X7+M5SoYCaGJ27VMR88H1R6JPvr1wt+lH2zevzNecJTvh/+ihmH+Pah2mo/zlUbSph3G39sk2tbQ/ouKrFfu5oKw+ZmrhUxTrH8quUY3l/bjJ3NUiLuuUifbpAlf+/BUUeq1ZXyyusEAZyl8YjhvOSkdeA3NvVn28bK/AUXIlgAamFj4FvNUN/B1/RH3ivcVFTtXEpQTr+0Q80DcL5S9dDb3C/Jthp4qILvDZ7cAlZx5b7NyXwrikjAhNqh+l7kSmB8qJXnbAoZUdEo8tmn4UWsl+RHevQ6x/KUULlyMw9QX9ncvvRcGrq2hUM+dzUBGKYtqag2Ro+wZjXQ7uDlOKnqJYHjnVMiira/r/e7Gp+9fOuqW9lrj2G1I16b7YPBsRRDjyesHZcYAitjs3wqU1llgkuBNM3w3ecZA4PfsIIUdxbNe6CgBJANKYrqWzcKAxNDUTHJ9DeEd9pXC5LK+e7ZU3zCfqvs1ALH4WCEG98bP1tXwOu/WLjZxZZVO9Dsy8o3tY5Lk0AAGz6lZXzAY3JKuTPl++f9Qb83eZdlZwfc9GuLVzEjt6ez5hcuysXQUAQk4neC66Lyh26C8wJbyNSJ4phBPmAVgN4HSGkqUBOEcP+weD4TwAiZHlM1nw+zcltWh9jfZ+pri0Jkh5aRlu+XIrVu01b4z6W1e3D3TNrncHgI9MK1pNsRpchxzVyoa99Zv7YIU5wxs9Dh4/39WRVU28gcirfjCl5kkRCmeXwfAMe8DP1c+JEIfzWcLxHwEwYmuDxiyqjGJa9G7t8C7BP+VZr7zyu5g5CrsvHq2zQXezgldXmXbU7dsqFY9d0GqVTxI6m1WzumLzdBAYuLskpJkVWgB4+dJWurplKYuJ0BIK13k3srS3tgYTbn1jk5Dd7noqeRoQjv83jAktAOwhguMJ6nSnC03aXJd4+5vbUieuDTi7xcakmeDMCdQtl9M7++aYVnNhYQk4SloDaGpa0WqKyeD6I9Fbpq4+4DOrXl6GFx2zPLTy7Tstfz+4Bs2R8vzioHfEuJVcUv3+1J2QD+BrAKpJLagAplJXQi6fnn1Zwq2v76z38LQQTUgzqfwJaCoqXruJjOyUigTJnKsnVWf4bsNhIqv69aYUPAWWf1CPI8HBc+f+uNm8R0DPDcnRQt+/pVu65jGhcA+5R0t5blGIz2wxhrp83QD8al1DYABmU8mb52jV/dXUV1aHnT0vtfS8ObrtN0Q3LtD+dVGuaXcrp605KGq6fgtiLCsx1czvhq0sKo/6I+YcYHo3q4cMn8gFZk6w7L344yjruWT0Kiq68gnveB1ALKxmCgAKEV0PUpevT8LNE/cmjfkqRH3WzOABgOD3b3BdGvtofZPGMm87HEBJQBEB9DGlYDXFXHArw9Fbp605aNpIqbH9G2vBmRN0FrRmRQ1Hbg+kPLsgzGe2eJC6fN0B7LakkZNbQSVPK0frs99JeXl5mMtoZmpxmlQfvpsnakljJqMqrLJHLmhl2hfblJUHPIGIerNZ9aoj1oKb6eBo/uLt5oxL7pqdhFQXzwW/fd2S90Fs3x9JY78JUclzIeEdryF2jrInEqFO973UlXB7ytM/h/lGrQ0vSBNS4bv+BS114jrsb3UhLn9/Ha79ZD3p2CiR1nOZMzV7YWEJ4SgZiBhalyrWgnv+kp2lmqKZ8/l98NxGWvDbV3UWMf/5urPbRSzx/k/8VHT1AzDP9AbOAOGFD4jLd23yE3NCQk5HY2p468F7zdN66msbcbjtZRjx0Xpc+cE6bldJCHvLwli6s0x75IJWplxz7ysPIyirPIA2ZtSrjpgKbmU4OmxBYYnbjFqZCU40SnZzobnvm/4eSAVX6wm3v1VJRVdvWHsD6rQRSidTl++Keo99G3Lk9qi513UnwnPl43raG5tR2uUqNurTjbjsvbVc4ZG/Doh5d9FurkuTJMKZdAxcsL1E0HU20JxqJxdLweWdAu29bKdRjyn/avSAZkxZ/7Nq9naXYqdB8F33QiUVXd0ArDO1eM37lkqeIUkPTQ3zWa3O6IWI5IXn8rF62ptbUNXzOnbjF1tw8btruU0Hj7+p3rbDAZQHFf2qbo3OqG51LdpeKvpl9VJTilVDLAW3D6+EpH9dnMuu6JIFoydR98iSEJz1qqmTLLgGzZF413th6nQPArDNzNoGmkcE8fakMZODRDz1fdeI6Ib74vv1tLe2ItjnZnbr14W48O213Np9lSf92ckrD5BLOjQw5fHDyqJySALXDoApZ4QnEzPBZZp6VnTLEr3lxs/YfW2JvmR0Dyy8o7028Yo2bFB+Omoyx0PaZYCEKqBsWVJzL3oyDgn1xn4TIoJ4N4DfzCtsPMLxH1Bf8ncJN00IV/uHHBLcF97F0t7eBvm8u9g903fi/LfWcit2l1f7JX7YeISkJ4h8qsdxOm2fkpCiYUdxIALgHMOLVUPMTOtjoaoLQj9/zMkrvoX/k0dA3Ilw5PXiOrTvr3Xr3Z8+OTiHVPpD2srDUW7W+kNYvOP0T6mv6pSmh39+09Q7hAnXPR+h3no/EF5418y6ZqGS9waxywWbpLOvbBhe+MWJ31tBhKv/9cx76UOkQmH6P78r4n7ZWnJa449Lgwo27K/SbitoSsfP2mr43+e8rUc9TVLcF7oc3HdG1zqZWAkuT5zujsrWpf/5H1iwAvKKbyGv+JYDAJqYDkd+H65nxwFa38EFHHG0wFF/RPt1f4SbvvYQ1u8/+akVAFACNE120rJlM4z5TY5D7DgQzp6XVlGn+wbTipovQCXPBb4bX14W3bnKpR4o/Ov/ywlw9R3JPMMfJX6V6g//uJ+bs6n4jCcMTFl1gHtgQHMNgOGTD1YVldOR3RsVmFDqpGJldlC+VnpwafGtraq91AGXng1H/tkQOw3SxNa9OcbADvqj+qKiADd11UHsLj3+gvTDOjbAmG4+dvTWVuYccTkeaa9vCnL1Mi4C8JMpNS3ENPXO6M7Vz5Y+0u/YtSDHQ+ozgnmvfJyEmKC9tPAQN3NdzQ0tdXAUv4zujZHvr8DOo8ZuQiAJHOb/s3dU4KgL5o0dP65YOeK2jZ7iHFDtyG6Ej+xGeN5HHADwDfOIr00feknnwdrwG7pwWlRhRZWaPn9nJTd19QEU+xUAwEVtU/XIkk8Bkx6mS31GMOJ0bUQdCC0AEI5/k2+Y+4DYcaCbuhPhvfoJFuEk9vziQ2TKqoM1fqhSNB3LdpVpw7s05J7+3tj7feGohopQNJLqFZsD2GJosZOIieCyqNw+unP1Gd2tU/dthrpvMwl9/yYHykFo2oGkt+lDR3YerN14exdOCYf1bRUaWtYTSMWyGeYcbXkHvCPGhakr4T5T6sUGlUreO5LGfDU9EqjSJyw9Qj9bvtnQ93vh9hJuVPdGppwubzvsZ6lesR0sDm5M3FVmkUC36J6NNdeLriG6YyWC014iZQ/35Q5fm4XAv6+iOTu+Ayc4SHTXmhor9Xdc/a7TCSesALD0pP9x7TIzpGg7Hp29l362fL/hxZbvLke6z2nKhefGg1UeRdWNGS52CmIiuODFPHXvZuNePypD2bQIekUxje7ZqJmySJpDgvfysRHqTrjf+GIxh7lFfvStfbKPP3qihh2qjCAc1VgfE/Yd2nY4QEOKVnNDxU5TLAQ3gfCCTys2flKM2LZAk9f/YsrvLHW7CCBkJYBVZtSLQd82SJTUpqnmjFdYt69CO6el8cEtPBKAyFPjZ1ecRCwEN189sjtkxtpGXHImlK1Lzbkp1ftyP3UnTjKjVozSCMEHQ9plKGYUW15UwefW9xp+KnWoMgJC4AKQbHStvxMLwW2sHdppfB+EgHrrcdHtxm+RSUQXHHk9RQB1esMhp8C9N7R9hkpN+Kpcv78SqT7RlC/l0oASAdDQjFonEgvBzdBK9hm+nAGf3R4sEgQLVW+gxpkQ2/cDk8NrAVR//F7ttJkQHOjQKNHwQrtLgvCK5uwSVhJQGIB0M2qdiOXBZUokSys9YPhgUzG/D7Sje01Zut/Z49Ig9SR9aEatWOfkuW+6N61n+GCFSFSHouloVd/47UqK/TKHOh9cOdRErzB+ah2f3Q7RA9uMv5CmHMQO/XkA5o2pjGEOnv7Uq3mysUOaflcSULR2WcYvDnqkKuIEUN/wQn/D8uCCsSzNhD16+NRGurp3s+HP+rjUxgDTqwAcNLpWnPgtO8Ut8SZc6B6qjLBGyac+tfBUHfXLfCSqZRle6G9YH1yOSzdjcy3qTWLakSLDPz18/aZAVInVBd+s4I9E9cM5acY/FtpbGiKZiU7D65QFo4hEdXNm8J+A5cElgljPjFUoiOiCXlVieB2uflOAFzYZXiiOaLq+KyPB+EDtKw9zyR7R8AXLSoMKAJZpdJ2/Y3lwQXmHHjZhsTbeQVjY+IE8fGZzhbp8Gw0vFEcEjm43I7iHKiPwOXnD72OUBhVQQlKNrvN3YiC4lEIzfoYU4QSimxHchrlhADsMLxRH3CJfmJkoRY2u4w+rEHnjhywrqg5CiPHLbvwN64NLqDm7vnM8YaEqw8vw6dkUsbuouVX2N0h0Gr6Dm6ozUGr8dE1dZyDE2tn0VgeXEkIImPHrKBPeAVPWT+YcBED1116qG8JOgTP8L1nVdFBi/N1rVWcgFi+DYfV8XI7pmg4zvkAIATPjyG47Hql9I0/C0od6GV5IVqN02Rhjt/khAKKqLhla5CSsDi4PXTcnuEyHI/8cEI4DcbpBnB4Q0Q3qdAOidOzfHU4QUQIcEogggghOMIeT6bxDZ4KD6bwAxgsAx4NRjoDyAKWEUA6EUkIIhUw5twswZ0eq+KFsKCkMjFn0sqF7QuXWa4qne9ytP/PoXEM/TwmJTtx2by/ZyBonY3VwOZi06FWEaaD3ToKsKkzRVRbVolC0KJN1BbIWZbKqQNYUhDWZRFQZshYhYbWSKsEokfUop2gKFE2FoilQ9GP/lLUooloUsh6Fcuz18Ma5jwRcgmT8LdT44vQrQVYhG3tzMKRGoOsMoaCxE5JEJw8Alp6+WR3cMDieByEwelofY8CI78fgSKiUwMD1psoiVVoDT5rxo+rjixTRZMPPqhxUAGMw/EDAUQLGmKXBtfrmlAYtKhPJ+PGluq4yF2/8gbBS9hMASYYXiiOKFs05ECg2/JowyemDrhl/BueUeOg6M2V1jxOxOrhgajRI3cYfoHRVYV6H8cPuiqoOSJqundlGOrVMSI2cvbnU+DnXqVISlJDxTygklwOMMXM2uToBy4MLLVpF3AmGlyFyWM/wGD/YZU3xFsEfDcXMrm4xgDg5R5utZcY/2k5zJeuB8qjhj2lcbgEAMWcT5xOwPrhMrzDjiCtUlZBMT5rhp1HrjhZC4p0dEQvvbWzI1JgmHAkZ/znP9KSzkqNBwx/kulwOUI6Yu83jf7H+w8VQTk044rJDO7lsX5bhNxRKIxXwKwENQK7RteJEp21lRaasO5XuqkeKDxl/6Sm5BDgcXM1tx3AarA8uxxUTbz3Dy6h7NqGJL8OUNYlWH9lCAPQ0o1asU3W189qj20xZ6jHNlUz376swvI7XJyocR42favY3LA8ukXxb+PRsw4+EyvaVqO9ONeX3XXFkg9uvBPuZUSvWBaOR8zaV7jD+upN3IlH04kA19tU9U2np3ggsHo9ufXApLeQb5Rm+tImybRkSRA9xcsYPalpTvBUc4foiFrZ1s1a2QPm2Sw+uNbxQy6QmCMphXdeNH8+Tcmyt6J2GF/oblgcXwHa+QQvj3+1oBOGIX89PaWZ4qV2V+3EwWOwAcIHhxWJYRJVvn7nzFyprxl/i5iY3hRw0fjA6IYAvwemCHVxs55KzjF8oCAB/ZA9rn9rKlCGWkzZM9VYpwcfNqBWjRAA3f1U425R5q+1Tc9Uj+4OGjwT0+pzQdD0EwISpZicWC8Gtgq6GaFKG4YW0DQu47hltjX9CD2De3mXQdLUlgM5m1ItBlxSW78GeKnPWzGud3IzbtO6Q4Tcfk1NciEb1fUbXOZlYCC6YEtnDZ+QYXie8aDJyk3M4YsLWuBrT8eGmGU6/EnzY8GIxyK8Ex3yyZZbxixwDSBS9SJYSyIa1xn9JpKZ7QAkxcIe66omJ4IJyG/jMloaXUfdsgKarrGmCOStrTtsxj/KUHwiLt6uwQAfGWIsF+1aYUqxngw6o9Ic0VTX+KqhJ03phySUsNLzQScREcKk7YYEjr6cpi2bjcJHeO7OjKdfYtmSmAAAXc0lEQVS5gWgIM3f+TMOqPNaMejGCD0ZDn7669nNRNWkCzYAmPdW9hVWm3MFv0rReFMByM2r9nZgILoCljryeplx7qvM/54bkFJgSXAB4e/3XjqgWHQXA2GUZYoSiRUfvqNjXeOp2Yyez/4GnPDqn5/ML5xl/k5cXKJLquSQA6wwvdhKxEtzN1J3E0QTjJwGE5ryD+u4UmulJM7wWAFTIVXh4yUQprEa+BlDb5+m21Jj22CNLJpqzKS6ATml5iEQV/dAB4xcCzMxKgCyruwFYuvoFEDvB1ZkcXCO06Gp8pagM7ehetX+j7qYddZccXIPvdy/yBKKh98yqaQEuEA199eqaz8UDAfPG35/bqJtWvNecq6ysxongOLrYlGInESvBBXElzHG06mH42rsAoM3/gh9q4ukyALy08kOnXwkO0Jl+pZl1zaJo0bv2Vh1q9tW22aZ9phxUwKDsXtzCuSbsrwygecvUgOjkLb8xBcRScDl+sdjmHFOWNQ3Omoh0dwpt7GtgRjkAQERTMHrBi25Zi74DwNJ9ZwzQXdW1p8YufsVtwsox/9G/cXfIEVUr3HrU8FqEADktUgQAPxlerBpiJrgAlvOZLZxEMuHRn6pA37lGH5l7oanrBm0p24V31k9xBqPhnwGkmFnbQPlhVZ7zwKKXXHv95s50G5k3RF+9+KApd5MzGyZC11kxgANm1DuZWApugMmh5WL7/qYUC30whg7K7s35HIauGPo/Pto8g/+68MeGwWh4KQDj5zMaKyesRhY8sexNjxkTCf6seWJjZHrS6bzZhabUa9U6TeMomWVKsWqIpeCCepI+kXoOC5pRK7prDbSKYvXSFv1NeQz1Z6+s+dQxfcfPjYPR8BIA5tzernl5YTWy/OVVHyXMKVpiyjznPxvRarC+e2sZ0zRzTs3bdmgQdIj8NFOKVUNMBRfATLHduTx4c/ZTUr54gh+ZO4Ty1PxVal9a9aHjy20/5ISikTUAjB/vWbM6hFV56dO/vZM0dftPpk9drOdMwHlNetBvp2425QvD7XEgJdXtABATN6aA2AvuYRaVt4utzzalWGTRV6BySB/QuIcp9f7b62u/ECas/qR+WJVXAuhtSROnhkZ19e6wGln86JJXfd/vXmT6kRYA/tH2cu3IwYBeWmLOY6D8dhksGtV/AmDKEjzVEWvBBZG8nzi7X2T4zm5/0KY8R+/peA0cVDCr5F98vf1HOmbRy4mVcmB2MBp+D4DxC3CdnsaBaGjpror9T1/53QOun/f9ZkloM9ypuKBpH2765xtM++x269XYL7mEd8yqVx2xF1yOn+Y8awgDMae10Jx34VRk7arcwaZf6/5h8YHVuHD67a55e5eNCKuRXQCGWtXLcRBN124Mq/KmDzZO63TVD2PcZt89/rO7OlylFh8Kqgf3Gz9SCgCSkl1IS/dSAHNMKVhNMRdcANsB7HO0LTCtoPzOvdyN+cNokmj8jgonEoiGMO7XN5x3/vxsvcPBks8CSuh7AMZPUv57OQElNG+f//CEUbPHuj/YNJ3XTdgS9USyE7LQJ6sz/9mk1abdlOjYOVPTdTYZMXSaDMRmcEFcvgnuATebcncZAOQV34Id3KHd1eEqy/fhXF28GRfNuMs9uXBOv7Aqbw9FI68DaG1iCwTAeX4l+EsoGtn42dbvel3+3Wj3joq9JrZwfA91uVHfvqmElZeac20LAF17Ng6LTj7mhqrGZnAJ/UJsW8BRb7JpNf0vXMkNaNyDa5Zo/aAmRY/itbWfC5fNus/95bYfbqqUA8urlOAGANcBMGoAv0fT9dsD0dDeff7DUyes/qRPv69vcL69frKg6qpBJavv/OzeLC+5KT16OMg43pyPbVajRLhcjhCAX00peAqISbtcnjI9VPVVYNqLw4IzJpj2uMF72xusuH0fNvy7f1Kz5pJWB0coemZ2xPCWAwMdUnM5lWmT3YL0DYD1APYApzXOUADQFkBXvxLsK1Dh/BVHNrKPNs1wry62fIGHv0h2JmLa0FcwZescDG58jubQnHTyJ2vIjkJjlzYecV2nUH67jCd4nj5naKHTELPBBdBFqyj+pfiWFm6YeF3le2+n9tWexXht7ecxubRqqlQPQ3LO0bpltAs0T2wkOHmRhNTIdgflf3MJ0goAm3BsITMVQBTHAur5/U96WJV7KJpS4OKl5kfDZfKa4q3cmuItrqUH1+JwyNI1vk/o1YKxWoLowcjZYzkAuLXdFbimxVBs33JUmz55A+evqvlZdl6fiAfH94sIAtcAQHmNFzhDsRxc6MGKzRWv3pwrr55tWk0+ux08T83FbT8/hXVHt5lW93Qlil60SGqM5omN0TqlWbBFYmNV5B2EIxzhKUdUXWNhVdZD0TDKIpV07dGtng0l28nm0p0IqaY9dTtt/Rt1x6Pd/sEunH4HqVT+f3uRRIcPE/o8qLVMzOZmz9rCli7YTWryozzgglZqr3Oafiw6+Rtq7lVrTkwHF8DVyvYVb5Y+fK6pA4q9I8ZBHnAdu2jm3SQYNWXCku04Mtwp+GrwS3ht7eeYXHj8pzG9MzthfNc7dcWvky8/XkP27z3zLUh4nuKxZweEnZLQEcDWM35BA8Tkzak/+ZLPyg04cs3dhsf/+Tg4y4r1R8/6h3XPPuo4J+fA6+c+ylYd2ayfKLQAsOjAKvSddi1dWLkMt9zdA5eOaKdJ0pkNpmnfOROMYSViNLRA7AdXJaLrQe/VT5i++HTg8UFcr/ptyKi8oXZ4LfBUz7s0nlD93gXPVesz+syKd8gl39+F5JYcHhzfDx26ZJ3WqSSlBAMuaBWUXMK40/l5s8R6cEEo/YzPyg048nqZWlevKkFw/BByc5tL6bkNz4rp64na5rrWF+md01vTa36/GVVdh4JHcdkP93LPrX0XF16ehztG99LT0k/tKqvTWQ2ZQ+Q3Afj5lH7QZDEfXPxx1L1qvOlHXXXnKkTeugtP9LiD5Ccbv+eQDejRoD1uyB9G7/j5GVIpn95etzN3/YJ+064j21FI7hpzNs4fmqsLwsm/A3ie4vyhuWFJEu4+rcImiofg/n7UbeU3+6gLAJFFX0L77g32Wt+HkeE2fhXKuqxtSgs83/t+vLDyfWws3X5Gr6XoKv65+CUycu6DaNE1iT04/lyWm5/+tz/TrVcTnXJ0GYBlZ1TcBLF+V/nPRkZ3r3ujZExv05b+/DPffR/rgTa9yDWzx5KScMw91ot7repl493+4/HRphmYtHFqjb/+qLyhuDnvcrZ3V7k+9Yv1XEX5X58WOBwcHn7qvLDkErrh2MCWmBYXR9zffcalZ++Vzr7Skm+aqpdHUveW5eyTgc+yNFe8rzgTW1rVy8Y7/cZhxo6fdSNCCwAfbZ6BAdNvIpWJRzH6kQKc07+ZTun/z0w8p38zlRDMRRyEFoivIy4AdNaDlQuLb8+XWMj4ncePx/fAl3o4rzu57sdHiJnrB9dWeck5eLvfY5hS+CObuOYzU+b4dklvjWe73acxmaNffbKGVFVEcN/DBWGHg2sFwPrZFNUQb8GFHvZ/EF48ZXjVu/c4rerBe99HutahH71p7jjEwqyZeNU7syOe7XUPPjTo9Phk7u04EpfnDIQia7pTEh4XBO4p05s4TXEXXAD1mBzaXfLYAJ+627otXLzXPA0y8AY8sPBl/HrI8q1k4s6ovKH6zW0upQzAVT+MMW0f3f82NKcvHuxyQ6nIOzIRA1uLVBc3btw4q3s4VWEQUuxo2e3c0E/vm7Oq3HEo638GrTyK8y8aB55y2uriLfF0v8AyAuXxZI87tEtyCmjo6WGEoxw7v8/N+H73IhI2eey0W5DwSsGDIY/DNRSA8buG1aC4/LARjv+QS220wzXwFktHNYXnfYTgYwMxssUg8nrfhzWP4LKynZiXJPrw4YCn9LMTm8F/dxcS3bIE/jdvJ9KO9fq7/ccxtyCZ2s+9HUfKAhWmA1hgauEaEJfBBaBTl+9y74hxEb5hrqWNRHeuRuU/8mgbjcfXF77MYmEifiwqaNgV04dORMODe1H1j1acXvb/GwJUPXkhlxoI6K8WjNXNWir37MxOGNikV6VbkG43pWANi8dr3P9gmnqdVrLv1aP3neVG1Popap5Rz0IYcCM+3jxTm7TxGy4WVo6wmtfhxiNn3aL1ymhH5Q/HkvBP7x//P+QdSHh9g7akajceXDSBM3IPohQpCVMv/HfY63D3B7DEsEIGitcjLoBjp8zUm/KT79p/WZ9aAIGPHoL/sUG4qsk5mD7kFb1tSgurW7JU94x2mDHkVfTgElF1Z4cThxYAVAWV93fneqbmkXs6Xm3Y8iMEBM/1vjcoUP7fiNPQAnF+xP1doh4Jbqt45bo0eZV5E+5Pxnfjy4wWXE1mFy3WXl79MVeX5vVmetJwf6dR2lnpbajyxRMk9N3r1f5ZWj8HvhcWs9fWT2ZfbPu+xg8sI/OGaDe1GbbRLbg649gqIXGpNgQXAHroocqfjt7bRdLLD1vdy39w6dlwPzRZYylZ3DsbprIphXOIrMXUKp81KsHhwT/aXaENzSng9O0rtaoXruJYoOyUX0do2Q2ex2bi0V9fZ/P2LquxQRl5yTl4t//4gMSLbQAU1dTrWqG2BBdMDj+mHt75QMnYvjFxvftnYvdL4LjuWU2XvNxb6yezb7b/RBTdlD28TSFyDoxoNVi/If8Syo7s1kITbuDUvRvP6DWd3S+B8463cOcvz2B18ZYz7jHZmYjJF7wUSnL6rgHwzRm/oMVqTXABED3sn6JsXDio/MURLsTg7+XsfSWEkU9oquji3lr/FZu1a4Hpzy5rUn1XCoa3HKRf2qI/1atKNfmtOzll3bwae333kLvBXfEQRs1+GLsq95/26wiUx4cDnw5m+zJfcfLiwzXWoIVqU3ABwKmH/b+GZr+T5/9ivGWDM05GKhgJ/oqHNN6Xws3d86s2uXAOt6l0h9VtVVvHtDxc1/oirXN6HqcdKNTCnzzCKet/MaSW54aXmNrnCnLl9/9EcejUT7sB4Jmed0d6Z3Wa7xakwQBqxYomtS24AJCqR4IbqibdlxZe+IUlG1NVF9+kLTzXPKWzll1JmVzFPt/6PZlTtISUy+bsi3Mqmic2wrmNuukXNO1DEgQ3yJqf4P/gAaKXG7+PkO/BKXp583ZkxPdjSCB6arsY3NTm0ujIvCE73ILUBYBpu2MYrTYGFwDymBz6rfTpiz3RrTG3CP3/4ni4h9wN2vcazZGSxR0MHNXn7V2GRQdW042lO2DFfj2UULRJaY7+jbprA5r0oBInAvu36crc97nwvI9M78f3/GKtyCORG358nFb3/sDg7LPZ2LNuKpV4Z1sA1u1UZoDaGlwA6K+H/DNKxw2S1KK4mGJ5jEOCq//1cPQZrmoZORylHFl5eJO2sngTV1i+B9vL98CII3KyMxH5Kc3QLrWl3iW9NWuW2JiLRiOM27eVRea+TyMLv4Cl9w0oB9/rG/SVoUPsvgUvnHSARr9G3dn4HrdXSbzYA0Bsbc1QA2pzcAHgEj1U9Unp44Nc6p4NVvdyWvgmbeHqOxKkVTdNTc0iTqeXypqCXRX7tE1lO0lxqIyWRSpRHqlCuVyF8kglqpQg9N//XikhEKgAkRPgFiSku5KR7k5GfVcqa+yrr2W600mWN52TeCfkUKXmOLIH0bU/caGFX0A/stvi3/6/uHxIeG29/t3BFezZ5ZNOuIhUn6zOeKbXPX6JF3sDqJVTt2p7cMF0fRiLBD4pfXyQFK/h/QtCITTvAkfbcyA07QCW0lDV3D6mixKlgkg4TqACJ+DYpnsAwKAxHTrToWsq06IRnQsHmFBZSrXiPVTdswHK5sWIFi638reqNprSCN6Xl7FJW2awDzdN/58BGj0y2uOFPqMDEi8WAFhpQYumqPXBBf4Ir//38J7Z80Wb9fimHeB5cjae+u1d9kPRov/cgOySno8JBWOCEu/sjxjcYa8mxfVY5eoilE4lTu+o5PE/hPnG+Va3YztD6q41CE+4Ho92u4V0rd8GwLHVNCYUjAlJvHMwanlogTpyxP0D0/XLWST4YfkLwyVl0yKr27GdIdfAWyBc8wQ+2fItG5k3xC/x4nkAfrO6LzPUqeD+rkCXQzOq3rvfHZ7/WZ0446jN6j3xo642bRt0O1xnATjzsZFxoi5+cH+hoqur7/oXi70jxikgMT1Gw3YiHI+EW9+ICI1b73A7XK1Qh0IL1M0j7h9S9ZB/rrJhfovyiTdIsTYxwXZixFsPSfd/FhKatFlBXb4LAZzeXiVxrC4HFwCcesj/pVa8u1/Zc1e49dIDJ/8Jm6WEFl2R9MCXIeKQ3qFO9z8Rx3Nqz0RdDy4AUKaExzI1Orbi9X9I8opvre7HdgLuC+/SvJePDRHRNQJAnf6LsoP7/7rpkeD0yOIpiZUfPCDap86xg7gTkXjXeyFHq267qeQdDGCP1T1ZzQ7uXyXooaqP9aqSc8ufH+5W98fshuR1htCiK5Lu/zREJM/H1Om5G0DtXULkFNjB/V+Eaep1LCq/6v/scWfox3dpLE7Kr+2I5IX36icj0tnDI1R0XQ9gmtU9xRI7uCfWUg9VfakdKWpe8dpNbnVfnXraYCmx0yAk3vpGCIJjGpW8dwKw9zX9L3Zw/x5lWvRmqNEXgz9OEgKTn3UwudbMxY45NCENCTe/EnLk96mgkudqAMYsq1EL2MGtngw9VPUqtOigyvdGuyJLzd9ZrlYTRLjPu1H3XD5WBuXeoKLrUQB1Zz3b02AH99T00kP+97UjuzOqPn3Eo2yYb3U/8Y1ykM6+knmvGh8mvONX6k64B4A9fasa7OCeOg7AcD3sf049uD3B/8kjHmXzYqt7ijtilwvgG/WvIPUkbqMu350AllrdUzyxg3v6eAAj9HDgOXX/Fo//00c9yhb7s/e3CIHYcQC8wx8LcKmNjvwe2NmAgRsF1VJ2cM8cz3T9GqaEnlX3bXEHZ77iiaz8HtDq5Ei84yKSF1LB1cwz5J4QEV37qTthHIDJqCVLpVrBDm7NEQBcqgcqHgDQMjjnXSH00/t8XR7/zKU3hXvwbYqr4GqdqdF51J3wLI6dEtsfujNkB9cYrfVI4G5C+auULUv04Leve+T1PwMWLLNqNiL54OwyGFLfa/yOnE4EYO8Q0TUBwD6re6tN7OAaywNguB6sGA3CNY4sn8Uiv34jyRvmA2rtGblHRBfEjgMhFVwVEPN6C0wJL6GepPcATAdwaiuY26rFDq55spmuXcxCVaOIILaMrP1JjSz52i2vmYt4HNRBkzMh5veBs8sFQbH9uTxTIquoJ+ldHAtrhdX91XZ2cK2RDmCoHigfRUR3Z/XwzrCy/hdJ2bLEoWxbBr3yqNX9/Q+akAZH/tkQ2/UNi+36adSdAKZEFlNP0jQcG0cce03XYnZwrScB6MI0tTcLVQ0iTldH3V+uKZsWUaXwN5d6cDvUgzuglx0wZycBQsGlZ4NvmAuhUWtdyOkQFLLbgXqTOSaHllFvvRk4NhRxE+y7wpaxgxt7KIDWAHrpoaru0NQ2EMRswgkurXR/WD2wjUSLNrr1sgNUD5Tjjz/sj3+Gj7OKCyEA5QDKgbp8oN7kY398KaD1GoBLbqBy6dkRoWGuxqU1drGoXAFV2UKc3mVEcKzDsdFMmwBopr4TthOygxs/fABaAGjBNLUViwSbQFfTwJACSpPACQlEcHhAeQeYpoNQAhBCKCWMMQbGGJjOoCphpkYroatlAIrBi7uo5NkN4ACAbTi2z06dW8Mp3tjBrX0cOHbU1nHsCKnDfm5a69jBtdniUF1cV9lmi3t2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0N2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0N2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0N2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjhkB9dmi0N2cG22OGQH12aLQ3ZwbbY4ZAfXZotDdnBttjj0f/xTDrVKkjmkAAAAAElFTkSuQmCC\n", "text/plain": [ - "Int64Index([16], dtype='int64')" + "
" ] }, - "execution_count": 89, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "a & b" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "metadata": {}, - "outputs": [], - "source": [ - "v600e = civic.get_variant_by_id(12)" + "fig, ax = plt.subplots()\n", + "\n", + "size = 0.3\n", + "var_vals = [exact_ct, pmsv_xs, rmdr]\n", + "tum_vals = [exct_tum_ct, perm_tum_xs, no_match_ct, no_variant_ct]\n", + "\n", + "cmap = plt.get_cmap(\"tab20c\")\n", + "outer_colors = cmap(np.arange(4)*4)\n", + "inner_colors = cmap(np.arange(3)*4)\n", + "\n", + "ax.pie(tum_vals, radius=1, colors=outer_colors,\n", + " wedgeprops=dict(width=size, edgecolor='w'))\n", + "\n", + "ax.pie(var_vals, radius=1-size, colors=inner_colors,\n", + " wedgeprops=dict(width=size, edgecolor='w'))\n", + "\n", + "ax.set(aspect=\"equal\")\n", + "# plt.show()\n", + "plt.savefig('data/donut.svg')" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "missed_results = list()\n", - "with open('data/genie_5.0_sorted.txt', 'r') as f:\n", - " reader = csv.DictReader(f, delimiter='\\t')\n", - " for r in reader:\n", - " v = r.values()\n", - " if v not in exact_results:\n", - " missed_results.append(v)" + "x_late = np.array(sorted(timings)[6:])\n", + "x_xtd = np.array(sorted(timings)[4:])\n", + "y_late = [timings[x] for x in x_late]\n", + "b, m = polyfit(x_late, y_late, 1)" ] }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 20, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8952533220529592 ms/variant\n" + ] + }, { "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEACAYAAAC6d6FnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAHLhJREFUeJzt3Xt4VdW97vHvjxDI4paogJoAAqJBBAWMeMFaPNWCty3FWsE7onip7rZ7P2zluC/dp+2BHh7t1mpVLIi6FUstTWUXTW2ReqkIwWgBIRpBIStqwiVByAq5jfNHAsaYQJJ1mXOt+X6eJw+skZm1foNFxrvmHHOOac45REQkeLp5XYCIiHhDASAiElAKABGRgFIAiIgElAJARCSgFAAiIgGlABARCSgFgIhIQCkAREQCSgEgIhJQ3b0u4HD69+/vhg4d6nUZIiJJY/369TudcwM6sq2vA2Do0KEUFhZ6XYaISNIws086uq0vDwGZ2eVmtrCqqsrrUkREUpYvA8A5t8I5NzszM9PrUkREUpYvA0BEROJPASAiElC+DADNAYiIxJ8vA0BzACISRPlFYSbOX8Wwe//IxPmryC8Kx/X1fH0aqIhIUOQXhZm7fAORugYAwpUR5i7fAMDUcTlxeU1f7gGIiATNgoLiQ4P/QZG6BhYUFMftNRUAIiI+EK6MtNle1k57LPgyADQJLCJBUl1bT8/ubQ/H2VmhuL2uLwNAk8AiEhRV1XVc9+u3qW1oJD3NvvK9UHoacybnxu21NQksIuKR8r013LB4LVsr9vPotWdQ03zMv6wyQnZWiDmTc+M2AQwKABERT2zfVc11i95m574DPDnzTCaO6A/E74yftigAREQSrPizL7h+UdNhn+duPZuxg7M8qUMBICISZ/lF4UOHdvr36cn+2nr6ZnRn2W3ncPKxfT2ry5eTwDoLSERSxcELvMKVERxQse8AkdoGZp8/3NPBH3waADoLSERSRVsXeDlg8Rsfe1JPS74MABGRVNHehVzxvMCroxQAIiJx1Dej7anWeF7g1VEKABGROHDOMW/lZvbW1JNmib3Aq6MUACIiMVbf0Mi9v9vA469t5fqzT2DBVaeRkxXCgJysEPOmjUno+f7t0WmgIiIxdKC+gR8sfZeXN33GP/6vEfzoopMxM6aNH+R1aV+jABARiZF9B+q57ZlC3izZxb9fNoqbzxvmdUmHpQAQEYmB3ftrmfnkWjaW7eWB753uy0/8rfkyAMzscuDyESNGeF2KiMgRfVoV4fpFa9m+u5rHrzuDC0cd63VJHeLLSWBdCCYiyWJrxT6+++hbfF5Vw9M3T0iawR98ugcgIpIMNoaruHHxWgCWzj6b0TnJ9aFVASAi0gVrtu7ilqcKyQyl88ysCQwf0MfrkjpNASAi0kl/fv9zvv/cOww+uhfPzJrA8ZneX9XbFQoAEZFO+N36Uv7ld39ndHY/npw5gaN79/C6pC5TAIiIdNDiN7bxf/7nfSaOOIbHr8+jT8/kHkKTu3oRkQRwzvGLVz7goVUlTDn1OB6cMZae3dO8LitqCgARkcNobHT8x4ubeGbNJ1ydN5iffWc03dN8eQZ9pykARETaUVvfyD//9j1WvFfGbecP596LR2KtVvZMZgoAEZE2RGobuOPZ9awuruCeKSO5Y9KJXpcUcwkLADObClwK9AMWOef+lKjXFhHpjKrqOm5+ah1F2/cwb9oYZkwY4nVJcRHVgSwzW2xm5Wa2sVX7FDMrNrMSM7sXwDmX75y7FbgduDqa1xURiZfyvTVcvfAtNpRW8fA141N28Ifo1wJaAkxp2WBmacAjwMXAKGCGmY1qscm/Nn9fRMRXtu+q5ruPvcX23dUsvulMLhlzvNclxVVUh4Ccc6+Z2dBWzROAEufcVgAzex64wsw2A/OBl5xz70TzuiIisZBfFGZBQTFllREG9O1JTV0D3boZz95yFuOGHOV1eXEXjzmAHGBHi8elwFnA3cCFQKaZjXDOPdbWD5vZbGA2wJAhqbvrJSLeyi8KM3f5BiJ1DQCUf3EAgHum5AZi8IcELgftnHvIOXeGc+729gb/5u0WOufynHN5AwYMSFR5IhIwCwqKDw3+Lf33mu0eVOONeARAGBjc4vGg5rYOM7PLzWxhVVVVTAsTETmorDLSqfZUFI8AWAecZGbDzKwHMB14sTNPoBvCiEi89ctIb7M9Oys5V/bsimhPA10KvAXkmlmpmc1yztUDdwEFwGZgmXNuU/SliohEzznH/Je2UFVTR7dWF/WG0tOYMznXm8I8EO1ZQDPaaV8JrOzq8+qewCISD/UNjcxdvoHfri/l2rOGcMaQo7j/lQ8oq4yQnRVizuRcpo7L8brMhDHnnNc1tCsvL88VFhZ6XYaIpIBIbQN3L32HP28u5wffOokfXnhSSq3rc5CZrXfO5XVkW60FJCIpr6q6jllPrWP99j385IpTuf6coV6X5Au+DAAdAhKRWPmsqoYbF69l6859/HLGOC47LdvrknzDl4ta6ywgEYmFjyr2ceWjf6N0TzVLZk7Q4N+KL/cARESi9d6OSmYuWYcBz88+hzGD9IGyNV/uAehCMBGJxusfVjDjiTX06pHGC3ecq8G/Hb4MAB0CEpGu+sO7YW5eso4hR/di+R3nMqx/b69L8i0dAhKRlPHkm9v4zxXvM2HY0TxxQx6Zobav9pUmCgARSXrOOe7/0wc8/GoJ3x51LA/NGEdGeprXZfmeLwNAp4GKSEfVNzTyb3/YyNK1O5h+5mB+OnU03dN8eXTbd3z5r6Q5ABHpiJq6Bu589h2Wrt3BXReMYN60MRr8O8GXewAiIkdSFanj1qcLWbttNz++fBQ3TRzmdUlJRwEgIkmnfG8NNyxey0cV+3hw+liuGBucBdxiSQEgIkll28793LD4bXbtq2XRjWdy/sm6c2BX+TIANAksIm3ZGK7ixsVrccDSW8/m9MFZXpeU1Hw5W6JJYBFp7W8lO5m+cA0Z6Wn89vZzNPjHgC/3AEREWlq54VN++Py7DOvfm6dunsBxmRlel5QSFAAi4mvPrPmEf//DRs4YchSLbjyTzF66ujdWFAAi4hv5RWEWFBRTVhnh+MwMxgzKpGDT51x4ykB+OWM8oR66ujeWFAAi4gv5RWHmLt9ApK4BgLKqGsqqapgw7Ggeu+4MXeAVB778F9Vy0CLBs6Cg+NDg31Lp7moN/nHiy39VnQUkEjxllZE22z+tqklwJcHhywAQkeAZ2Ldnm+3ZWaEEVxIcCgAR8dz6T/awr7b+a+2h9DTmTM71oKJgUACIiKde2vAp1zyxhv59enLfJaeQkxXCgJysEPOmjWHqOK3zEy86C0hEPOGc49evb+P/vrSZcYOzeOKGPI7p05Nbzx/udWmBoQAQkYSrb2jkP1e8zzNrPuHSMcdz//dO1x28PKAAEJGE2n+gnruXFrFqSzm3nT+ce6aMpFs387qsQFIAiEjCfL63hpuXrGPzp3v5ydTRXH/2CV6XFGi+DAAtBy2SerZ8tpebn1xHZaSORTeeyQUjB3pdUuD58iwgXQgmklre+HAnVz36FvWNjmW3naPB3yd8uQcgIqlj2bod/O/fb2DEwD4svulMXdjlIwoAEYkL5xwPvPIBv1xVwjdO6s+vrh1P3wwt5ewnCgARibkD9Q3c88LfyX+3jKvzBvPT74wmXQu6+Y4CQERiqrK6ltnPrGfttt3MmZzLnZNOxEynefqRAkBEYmb7rmpuWrKW0t0RHpw+livGahkHP1MAiEhMFG3fwy1PFVLf6Hhm1gTOGn6M1yXJESgARCRqL2/8jB88X8Sx/TJ4cuaZnDigj9clSQcoAESky5xzLHpjGz9buZnTB2Xx6xvz6N+n7XX9xX8UACLSJQ2Njp/8z/ss+dvHTDn1OP5r+lgt6JZkEhYAZjYcuA/IdM59N1GvKyKxkV8UZkFBMWWVEY7LzODo3j3YVLaXW78xjLkXn6IF3ZJQVCfmmtliMys3s42t2qeYWbGZlZjZvQDOua3OuVnRvJ6IeCO/KMzc5RsIV0ZwNN2nd1PZXq4cn8N9l47S4J+kor0yYwkwpWWDmaUBjwAXA6OAGWY2KsrXEREPLSgoJlLX8LX2NVt3e1CNxEpUAeCcew1o/T9gAlDS/Im/FngeuCKa1xERb5VVRjrVLskhHtdm5wA7WjwuBXLM7BgzewwYZ2Zz2/thM5ttZoVmVlhRURGH8kSkMw7UN9CrR9uTu1rYLbklbBLYObcLuL0D2y0EFgLk5eW5eNclIu3bvqua7z/3DvtrG+jezahv/PJXMpSexpzJuR5WJ9GKRwCEgcEtHg9qbusw3RBGxHsvb/yMOS+8B8Dj159BpLbh0FlA2Vkh5kzOZeo4LfWQzOIRAOuAk8xsGE0D/3Tgms48gXNuBbAiLy/v1jjUJyKHUVvfyPyXtrD4zW2cNiiTR64Zz+CjewFowE8xUQWAmS0FJgH9zawU+A/n3CIzuwsoANKAxc65TVFXKiJxV7qnmrueK+LdHZXcdO5Q5l4ykp7ddXFXqooqAJxzM9ppXwms7Orz6hCQSOL9ZfPn/NOy92hsdPzq2vFcMuZ4r0uSOPPlHRp0T2CRxKlraGTeys3MeqqQnKwQK+4+T4N/QPhyLSDtAYgkxqdVEe5+rojCT/Zw7VlD+LfLRmk9nwDRHoBIQK0uLufSh95g86d7eXD6WH72nTEa/APGl3sAIhI/9Q2N/OLPH/DIqx8x8ri+PHLteK3fH1AKAJEA+XxvDf+4tIi3t+3m6rzB/PgfTiXUzlW+kvp8GQCaAxCJvTdLdvKD54vYf6CB+686nSvPGOR1SeIxzQGIpLiGRscvXvmA6xa9TVavHrx410QN/gL4dA9ARGKj4osD/PA3RbxZsotp43L46XdG06uHfu2lif4niKSoNVt3cffSIvZG6vj5lWP4Xt5gzHTjFvmSLwNAcwAiXdfY6Hj0rx9x/5+KGXpMb56+eQKnHN/P67LEhzQHIJJCdu+vZeaSdSwoKObS07J58e7zNPhLu3y5ByAinVf48W7ueq6I3ftr+enU0Vx71hAd8pHDUgCIJLnGRscTr2/l/xUUM+ioEMvvPJfROdp7liNTAIgkscrqWv552Xv8ZUs5l4w5jvlXnka/jHSvy5Ik4csA0CSwyOE1NDp+t76U+18pZvf+Wn58+ShuPHeoDvlIp/gyAHRHMJG2Oef4y+Zyfv7yFj4s38fYwVk8cUMepw3K8ro0SUK+PAtIRL7une17uPrxNdzydCH1jY6Z5w6lfG8NVzz8JhPnryK/qFO33hbx5x6AiHzpo4p9LHi5mJc3fUb/Pj35ydTR9EpP41/zNxKpawAgXBlh7vINgO7bKx2nABDxqfK9NfzXXz7kN+t2kNG9Gz+68GRu+cYwevfszsT5qw4N/gdF6hpYUFCsAJAOUwCI+MwXNXU88dpWnnh9G3UNjVx31hDu/tZJ9O/T89A2ZZWRNn+2vXaRtvgyAHQWkARRbX0jz739Cb9cVcKu/bVcetrxzPl2LkP79/7attlZIcJtDPbZWaFElCopwpeTwFoKQoKksdGx4r0yLvrFX/nxivc5+di+/OH7E3nkmvFtDv4AcybnEmp1+8ZQehpzJucmomRJEb7cAxAJir+V7GTeS1vYEK5i5HF9eXLmmUw6ecARz+c/eJx/QUExZZURsrNCzJmcq+P/0ikKABEPvF+2l/kvb+G1DyrIyQpx/1WnM3VcDmndOn4h19RxORrwJSoKAJEEKt1TzQN/+oDfvxumX0Y6911yCtefcwIZ6bovrySeAkAkAfbsr+WRV0t4+q1PwGD2+cO585sjyOyldXvEOwoAkTiqqWvgyTc/5lerS9h/oJ4rxw/iRxedrLN1xBcUACJxUNfQyO/fCfPAKx/w2d4avjVyIP8yZSS5x/X1ujSRQxQAIjFS8cUBVheXs7q4gtc+rOCLmnrGDs7iweljOWv4MV6XJ/I1vgwAXQgmyaCx0fH3cBWrtpSzuricv5dWATCwb08uGX08k0cfywW5A7VEs/iWOee8rqFdeXl5rrCw0OsyRA6pqq7jtQ8reLW4nL8WV7Brfy3dDMYNOYoLcgcwKXcgp2b306AvnjGz9c65vI5s68s9ABG/cM6x5bMveLW4nFe3lPPO9koaGh1H9UrnmycP4IKRAzn/pAEc1buH16WKdJoCQKSV/QfqebNkJ68WV7C6uJxPq2oAODW7H3dOOpFJuQMZOzirUxdtifiRAkAE2Fqx79CA//bW3dQ2NNKnZ3fOG9GfH104kG/mDuDYfhlelykSUwoACaSaugbe3rabV5sncD/eVQ3AiIF9uPHcE7hg5EDyTjiaHt19uV6iSEwoACQwwpWRQwP+myW7iNQ10LN7N8498RhmnTeMSbkDGXx0L6/LFEkYBYCkhMZGx879ByirrKGsMkJ4T4RwZYSyyghlVU2P91TXATDoqBBX5Q3igpEDOWf4MVqHRwJLASBJoaauoWkwr6whXFlNuHmgL6tsGug/rayhtqHxKz/Tp2d3crJCZGdlcPqgLIYP6MM3Tx7AiQN66zRNERQA4gPOOXbtrz00oJfuiRz6JH/w0/uu/bVf+RkzOLZvBtlZGZw2KIspozOaBvvMEDlHhcjOCtEvo7snA31+UVjr9EtSUACIp3bsrubCB/7KgfqvfnoPpacdGshPze7X/Em+6SsnK8RxmRmkp/lvgja/KMzc5RsO3bA9XBlh7vINAAoB8Z2EBYCZ9QZ+BdQCq51zzybqtcW/BvTtyQ3nnHBoYD/4Z1av9KQ8TLOgoPjQ4H9QpK6BBQXFCgDxnagCwMwWA5cB5c650S3apwAPAmnAr51z84FpwAvOuRVm9htAASBkpKdx36WjvC4jZsrauFH74dpFvBTtPvQSYErLBjNLAx4BLgZGATPMbBQwCNjRvNlXPyKJpIj21vnX+v/iR1EFgHPuNWB3q+YJQIlzbqtzrhZ4HrgCKKUpBKJ+XRG/mjM5l1Cr00pD6WnMmZzrUUUi7YvHQJzDl5/0oWngzwGWA1ea2aPAivZ+2Mxmm1mhmRVWVFTEoTyR+Jk6Lod508aQkxXCgJysEPOmjdHxf/GlhE0CO+f2AzM7sN1CYCE0LQcd77pEYm3quBwN+JIU4rEHEAYGt3g8qLmtw8zscjNbWFVVFdPCRETkS/EIgHXASWY2zMx6ANOBFzvzBM65Fc652ZmZmXEoT0REIMoAMLOlwFtArpmVmtks51w9cBdQAGwGljnnNkVfqoiIxFJUcwDOuRnttK8EVnb1eXVPYBGR+PPl6Zg6BCQiEn++DABNAouIxJ8vA0B7ACIi8efLABARkfjTctCS8rQ+v0jbfBkAOgtIYkXr84u0z5eHgDQHILFyuPX5RYLOlwEgEitan1+kfQoASWlan1+kfb4MAF0HILGi9flF2ufLANAcgMSK1ucXaZ8vzwISiSWtzy/SNl/uAYiISPxpD0A8pYu0RLzjywDQhWDBoIu0RLzly0NAmgQOBl2kJeItXwaABIMu0hLxlgJAPKOLtES8pQAQz+giLRFv+XISWILh4ESvzgIS8YYvA0BnAQWHLtIS8Y4vDwHpLCDv5BeFmTh/FcPu/SMT568ivyjsdUkiEie+3AOQr0vEBVM6L18kWHy5ByBfdXBgDldGcHw5MMf607nOyxcJFu0BdFEilzA43MAcy9fUefkiwaI9gC5I1CfygxI1MOu8fJFgUQB0QaIPlSRqYNZ5+SLBogDogkQfKknUwKybp4gEi+YAuiA7K0S4jcE+XodKEnnBlM7LFwkOXwaA3y8EmzM59yunS0L8D5VoYBaRWPPlISC/XwimQyUikgp8uQeQDPSJXESSnS/3AEREJP4UACIiAaUAEBEJKAWAiEhAKQBERAJKASAiElAKABGRgFIAiIgEVMICwMyGm9kiM3shUa8pIiLt61AAmNliMys3s42t2qeYWbGZlZjZvYd7DufcVufcrGiKFRGR2OnoUhBLgIeBpw82mFka8AhwEVAKrDOzF4E0YF6rn7/ZOVcedbUiIhIzHQoA59xrZja0VfMEoMQ5txXAzJ4HrnDOzQMui2WRIiISe9HMAeQAO1o8Lm1ua5OZHWNmjwHjzGzuYbabbWaFZlZYUVERRXkiInI4CVsN1Dm3C7i9A9stBBYC5OXluXjXJSISVNHsAYSBwS0eD2pui5qZXW5mC6uqqmLxdCIi0oZoAmAdcJKZDTOzHsB04MVYFOX3G8KIiKSCjp4GuhR4C8g1s1Izm+WcqwfuAgqAzcAy59ym+JUqIiKx1NGzgGa0074SWBnTivD/PYFFRFKBL5eC0CEgEZH482UAaBJYRCT+fBkA2gMQEYk/XwaAiIjEnwJARCSgfBkAmgMQEYk/XwZANHMA+UVhJs5fxbB7/8jE+avIL4rJxckiIiknYWsBJUJ+UZi5yzcQqWsAIFwZYe7yDQBMHdfuOnUiIoHkyz2ArlpQUHxo8D8oUtfAgoJijyoSEfEvXwZAV+cAyiojnWoXEQkyXwZAV+cAsrNCnWoXEQkyXwZAV82ZnEsoPe0rbaH0NOZMzvWoIhER/0qpSeCDE70LCoopq4yQnRVizuRcTQCLiLQhpQIAmkJAA76IyJH58hCQLgQTEYk/XwaAFoMTEYk/XwaAiIjEnwJARCSgFAAiIgGlABARCShfngbafFP47wN7zezDFt/KBKo6+Pf+wM4oS2n5vF3Zpq3vtW473ON49KsjfTrcdh1t78x7BYnpV9Deq/a+15V+Jdt71botSOPFCWZ2uXNuxRErds758gtYeLi2I/0dKIxHDZ3Z5kh9ONLjePSrI3063HYdbe/Me5WofgXtvYplv5LtverI+xPL9ypR/erK79nhvvx8CKit9FrRyb/Ho4bObHOkPhzpcTz61dHnaW+7jrbrvYpetO9Ve9/rSr+S7b1q3ab/g22w5rRIOWZW6JzL87qOWFO/kkcq9glSs1+p2KeO8PMeQLQWel1AnKhfySMV+wSp2a9U7NMRpewegIiIHF4q7wGIiMhhKABERAJKASAiElCBCQAz621mT5nZE2Z2rdf1xIqZDTezRWb2gte1xIqZTW1+n35jZt/2up5YMbNTzOwxM3vBzO7wup5Yaf7dKjSzy7yuJVbMbJKZvd78fk3yup54SeoAMLPFZlZuZhtbtU8xs2IzKzGze5ubpwEvOOduBf4h4cV2Qmf65Zzb6pyb5U2lHdfJPuU3v0+3A1d7UW9HdbJfm51ztwPfAyZ6UW9HdPL3CuAeYFliq+y8TvbLAfuADKA00bUmTLRXv3n5BZwPjAc2tmhLAz4ChgM9gPeAUcBcYGzzNs95XXus+tXi+y94XXcc+nQ/MN7r2mPZL5o+fLwEXON17bHoE3ARMB24CbjM69pj2K9uzd8/FnjW69rj9ZXUewDOudeA3a2aJwAlrumTcS3wPHAFTSk+qHkbX/e7k/1KCp3pkzX5OfCSc+6dRNfaGZ19r5xzLzrnLgZ8exiyk32aBJwNXAPcama+/d3qTL+cc43N398D9ExgmQnly8XgopQD7GjxuBQ4C3gIeNjMLiW2l38nSpv9MrNjgJ8B48xsrnNunifVdU1779XdwIVAppmNcM495kVxUWjvvZpE06HInsBKD+qKRpt9cs7dBWBmNwE7WwycyaK992oaMBnIAh72orBESMUAaJNzbj8w0+s6Ys05t4umY+Upwzn3EE2BnVKcc6uB1R6XERfOuSVe1xBLzrnlwHKv64g33+6uRSEMDG7xeFBzW7JLxX6lYp8gNfuVin2C1O1Xh6RiAKwDTjKzYWbWg6YJqhc9rikWUrFfqdgnSM1+pWKfIHX71SFJHQBmthR4C8g1s1Izm+WcqwfuAgqAzcAy59wmL+vsrFTsVyr2CVKzX6nYJ0jdfkVDi8GJiARUUu8BiIhI1ykAREQCSgEgIhJQCgARkYBSAIiIBJQCQEQkoBQAIiIBpQAQEQkoBYCISED9fz7GvEZyq99JAAAAAElFTkSuQmCC\n", "text/plain": [ - "59437" + "
" ] }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" } ], "source": [ - "len(genie_samples)" + "plt.xscale('log')\n", + "plt.yscale('log')\n", + "plt.scatter(sorted(timings), [timings[x] for x in sorted(timings)])\n", + "plt.plot(x_xtd, b + m * x_xtd, '-')\n", + "plt.savefig('data/scatter.svg')\n", + "print('{} ms/variant'.format(m*1000))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, + "outputs": [], + "source": [ + "match_counts = defaultdict(Counter)\n", + "levels = ['A','B','C','D','E']\n", + "for tumor, matches in exact_tumor.items():\n", + " match_count = len(matches)\n", + " highest_evidence = 4\n", + " for match in matches:\n", + " v = civic.CACHE[match.v_hash]\n", + " for e in v.evidence:\n", + " if levels.index(e.evidence_level) < highest_evidence:\n", + " highest_evidence = levels.index(e.evidence_level)\n", + " match_counts[match_count][levels[highest_evidence]] += 1 " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, "outputs": [ { "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEKCAYAAAAIO8L1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XmcFOWdx/HPz2Y4PFE8Eh1dxkVcMQrieCBoTCIbNByeEbLBaIisyRIjrhvRXLjRxAOj0RgNCQaTmEElURjFmBjvIy5iwANEEVHHGDlUEARmGH77x1MD7UwfNUdN9Qzf9+vVr+l6qrrq1wPTv36Oeh5zd0RERBrbLu0ARESkNClBiIhITkoQIiKSkxKEiIjkpAQhIiI5KUGIiEhOShAiIpKTEoSIiOSkBCEiIjl1STuA1th99929d+/eaYchItKhzJs3b6W771HsuA6dIHr37s2zzz6bdhgiIh2Kmb0R5zg1MYmISE5KECIiklOHTBBmNsLMpq5evTrtUEREOq0O2Qfh7tVAdWVl5blpxyIiba+uro6amho2bNiQdigdWvfu3SkvL6esrKxFr++QCUJEOreamhp22mknevfujZmlHU6H5O6sWrWKmpoaKioqWnSODtnEJCKd24YNG+jVq5eSQyuYGb169WpVLUwJQkRKkpJD67X2d6gEISIiOSlB5LCxPvc63fnKRSR5mUyGAQMGbHlceeWVBY8/6aST+OCDD5qUT548mSlTpiQVZl7Tp09nwoQJiZ3/+OOPb/Mbh9VJnUO3jFFRXdOk/PUR5SlEIyIAPXr0YP78+bGPnzNnToLRbBtUgxCRDutPf/oTZ5xxxpbtRx55hOHDhwNhKp6VK1cCcMUVV9C3b1+GDBnC4sWLtxz/2muvMWzYMA4//HCOPfZYXn75ZQDOPvtszj//fI455hj2339/Zs6cueU1V111FYcccgj9+/dn0qRJBc8Tx5///GcGDRrEwIEDOeOMM1i7dm3B95Xr+KQoQYhIh7B+/fqPNTHdcccdnHDCCTzzzDOsW7cOgDvuuIPRo0d/7HXz5s1jxowZzJ8/nzlz5jB37twt+8aPH8+NN97IvHnzmDJlCt/4xje27HvnnXd44oknuPfee7ckgvvvv59Zs2bxzDPPsGDBAr797W8XPU8hK1eu5PLLL+fBBx/kueeeo7Kykp/85Cd531e+45OiJiYR6RDyNTENGzaM6upqTj/9dO677z6uvvrqj+1//PHHOeWUU9h+++0BGDlyJABr167lqaee+tg39Y0bN255fvLJJ7PddtvRr18/3n33XQAefPBBzjnnnC3n2m233Yqep5C//e1vLFy4kMGDBwNQW1vLoEGD6NKlS8739eijj+Y8PikllSDMbAfgUWCyu9+bdjwiUvpGjx7Nz372M3bbbTcqKyvZaaedYr1u8+bN9OzZM2+/Rrdu3bY8d88/QKXYeQpxd4YOHUpVVVWTfbneV6Hjk5BoE5OZ3Wpmy83sxUblw8xssZktMbNJWbsuBu5MMiYR6Vw+/elP89xzz/HLX/6ySfMSwHHHHcc999zD+vXr+fDDD6murgZg5513pqKigrvuugsIH9YLFiwoeK2hQ4fy61//mo8++giA9957r0XnaXD00Ufz5JNPsmTJEgDWrVvHK6+8kvd9FTo+CUn3QUwHhmUXmFkGuAk4EegHjDGzfmY2FFgILE84JhHpgBr3QTT0C2QyGYYPH87999+/pSM328CBAznzzDPp378/J554IkccccSWfbfffjvTpk2jf//+HHzwwcyaNatgDMOGDWPkyJFUVlYyYMCALcNl455n+vTplJeXb3ls3LiR6dOnM2bMGA499FAGDRq0pYM71/vaY4898h6fBCtUdWqTC5j1Bu51909F24MITUifj7YviQ7dEdiBkDTWA6e4++ZC566srPSkFgzSMFeR9CxatIiDDjoo7TA6hVy/SzOb5+6VxV6bRh/EPsBbWds1wFHuPgHAzM4GVuZLDmY2HhgPsN9++yUbqYjINqxoE5OZTTSzb5pZTzN7LOo3aFqPayPuPr1QB7W7T3X3Snev3GOPokuqiohIC8Xpg7gA6AF8GTgM6A5cXfAVhb0N7Ju1XR6VxaYFg0REkhcnQewF/AMYCPwa+C7QssnFg7nAAWZWYWZdgdHA7OacwN2r3X38Lrvs0oowRESkkDgJYjlwETASmAdsD3wY5+RmVgU8DRxoZjVmNs7dNwETgAeARcCd7v5Sc4JWDUJEJHlxEsQVQG/gdeCPwOHA43FO7u5j3P2T7l7m7uXuPi0qn+Pufd39X939iuYGrRqEiEjyiiYId/+Fu/d09yPc/UN3H+fup7VHcCIibW1jfW0q57vnnnsws0TvW2hrRYe5mlkv4BbgBOAM4D+BR9z9poRjKxTTCGBEnz590gpBRDqobpmuDJ49ts3O9+TI38Y6rqqqiiFDhlBVVcVll13WZtdPUpwmppsJd0PvDGwGlgHnJRhTUWpiEpGOZO3atTzxxBNMmzaNGTNmpB1ObHESxFAge/mlhbRuFJOIyDZl1qxZDBs2jL59+9KrVy/mzZuXdkixxEkQ6whDXQEyhKamVYlFFINGMYlIR1JVVbVlwr3Ro0e322ysrRVnqo0ZwIWAA/dGr7kmyaCKcfdqoLqysvLcNOMQESnmvffe46GHHuKFF17AzKivr8fMuOaaazCztMMrKE4N4hLgMsI9EAui599LMigRkc5i5syZjB07ljfeeINly5bx1ltvUVFRweOPx7pbIFVFaxDuXkdICh2j211EpICN9bWxRx7FPV+3TNe8+6uqqrj44os/VnbaaadRVVXFcccd12ZxJCHOMNfTgCsJN8s11Djc3VNbjU7DXEWkpQp9mCdxvocffrhJ2fnnn9+mMSQl7jDX/YAlhBFMCwlTZKRGw1xFRJIXpxawCvihu9+YdDAiIlI68iYIMzs1evpX4Otm9hHwflTm7n530sFJUxvrnW6ZpiMf8pWLiLRUoRrETMLQVgADpmY9d8I9EanYlvsgumVMy6GKSLsolCD+l60JoqToPggRkeTlTRDuPrkd4xARkRITZ03qR8zsJ1nb15lZ03FbIiIdwMb6tm0YiXO+TCbDgAED6N+/PwMHDuSpp55q0xiSEmcU05HAbVnbzxOm/BYR6XDy9eO1VJz+vx49ejB//nwAHnjgAS655BIeffTRNoshKXGXHD3VzLY3sx2A06OybU5dXX2L9omINFizZg277rpr2mHEEqcGUQVcDKyJto1wZ3Vq0hrFVFaWYdzEWTn3TbtuVLvGIiIdx/r16xkwYAAbNmzgnXfe4aGHHko7pFjiJIjvA+uB4dF2NfDjxCKKQaOYRKQjyW5ievrppznrrLN48cUXO81srjPd/Uh3b+iP+GKyYYmIdE6DBg1i5cqVrFixIu1QioqTICYDB2dtDwHabipEEZFtyMsvv0x9fT29evVKO5SiCk218RXgK9HmD8zs69HzPoRV5kREOpyN9d6mMw/EmeamoQ8CwN257bbbyGRSm4witkJ9EL2B4wl3U/eLHgCbgasTjUpEJCFtPWdZnPPV13fMUY6FmpiuBvYE3gT+A9gD2B3o7u6XtkNsIiKSokJTbawnjF6qMLOuhBpFdwAzw92fb5cIRUQkFXFWlBsF/AbYsdGu0m9AExGRFosziulHQA3hBrk5wGrgjiSDKsbMRpjZ1NWrV6cZhohIpxYnQewP/JLQWX0D8D0g1cUHtOSoiEjy4txJvR74EKgDLgJ2AAYkGVR72Fhf2+aLl4uIdCZxEsSDwG7ADOCsqGxGYhG1k26ZrgyePTbnvidH6j5Akc6qrq6esrK260KNe75//vOfXHDBBcydO5eePXuy1157cf3119O3b982i6WtFU0Q7v5FADPbjjBxH8BfkgxKRCQphSbdbIk4E3W6O6eccgpf+cpXmDEjfL9esGAB7777bsdMEGZ2Vr59hPsi9DVbRCSGhx9+mLKyMs4777wtZf37908xongK1SCms3VN6oZbBT167ihBiIjE8uKLL3L44YenHUazFWtiMmAJoc9BY0pFRLYhhYa5Hke4Qe6TwH8DA4EF7n6tu1/bHsGJiHQGBx98MPPmzUs7jGbLmyDc/Ql3P5uQIL4DnAo8YGb/3U6xiYh0Cp/97GfZuHEjU6dO3VL2/PPP8/jjj6cYVXEFm5jM7Fjgq8AZhFlcf0+4m7rNmdlBwLcIEwL+1d1vTuI6IrJtq6urb9MlguMMczUz7r77bi644AKuuuoqunfvTu/evbn++uvbLI4kFBrFtJiw9sNSwnQbVUT9EGa2m7u/V+zkZnYrYanS5e7+qazyYcBPCfM5/crdr3T3RcB50XDa3wBKECLS5tryHojmnG/vvffmzjvvbNNrJ61QH8QBhE7q/YEfEjqrV0SP5THPPx0Yll1gZhngJuBEwhoTY8ysX7RvJHAfCdVSREQkvkJNTI+xdZhri7j7Y2bWu1HxkcASd18KYGYzgFHAQnefDcw2s/sIzVkdhm+qxbo0nbojX7mISKkrtB7E8Qldcx/graztGuAoMzue0BHejQI1CDMbD4wH2G+//RIKsfmsS1fWXve5JuU7TvxrCtGIiLRenLmY2oW7PwI8EuO4qcBUgMrKylbVcEREJL840323tbeBfbO2y6Oy2LQehIhI8tJIEHOBA8ysYSnT0cDs5pxA60GIiCQvb4Iws90KPeKc3MyqgKeBA82sxszGufsmYALwALAIuNPdX2pO0KpBiEhL+abadj9fJpNhwIABHHzwwfTv359rr72WzZs3t2kcSSjUB7GS/KOYvMhrw0HuY/KUz6EVQ1ndvRqorqysPLel5xCRbVO+ASUtFWcgSo8ePZg/fz4Ay5cv50tf+hJr1qzhsssua7M4kpDoMFdpGa12J9J57bnnnkydOpUjjjiCyZMnY2bFX5SSNIa5tpqZjQBG9OnTJ+1QEqHV7kQ6t/3335/6+nqWL1/OXnvtlXY4eRXtpDazHmZ2jZn93cwGm9kNZnZGewSXjzqpRUSSF2cU0/XAROBQwk1sGeDbSQYlItKZLV26lEwmw5577pl2KAXFSRCnAtdkbc8DDkwmnHg0iqn0bKzP311VaJ/ItmbFihWcd955TJgwoaT7HyDendSb2brkKEB/YG0y4cSjUUylp1vGqKiuybnv9RHl7RyNSH6+qbZNp8CJM9/a+vXrGTBgAHV1dXTp0oWxY8dy4YUXtlkMSYmTIO4DGt7Jb4FPAL9KLCIRkQS19eSZcc5XX1/fptdsL3ESxAWEGsQXgDLgNuCiJIMSEZH0xUkQnwXGu3td0sHE1dmHuYqIlII4ndR/BP5pZjeb2eCkA4pDw1xFRJIXJ0FMAJ4HvgY8ZmZLzeyHyYYlIiJpK5og3P3n7v4ZYG/COtH/AlyadGAiIpKuon0QZjYAOB04DegbFT+RZFAiIpK+OJ3Uz0U/XyLUHH7v7m8VOD5x6qQWkZbaXFvLdl3bbqhrnPNlMhkOOeSQLdujR49m0qRJbRZDUuIkiGuA2939+aSDiUs3yolIS23XtSuvn5R7MsyWqJhTfALN7Om+O5I4fRAXA/ua2fVm1s/Mvmxm/dshNhERSVGc2Vy/BVQD3yTcRd14biYpAXV1+e/ULLRPRJLXMNVGw+OOO+5IO6RY4jQxTQTuInRUAzwI/G9iEUmLlJVlGDdxVs59064b1c7RiEi2TtvEBOwKLMja3p4w5beIiHRicWoQzwBfj55fBAwBnkwsohg0iklEJHlxEsT5wGzChH3DgFcJE/ilRqOYRKSlNtfWxhp51JzzFRvm2tAH0WDYsGFceeWVbRZDUoomCHd/2cwOYusiQW+gJiYR6aDa8h6IuOfrqNN9x+mDwN3r3X2huy8Evgu8l2xYIiKStlgJIofSXidPRERaraUJQkQkUe5ay7y1Wvs7zNsHYWaz8+z6t1ZdUUSkiO7du7Nq1Sp69eqFmRosWsLdWbVqFd27d2/xOQp1Ug8vdO0WX1FEpIjy8nJqampYsWJF2qF0aN27d6e8vLzFry+UICpafNaE6T4Ikc6trKyMioqS/QjaZuRNEO7+RnsG0hy6D0JEJHnqpBYRkZyUIEREJCclCBERySnOehC9zOwuM3vfzE6Ink9oj+BERCQ9cWoQNxMm6dsZ2AwsA/4zwZhERKQExEkQQ4EpWdsLKeEhsJKsjfW1aYcgIu0kznTf64C9oucZ4ARgVWIRSUnrlunK4NlNF3x/cmTbTZ8sIqUhToKYAVxIuHv63ug1WpNaRKSTi5MgLgE+BL4Qbd8L/DiJYMzs5Og6OwPT3P3PSVxHRESKi9MHkQGmuPuR7n4kcC3NWDDIzG41s+Vm9mKj8mFmttjMlpjZJAB3v8fdzwXOA86M/zZERKStxUkQc4HLs7Yvj8rimk4YBbWFmWWAm4ATgX7AGDPrl3XId6P9IiKSkjgJog/wfNb2C8C/xr2Auz9G0xXojgSWuPtSd68l9HOMsuAq4H53fy7uNUREpO3F6YOoAb5mZk8TVpI7F3i7ldfdB3ir0TWOAr5JGCW1i5n1cfdbGr/QzMYD4wH222+/VoYhIiL5xEkQtxBGLb0UbRtwURLBuPsNwA1FjpkKTAWorKzUuhQiIgkpmiDc/VozW8HWBYSq3b21g97fBvbN2i6nGbUSrQchIpK8uJP1/R6YSKg5PGpmrW3bmQscYGYVZtYVGA3kW+K0CXevdvfxu+yySyvDEBGRfOJM1nc+sBp4E3g9eiyNewEzqwKeBg40sxozG+fum4AJwAPAIuBOd3+p0HlERKR9xemDmAxsAB4DNjX3Au4+Jk/5HGBOc88HamKS5tlY73TL5F74vtA+kW1dnATxJvALd7856WDi0pKj0hzdMkZFdU3Ofa+PaPmC7iKdXZwEsRD4npntDbwflbm7X5dcWCIikrY4CWJ09PM7WWUOpJYg1MQkIpK8OAninMSjaCY1MYmIJC/OfRC3RUNR+wGvu/vq5MMSEZG0xRnmehjwGvAscISZLTKzXyYeWeGYRpjZ1NWrlatERJIS50a5nxFWlTPCmtS/I8yXlBrdKCcikrw4CaI/YcruBv8A9kwkGhERKRlxEkQN8Ono+aGExXyWJRVQHGpiEhFJXpwEcTXweUIT07XAEcBVSQZVjJqYRESSF2cU061mthQ4KSq6z90fTTaszmNzbS3bde3a7H0iImnLmyAazdi6lNBZvWWfu7+ZZGCdxXZdu/L6SWNz7quY09pZ00VEklOoBrGMcMd0Ll7ktSIi0sEV+pCfQ0gEOwPHEKbsNuBo4G/Jh5afptoQEUle3k5qdx/u7iMI90BMcvfj3P1Y4BJgbXsFmCc2dVKLiCQsTjPREGCdmfUmJJRBhFqEiIh0YnESxCzgP4BTo20D1LsqItLJxUkQ4wjzMB1P6JN4GPhFgjGJiEgJiHMfRC3w0+ghIiLbiLyd1Ga2xsxOiX42fqQ6x4Wm2uhY6urqm1UuIqWhUA1iFVAHvEf++yFSoQWDmsc31WJdmt6xna+8rZWVZRg3cVaT8mnXjUr82iLScnkThLtXAJjZ/e6ur3odmHXpytrrPtekfMeJf00hGhHpKOJM1veOmd1oZkclHo2IiJSMOAliGfBfwFNm9qqZTTYz3cIsItLJFU0Q7n4k0Bv4H2AF8F3g5WTDEhGRtMWpQQCUAV2BbtFrLLGIRESkJBS9D8LM5gOHEJLCYuD7wO0JxyUiIimLcyf1XoSb5G5393kJxxOLZnMVEUlenCamfdz9wobkYGYZM9s54bgK0myuIiLJK3Qn9XtmNgrY0cweMrPDol2nA++3S3QiIpKaQjWInoRO6TLCRH27tkdAIiJSGoo1MXme5yIi0skV66S+GPgqITlcYWYrgb0Tj0qkBTbW19Itk/zcUiLbimIJYmDW8+xV5FSbkJLTLdOVwbPHNil/cqTWtxJpiUIJoqLdohARkZJTaDbXN9ozEBERKS1xp9oQEZFtjBKEiIjkVDRBmNlxZrZ71nY3M2vzkUxmtr+ZTTOzmW19bhERab44NYiHgc9kbZ8MvBXn5GZ2q5ktN7MXG5UPM7PFZrbEzCYBuPtSdx8XN3AREUlW3k5qMzuOcAe1AWeY2UHRruMIa1XHMR34GfCbrPNmgJuAoUANMNfMZrv7wuYGL9LRbax3umVyz55faJ9Ieyg0zPUzwA8I9zycHj0aPBjn5O7+mJn1blR8JLDE3ZcCmNkMYBSgBCHbnG4Zo6K6Jue+10eUt3M0Ih9XqInpTuCLhBrE9cAZhCTxOeALrbjmPny8iaoG2MfMepnZLcBhZnZJvheb2Xgze9bMnl2xYkUrwhARkUIK3QexCFhkZhXAcndfb2YHAHu5e9wmptjcfRVwXozjpgJTASorK3VHt4hIQuJ0Uv8G+IWZHQ4sAh41s5+24ppvA/tmbZdHZbGZ2Qgzm7p69epWhCEiIoXESRCHAE8AI4EXgbuBMa245lzgADOrMLOuwGhgdnNOoAWDRESSFydBdCesCVEJ3APMAnaKc3IzqwKeBg40sxozG+fum4AJwAOEGsmd7v5Sc4JWDUJEJHlx1qR+Abghej4FOIbQsVyUu+esabj7HGBOnHPkeX01UF1ZWXluS88hIiKFxUkQY4FvAK+6+8Nm1he4ItmwREQkbUWbmNz9FeDHwNtm9kngfuAPSQdWiJqYRESSF2cuphOAV4GZwEHRz1sSjqsgdVKLiCQvTif1FEJncsM9/3cBn04sIhERKQlxEkQf4I9Z2+8DPZMJJx41MUlbqaurb1a5yLYkTif1EsJcSRAm2DsVWJxYRDFoFJO0lbKyDOMmzmpSPu26UTmOFtm2xEkQ3yX0OxhwMVALnJJkUCIikr6iCcLd7zWzQwi1B4C/uPuryYYlIiJpi7vk6J7AamAtMMjMzkoupOLUByEikryiNQgzu50wX9KWIsIaEb/J/YrkqQ9CRCR5cfoghgPzCDfHbUo2HBERKRVxEsQjwFPuflXCsYiISAkptCZ1wxTcuwCXm9lwwj0QAO7uqY0DNLMRwIg+ffqkFYKISKdXqAYxvNH24Kznqa7kpj4IEZHkFUoQFe0WhYiIlJxCa1K/AWBm++XY/UFiEYmISEmI00m9jBxNSmb2LHCmuy9r45hERKQExLlR7nFCgng6ejjwLNAfuDa50EREJE1xEsQ64BJ3H+LuQ4BLgeXA+cCxSQaXj+6kFhFJXpwEMQQ42swqzKwCOCoqWwLsmGRw+WjBIBGR5MXpg7gH+DJwclbZ74BDgFeSCEpERNIXJ0F8jdDn8BlC/8PDwC+AMmBacqGJiEia4kz3XQvcED2y1SYSkYiIlIS8fRBmtsbMTol+Nn6od1hEpJMrVINYBdQB75Hy1BoiItL+Ct1J3TDVxr3tFIuIiJSQQrO5Flo1zt39twnEIyIiJaJQE9N0cjctNawol1qC0HTfIm1nY30t3TJdm5RvqN9M90zTbsqN9U63jLVHaJKyQgni24REsAMwGZgKvNoOMRWl6b5F2k63TFcGzx7bpPzJkb+lorqmSfnrI8rbIywpAYX6IKYAmFkvQoK4090faqe4REQkZXGm2migkUwiItuQOEuOlhGSwxVmtjIqS3XJURERSV5zlhw9Ouu5ahMiIp2clhwVEZGcii45KiIi26bmdFKLiMg2RAlCRERyirMeRLswsx2AnxOmEX/E3W9POSQRkW1aojUIM7vVzJab2YuNyoeZ2WIzW2Jmk6LiU4GZ7n4uMDLJuEREpLikm5imA8OyC8wsA9wEnAj0A8aYWT+gHHgrOqw+4bhERKSIRBOEuz9GWE8i25HAEndfGq1WNwMYBdQQkkTicYmISHFpfBDvw9aaAoTEsA/wR+A0M7sZqM73YjMbb2bPmtmzK1asSDbSTm5zbe5VY/OVS8ttrG/+77SuLndFOl95Z1Pod7ahfnOe16R/D2++GEohtuYqmU5qd18HnBPjuKmEmWWprKzseL/xErJd1668flLTWTwr5mipj7ZWaMbUfMrKMoybOKtJ+bTrto1ZbvL9zqC0Z5rtlrGSja250qhBvA3sm7VdHpXFZmYjzGzq6tVaGltEJClpJIi5wAFmVmFmXYHRwOwir/kYd6929/G77LJLIgGKiEjyw1yrgKeBA82sxszGufsmYALwALCIsM7ES0nGISIizZdoH4S7j8lTPgeY09LzaslREZHkdcjhpGpiEhFJXodMEOqkFhFJXodMEKpBiIgkr0MmCBERSZ65d9x7zcxsBZDWwka7AyuLHpUOxdYyiq1lFFvzpR3Xv7j7HsUO6tAJIk1m9qy7V6YdRy6KrWUUW8sotuYr1bgaUxOTiIjkpAQhIiI5KUG03NS0AyhAsbWMYmsZxdZ8pRrXx6gPQkREclINQkREclKCaKZ862yXAjPb18weNrOFZvaSmX0r7ZgamFl3M/s/M1sQxXZZ2jFlM7OMmf3dzO5NO5ZsZrbMzF4ws/lm9mza8WQzs55mNtPMXjazRWY2KO2YAMzswOj31fBYY2YXpB1XAzObGP0NvGhmVWbWPe2Y8lETUzOZ2XHAWuA37v6ptOPJZmafBD7p7s+Z2U7APOBkd1+YcmiYmQE7uPtaMysDngC+5e5/Szk0AMzsQqAS2Nndh6cdTwMzWwZUunvJjeU3s9uAx939V9HU/du7+wdpx5XNzDKE9WaOcve07pnKjmcfwv/9fu6+3szuBOa4+/R0I8tNNYhmyrPOdklw93fc/bno+YeE6dT3STeqwIO10WZZ9CiJbydmVg58AfhV2rF0FGa2C3AcMA3A3WtLLTlEPge8VgrJIUsXoIeZdQG2B/6Rcjx5KUF0UmbWGzgMeCbdSLaKmnHmA8uBv7h7qcR2PfBtIPdCx+ly4M9mNs/MxqcdTJYKYAXw66hp7ldmtkPaQeUwGqhKO4gG7v42MAV4E3gHWO3uf043qvyUIDohM9sR+ANwgbuvSTueBu5e7+4DCMvMHmlmqTfRmdlwYLm7z0s7ljyGuPtA4ETgv6ImzlLQBRgI3OzuhwHrgEnphvRxUbPXSOCutGNpYGa7AqMICXZvYAcz+3K6UeWnBNHJRO37fwBud/c/ph1PLlFTxMPAsLRjAQYDI6OeqPOSAAAHC0lEQVS2/hnAZ83sd+mGtFX0jRN3Xw7cDRyZbkRb1AA1WbXAmYSEUUpOBJ5z93fTDiTLCcDr7r7C3euAPwLHpBxTXkoQnUjUETwNWOTuP0k7nmxmtoeZ9Yye9wCGAi+nGxW4+yXuXu7uvQnNEQ+5e0l8ozOzHaLBBkTNN/8OlMToOXf/J/CWmR0YFX0OSH0wRCNjKKHmpcibwNFmtn309/o5Ql9hSVKCaKZc62ynHVOWwcBYwrfghiF+J6UdVOSTwMNm9jwwl9AHUVJDSkvQXsATZrYA+D/gPnf/U8oxZfsmcHv0bzoA+FHK8WwRJdShhG/oJSOqcc0EngNeIHwGl+xd1RrmKiIiOakGISIiOSlBiIhITkoQIiKSkxKEiIjkpAQhIiI5KUEIAGb2jJltNrO9s8rOMjM3s8tbcL7p0WtbvO6ume1tZpPN7ORmvq53dG03s+9llU9rKI9xjmOiaw+IcewyM1tb7Lgi5zg9im1ynv1Xm9kH0TFfa821mhHTI1m/x4892uP6kj4lCGlwB2DAaVllDc/vbM6JoknIbibcqPRaK2LaG/gB0KwE0cjZFuwIfLEZrzsmunbRBJG0aEz//wCrCfe5PJTjmC4JXPp/Cf+G10fbf4i2xyRwraISeo9SiLvroQfAvoTJ6h6NtncE1hPuyoYwRcASYAOwkjAtxU7RvumESeVuJkytfEFWWSWwB/B3wjTpa4HHgYOj154dHVcFLADeJ0wDTlSe/TibkCxeBTYSZsG8Nsd76R0d/1r08zPAV6PYa8J/ewf4FOHu34+AD4A5hNlvj89x7d5A3+iYD6I4J0TnWRa9r6uBVYQbAT8R7TsI+AuwBngDmJgV55cIE7a9AdwUXWdyjvezrFEsxwOPRM9/Gv17nAwcDPw161rfY+u9Tg68QpiX6EPg1uj6K6Lf08AC/zdObxwb0CcquyfanhRtfznariEktFuin9WEGXNron+3odFxPYAborL3CdOJ7BPt+110zp9H+yek/XeyrT1UgxAA3P0twh3iQ8xsL2A40J2ttYe1hD/U8wkf5mdGz7MdC3w/Ok+2zYQ7Wr8FXAn0Z+u30gafIdxR6sCV0URr34n2PUb41voocBnhQ2U8YVbMdQXe1iLCbLZfjR73ED7cG9QCt0Xv42fA54HJhKRxe3TMLdG13yd8yA0lfCh/h5BAG+xASIRzCEnx3Ogb7yygHyF5PAP8xMxGRL/jadHv5grg0wXex6VZ72cMH5/S4jDgYsKH/GzgKOC7wPOEGsA5WcceEL12cVR+EXAjsD/h362t7QzUEe4CHx5dawrwCeDH0THfI9yRfT9wDWEiu982Os+Q6LhSmf1325F2htKjdB6ED3AHvk74pumEhU0gfIAv4ePfZGdE+6ZH2yOzztVQVkloKnqS8GHY8Np/RsedHW3/KNr+U7S9b/RaB6ZnnXcm4Rv/7wkfcOU53kfv6HX3EhLJhmj784S5jDw67hBCrSX7Pf0t2ndRtH12tH1wtH1XjustA+oJCfXo6LhpWa9p/LiB8EHowA+jc4wjfw1i92jfI1llj0Rlh0bbn4q2b4+2D8iON3peEz2/ItoeB2Si5wsK/L9oaQ2ijrDux9ejfT+I9r0DrIqez284Ltp+Jvpd9mBrDeKktP82ttWHahCS7S7Ch/hYwkyYL/nW1eh+TPimOY5Qe4DwgZgt38In5xPa9K8nTDhXk+O1DYswbYp+NnxwNfYfUQzvENrli61IN4PwgVNDaOrJ9h3gUMKH278TPqga4mpuR+x6d9/QKP4GDxBqHg2P7Ll3rNHP5mr8O/dGP7M11J7qop+r3b0+ep7JcXwhDa9r6BfomeOYdR5mLN1yvazX5rterrhLdkGdzk6dPrKFu//DzJ4kNBVB6LjOZoRvsye28BK7ElYhK2frh0Uh70c/DzOzMYQP+EsJ7esLCB/qfc2si7tvynUCd19jZl8FPnT3zWECzSZ6AacSvu02vvaJZvYRoYnsFeDkaKTRu0Ctu08rEP9iQn/JEELfwEeEvpy7CUljA3COmb1J0+a65lpMaGYaZWbfjK4DockrCe8Q+oGOMLMvEr5UtMR9hCbHn5vZUuAIQk1pfZ5/K2lHqkFIY9lJIXv00qXAW8AlhGaB5riR0HF7JqETONaU1e6+lNCU1Df6+W+ED/FJhOVBdyJ0XOZMDlnnucPdc31QXkGYcvwbhBpMdtKaTVjT+zTg99E1RgIPEjrhf0RYLrLQdTcRmpKeJPQL/DCK+QUPaxSMI/wNXkzxmlBB0Tf1UYTf848IfRPfJzT1tbmotnQp0I3w3p5o4al+SPj/MZzwe6gGzmqLGKX1NJuriIjkpBqEiIjkpAQhIiI5KUGIiEhOShAiIpKTEoSIiOSkBCEiIjkpQYiISE5KECIiktP/A0skupf39/gIAAAAAElFTkSuQmCC\n", "text/plain": [ - "2696" + "
" ] }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" } ], "source": [ - "len(civic.CACHE)" + "barWidth = 0.25\n", + "colors = ['#3ab15c', '#20b2e3', '#6270b0', '#f38e42', '#de495c']\n", + "\n", + "for label in levels:\n", + " idx = levels.index(label)\n", + " counts = [match_counts[x][label] for x in range(1,9)]\n", + " r = np.arange(len(counts))*1.5 + barWidth * idx\n", + " plt.bar(r, counts, color=colors[idx], width=barWidth, edgecolor='white', label=label)\n", + "plt.yscale('log')\n", + "plt.xlabel('Variants Matched from Tumor', fontweight='bold')\n", + "plt.xticks([r * 1.5 + 2 * barWidth for r in range(8)], range(1,9))\n", + "plt.ylabel('Highest Evidence Level Matches', fontweight='bold')\n", + "plt.legend(title='Evidence Level')\n", + "plt.savefig('data/grouped_bar.svg')" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "mcv = match_counts.values()\n", + "summed_counts = Counter()\n", + "for c in mcv:\n", + " summed_counts += c" + ] + }, + { + "cell_type": "code", + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAO4AAADuCAYAAAA+7jsiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xl8VNXBPvDnnHvv7JN9JRDCvoR93xdRqlgXEK0LtBZrqz9U7GKtb7VWW21LbfF9cSnWWutWt2q1KiCbioIi+yqQhCQQAoSss8+995zfH0GLCsiSybl35nw/n0gwYebJ8sy5c+bccwnnHJIk2QsVHUCSpDMniytJNiSLK0k2JIsrSTYkiytJNiSLK0k2JIsrSTYkiytJNiSLK0k2JIsrSTYkiytJNiSLK0k2JIsrSTYkiytJNiSLK0k2JIsrSTYkiytJNiSLK0k2JIsrSTYkiytJNiSLK0k2JIsrSTYkiytJNiSLK0k2JIsrSTYkiytJNiSLK0k2JIsrSTYkiytJNiSLK0k2pIoOIJ2RNACdj72VxEzeLWLyjhw8gwNpBPBRQnwKgYcCGgdMxqFzwOAcOgePcyAG4LBGSLVXJZWEkFoAhwDUAigD0Czuy5NOF5HXx7UkP4AhAAa16Gw04xjiVkhHAqh1MTNyIGLyfUHDURky3PVxEyGDI2QwBA2OsMERNBjijEOjBBolUAmgUQIHBVwKQbZDQa6TIs+lsEK3Ei10qXqBWyEdXIpb5zwQY3yPWyHr3QrdDGAHgK0AwkK/I9KXyOJaQy6A8wM6u4QD41wKKagMGuGNjTHn1mbdtatZR2XIQJPOEhqCAChyK+jh19DTr/J+6Y5w33TN7ORR3SGD73YrZIlTISsBfASgJaFhpFOSxRXDCWBs1OTTYoxf7qSk46f1sfjywxH/hsY4drfoMCz0Y3ErBIMzHRiZ7TQn5rmCfdI0T8TkVS5KXncq5GUAGwBYKHHyk8VtPxqAqS06u8FJyUWVISO+7FDE+35dVNncGLdUUb+JgwIDMhw4P9+tX1bkifk1EiPAyx6VvojW0dgUnTHZyeImFgUwNmSw7yuEXFkR1NmL1SH/OwcjpD6e2MPe9tTDp+KiDm7z8iJPuMCtgHG86FXpowC2iM6WrGRxEyNTZ/yHccZ/Uh9j7peqQ543a8LKgUjyD0TFHgVXdPIaszp74yolB3wq+RMl5HkAIdHZkoksbtvqHTTYHSoh1644HGFPlAU8W5t10ZmEoADG5Toxp6s/ODLbSU3On/GqdD6AfaKzJQNZ3LZxXrPO7qfAkKf3BbVnK4NqXSx5DoXPVQe3gu+W+PTZJV7T5HjDr9F7AOwVncvOZHHPzegWnT0cMljpn3e3eN+sCSOJnrq2uTSNYE4Xv/GDbj6DcSz2a/RuADtF57IjWdyzM6BFZwsMxkf9YVez+18HwsSU38bT5lMJvlviM2/q7o8DWOnX6G0AKkTnshNZ3DNTHNDZ/3Fg6oLdLc4XqoJUjrBnz60Q/KCrz7i5u18HwSNuhd4PICg6lx3I4p4eLWbyHzPO7/1rRdCxqCyghuUQ22byXRT3lGaEJ+e5oh6V3gbgBcgFHacki/vNRgd19uzOFr3gzi2N3sqQITpP0hqS6cDvB2aGClxKuV+jswBsE53JqmRxTy4zqLOHTY4r797W6H7rYER0npRAAHyn2MvvLk2PqoQ84FTI7yFXYn2NLO6JjQ8b7LU3ayL+B3c2OQN2Wo+YJIrcCv53SFaop1+r8Gt0JoA9ojNZiSzul6kRg/1G55h3+8YG96ojUdF5UhoB8L0uPnZH77SYRsldGiULAcjpQMjiHq8kqLM3drbo3W/ZUO+RCyiso4tXxSNDs0LFHvVTn0ZnAGgUnUk0WVwAjPPpMZM/s2BPi/vJ8qAivyPWoxLgntKM2BWdPA1elX4LKT5xlerFJRGT/Spi8J9f/8lRz7YUXVdsJ5cXefgDAzIiLoXMoYS8JDqPKKlcXFfQYM/VhM0LZ39c55WHxvbRJ03D0yNzwj6V/M2j0p8ASLnX6FK1uAVBnb370dFY93kb692ys/aToVEsGp4d7pumrfVp9BIAKfV6XSoWtzRssFVPVgQzFuxu0USHkc6eSoCHh2RFJua6dvk0OgVAk+hM7SXVijssYrAV/7O1yf/vmjARHUY6dwTAr/tlxK7o5DngVel4tG4zm/RSqbjjwwZ75/aNDb5lh+Xrs8nmlh5+/abu/vpj5S0TnSfRUqW4E8MGe+dHn9Z7PjwaE51FSpBrir3m3aXpLR6VjkKSr7RKheJODhnsrR+sq/d8XC9Lm+yu7ORhv+6X0eBR6TAAVaLzJEqyXztoeNhgb90gS5syXtkfpg991pIZMtgaAAWi8yRKMhe3e8Rgy+ZtbPB8IkubUv6+L6gsKgvkhgz2EYAs0XkSIVmLmx8y2Ae/2dnsWy4nolLSwr0B7YWqUMegwT5A67WYkkoyFtcXNNiqv1cEc/5ZFVJEh5HEeXBns2NpbaRbUGf/QpL9rifVFwNADers7aW1kS5/kosrJAB3bWl0VYSMMRGDPSg6S1tKquKGDfa7zwL6sDu3NLpEZ5GsQefAnE+OekMmv5VxfpXoPG0lmV4OurQhZv7zgvcOexrk1ovSV/RJ0/Dq2NywR6XjAGwSnedcJcuI2y1isOdvWFcvSyud0K4WHXdsbnSHDfYuWq9HbGvJUFx30GCL53/W4t7cFBedRbKwd2oj5PmqUFpAZy+hdZmzbdm+uEGDPfFhXbTj0/uCcgZZ+kbzdzU7DkbMEXHGbxOd5VzYvbgXhw0+42ebGt2ig0j2YHDgjZqwx2T8TwD6ic5ztuxc3IywwZ65fWODJySvKiCdhnSN4Inh2ebN3b1YUbOahozIvwA4ROc6G6roAGcraLDH3qoJe9bK5YzSaRiT48TCoVk4Ej6Imct+S4JGGIWenI6lmd3vc6nOu0TnO1N2fTnowrqo+a/JKw/J0VY6JQcFftEn3fxOsVd5Zs+reLbsP198LNuZjhenPBTxqK7hAHaIS3nm7Fjc9LDBym/8tD57jTy3VjqFnn4Vi4ZlcycNsx+v/Y2yP3T4a58zs8tUdmPvK9b7NM8o2OhCY7Z7jhs22P1LD0W8srTSyRAAN3TxsdfH5WFb/RrMXH7LCUsLAK9XLqeNsZZSAFe2a8hzZLcRt1vYYNsnrjzkOiq3ZpROIM9J8cjQbNbdB/xq/Z/oxvpd3/hvBmb1xJ9G/bzBrTo7wybX57XViBvQ2cLHywKqLK10IhcWurF8cgHAKviMZTedVmkBYEvDHqw9stkVNWK/TmzCtmOnEXdCfcxcPHZ5rUf2VjqeTyV4oH+GeV6+i/7f9qfI2/s/OOPbyHFm4MUpf4y4Vdcg2GC/KruMuDSgs0X3bW+SpZW+ZEimAysm5/N+aQFcveK2syotAByNNeG5srccQT38+zaOmBB2Ke5VNRGz43/kxaWlY1QC/Kx3Gnt2VA6WVL9DZr93h9IYbzmn23y5YqlCQC4E0LNtUiaOHYpLAjp78Pe7mn2ig0jW0MWr4u0JeezKjpTftPqXWPTZK21yu2EjihfK39FCeviBNrnBBLJDcS+ui5m578uLTEsAri328rcm5KG6ZROZvuxmpTxwoE1v/5WKpSoh9GIAPdr0htuY1YtLWnT24J93t8jRNsVlOSieGZlt/ryPF/euX4B7Nz6SkNPyQkYEL5Yv1oJ6+DeJuP22YvXiTggbvOti+dw2pU3Kc2Hl5HxkKLWY8e7NZM2RzQm9v5fKl6gKoZcB6JrQOzoHli5ui84eeHhPs0dOJKcml0LwuwEZ5iNDM/HU7hdw00f3KlGW+M0SgkYYr1euoBEjenvC7+wsWbm4vQAMef2AvKpeKipN17B8Uj4fkx3D7JU/xav73m3X+/935UoHIeT7ACy58aBlixsx2I9erg6pcgup1EIBzO3uZ6+MycXaQ6twzcrblcPR+nbPURM+gt1NlRzAjHa/89Ng1fNxNRDc8M/qkNwbOYUUuRU8PiyLdXAzzFt7P3Y0lgs92nq5Yom/q7/jHX6H9wWROU7EqiPuxeUBg1QEDdE5pHZyeZGHL52Uj+boZ5ix9Ca6o7FcdCSsPrQJaH3K1ldwlK+xZHGbdXb70/uCSXe9F+nr0jSCRcOyzPv7p+EPmx/Dneseogas8fzI5CZer1yhRYzoraKzfJUVi1vkIBj5Tq18CSjZjcp2YtXkAnR2N2DmsrlkxcFPREf6mjer31MJodfBYk8rLVdcxvmMZYejLCK3pElaDgrcU5pu/m1ENl6teA1zPrhLCRph0bFOqDZch0PhOg5gougsx7NccVt0Pvvtg2GP6BxSYvTwqVgyMZ9fVMAx5/2f4x973xQd6Rst3v+hL6RHZonOcTyrFTfDrZCBq+vktjTJhgD4fhcf+/f4POxsWIuZy+eedDsZq1l1cB1VCJ0BC/XFUsftAKatb4jFIia35V630onlOSkWDs1iPXwEd677PTYe3WWrRTU14SNojLeQQjV3OABLPBG3zCMIADTH2TX/ORiWs8lJ5FsFLiyfXADKKlu3kzl6etvJWM3Kmk9ccVOfLjrH56xUXIdLIVNWHJan7yUDr0KwYHCm+dCgTP7Ijqcwb+0DSpzZ93X5Dw5t1GIsbpmdIK10qDy8JmLoR2NMXgfI5gZnOvCXYVk8qNfj2hV3kvp4s+hI52xXUwWc1NERQDoA4V+QZUZck/NJ7x+JWnJBt3R6VAL8tFcae25UDpbuX0xmr/qZkgylBVoXY5S37I8AGC06C2ChEbdFZxevrY/JSSmbKvGqeHxYFsty6Lh59d0oC+wXHanNravb5uue3mmiRrUlorNYZcSlXoUO3tAgL0xtR1cXe/nbE/JQE9hCpr97E03G0gLA5vrdStiIXSg6B2CdEbd3k86MBnkOn61kOSgWDM40B2Yo9L4ND+PDw5ts9TLPmdrRWAaP6uqL1ktzCh1lrDLijtrQKBdd2MnEPBdWTM5HpnoIM5fPJR8e3iQ6UsKFjAgOR+qjAIaIzmKJETdssJEbGuJyQzgbcCkEvypNNy8tcitP7noBL+97VxGdqT1tb9irdvTmDwLwscgclihujPHBZfLcW8vrm6Zh0fBsznkLZq+8GyJ2phCtvGW/J2rG+rsUp9Acliiug5JuFUFddAzpJCiAm7r72S09/PTt6hVYsP2ZlBplj1cVrEXUjA+WxQV8DkrSasKm6BzSCRS5FTw2LIsVuRluX/sbbG8sS+oJqG9SFTwIjardReewwuRUz9qIGZbzydZzaZGbL5mUj0BsN2YsvYlubywTHUm42nAdnNSRBUDoCj8rjLi99gb1lH4Utxq/SjB/UKY5NsdB529+HCsOfmyFB3hLMDlDfbQpnO/J7gFgq6gcVviBdNsb0OWJ8xYxMtuJVecVoMsX28kInTy1pKrgQQ7B1xYSPuKGDNbxSJSl7GSHVTgo8PPe6eY1nb3K82Wv4ek9b8ifyUkciTZoAPJEZhBeXJ3xooa4nJgSqbtPxaLh2dxDI7jh/V+gOlQrOpKl1UebnJzzHELEPcMTfqjMgQK51FGc67v42BvHtpO5YvlcRZb2mzXHgzRqxjqIzCB8xKWE5NTHZHHbW+6x7WR6+QnuWjcf64/ukBOEp6kpHkCcGYUip5WFF1cjyJQjbvuaWuDCQ4OysKdpL6a/O5/G2+EKeMmkOR4A5zxfZAbhxXVS4muUz3HbhUch+O2ADPOCfBd9ZMffyX+q3xP+VMmOmuNBUEJyRGYQXlxKQOWAm3iDMhz4y/AsHk6i7WRECelhUEKFbmoouriEEELkNQsSRyHAvJ5p7IauPvp65TvksZ0vyZd5zpHBTRAQoUcroourMM45WvfLltpYZ4+CvwzPZtmazud+eA/2tFSLjpQUTM5AiNhXZEhrb4RxmpyHu79VI59rJcC+SzqKjmBbnHPgS28AR+v7hCow9XhUdbuFTSyLHnEp42CwwOvJyehQMGY6DJOq3GA+n1MhlMKsr2Ls0GeUHS0H4hGAqgBVWt8IBRQVhHz+dwVEaf2z9XPof98nFOTzf0M//xx67O+09TaOvQ9y3BulACjI5/8fhH/xsc/fB+UAASgBQPDFUWnr5wCEoPU/x94nBAQEHAAhx32c4L+f/9/3CY4tnGhdQHHsY+S42zr2hq8ssDj+bwpXhXZH9Ijr1Rlv6vl2jegHkKQ0NNOB50dk4a4HlqO5JYaBpfkYM7wYvTt7TK/Po7BQPTf3fcLMyk8V88AWQJeXNj0dJLMTPNc8epg4vQXCMggursI417u/VSMnqBLk32NyzFBFHf72wsYvTUqpKsXIIR0xcnAH3q3IxZwej8Ia9jOzYi0xqtYTdmgXYMrNDU6EZneB+6oF+4nLXywqg+iRzjQ44j6VOAOGrG4izN3UoKyY0AFLVu5FzaHAF//fMBg+WleNj9ZVEwCKx61i/KgSOqz/VNap36VcdTqpeWSvaZavoWb1BsKOlAGQPyMAgOoAwIXubih6xEXIYPVT3zucdTAiF2EkyuNDs3hJOMz++OhHp/1SUHamGxPHlGBw3xwzP8tJKaXErNlmGhVrFLN6E3jTgURGtjRa1B/uS+7bTtzp/UVlEF7c5jirvHpNXefPAvKwLFE8FFh/fgF/9Kl1ZNeeurO6jeKO6Zg4ujP698gwM9NdCkwdZvUG06z4WDH3bwIPpc7GcUrxULim/XI9cacPF5VB9KEyTPCWNE2+jJtIYQY8dyBCZs0cwO/+3QpyNo/V1Qea8ewrWwFAAYC+vXIxbkQPpe+owabvArfCw83cqFzHzMp1inlgMxALte0XYSHEkwmAHBKZQXhxwdGUrslXgxLtwZ3NuGpKPh81tBPWrt9/zo+UO3fXYefuOgBQKAVGDCoiI4cOpT0mjTe9Xo/CmmqZuW8NzKoN1KzZDpjJcyID8ecCmmuvyAzCi6tSsj/HJVfhtYc/7A3Quy4rxfotNdD1tlsgzhjw8cYafLyxhgBQXA4V40Z2osMGTuGd+0xjLqeLmkcr/jvRdXgPwO27QJ2mF0aJ6tgnMoPw4vpVsr2LVzWskCXZ/bM6jFu6+sxvTepO3lq2J2GHOdG4geWr92H56n0EAMlIc2HimM7KkL6XmQVDrqKKqhKzdqdplq9RjOqN4A1ViYqSECS9MAZA6JXNhJeFEFLe3aeGAaSJzpIK7tjWpDw1pQfeX1uJQLB9Dl+bWqJ4Y8luvLFktwIAHfL9mDimRBnY91oze8wcCs6IuX+TaVasbZ3oChxpl1xni6blA4KLK3xWGcCwfUF9+XmrDqeLDpIq3h6bY9btPoRnXt5iiecoPbtlY/zIYpR28xtpfo/KY0FuVq5jRuU6xdy/GYi2iI74Jd5b3okTzdkBgLCpdOEjLoCKQrcqdHPpVHPL5iZl8bhOWLqqHIfrgqLjYE95PfaU1wOASikwuH8hGT10EO01fnTrRFegjpkVa2FWradmzTbAiArL2jqjjDgElhawxohLYiYPjVpW625qwwkT6dSeGpbFcpuDfMGitZYYdU/GoVKMHlGMEYMKWZdCFxxuN2X1VcwoX0PM6vWEHfoMYO23eEfpOBCub9+7jbjTB7TbnZ6AFUZcHjF5VQ+/2vtTeUX6dnPb5ga67rwC9Oiahb0VDaLjnFTcYHh/TSXeX1NJAcDndWDimBI6tHQaKxo4nSsOBzUP7TbN8o+oWb2RsKP7kMilmSSrE0CUbQm7g9NkheJCo/iof4ZDFrcdBQ3g1YMRzL5yILt3/ioq/sDr9ARDcby9bA/ePjYrnpfjxaQxJcrA3t8xc0d9j1KAmDVbTKN8TetEV3PbbjdLs7vEicu3sU1v9CxY4VAZAK5fWhtZeNP6enlx63ZEAWw+P5+98MoWum5Tjeg4baJrcSYmjO6Mft3TzPR0twI9ys2qT5m57xPF3L8ZPNx4Trfvvnphs1LY91oA77RN4rNjiREXwKeDMx2iM6QcBmBBeYjOm96Pb9xaSwzT/nMMFdWNqKhuBI4tzRxYmk/GDC9Veo8Z/vVzkGu2AvHw6d84VUBzu7sBrE1I+DNglRFXiTMeGvHuQWezbok8KeXjyXnm+yv3kKWrypN67ekJz0FuPMDM8jXErN5AzNqdpzwHmeb3gnvG/Cri8pW0X+oTs8qIa4YMtmtAhmPQ6jqhpzmmpLt2NCuPfas3PvykGqFw8p6l9U3nILucTmoeKTs20bWBsLryLy3NVIoGcFC6TNxX8F9WGXERNflDT5YHbv/T7hZLvzyRrJaOyzX3b6/BC69tS9nv/wnPQT64/dhE10Y4z7stoBYP/QGAl0VntUxxAUzeG9D/PfW9w3LpowA9fSreHJuLX81fiaP1Z/C8L4l99Rxkojo4oUoRAOFXRrNScR0xkzePWV7rktcSEuO5EVnMW9fMF/5tXcqOuifTpTgDP7lpzH6PWxO2z9TxrDQZEY+YfM34XKfoHCnrlk1NtHePXKVr50zRUSynf598U1Xo66JzfM5KxUWGg748tcCdvFsnWFyTzvDGoSifNXOAPOT5iiEDCkMOh/If0Tk+Z6niAlg8IdelyI1sxLl7axPJzvaSwf0LRUexDK9HQ0GezwngQ9FZPme14lYz4LBcjCGOAeCxyhC5bkZ/rlD5EAoAwwcVcV1nKwGIOy3pK6xWXLgV8tSMjh7LfINS0eNlQZiawiaNKbHMzKVIk8aWBD1u7VHROY5nueJqlDx/aZGHy+Nlse7ZGVCmX9yHuF1WWaMjRkGeD3k5XgZgqegsx7NccQGUM2Df+FyX6BwpbcmhCGpjzPz2Bb1SeqJq3MhinYA8jdZnEZZhxeLCr5KFszp75eyyYPO2NCqTx3WhWRmpuUEJIcD4kZ11h0N5UnSWr7JkcSkhL47NdSkZcr9lobY3G9jaHGdXXVaakteH6dszF5SSAwC2i87yVVZtRpPO+OKrij0pfZhmBXM3NtIBffOV4o6pt5ffxNElYbdLXSg6x4lYtbjwa/SBH3XzR1U5SSVUXZxh8ZEYnz1zYEo9iLpdKvr3zVcIIf8UneVELFtcABtUSnZdVJiaz6+s5M4tjSQ/30cG9M0XHaXdTB7bxTQMtgSCd3M8GSsXF2kavf/WnmmBb/5MKZHiDPhbVYRcd8UATlNgUYbDoWDa+T3iHrf2S9FZTsbSxQXwVqFbaRmRJVdSibZgTwuoU+XjRhYn/aKM88Z2MTnHSgA7RGc5GasXl3kU8ttbeqaJ37Vbwv27A3Tmt/sSpyN5z/pzaAouvqBn3OPW7hKd5VSsXlxQQv4xNNOhD0jXREdJeW/URFBvcHPa+T2SdqJq0pgSBuADAML3Tj4VyxcXQMSpkF/c2z9DjroW8OOtzcoFE7vRjLTkW9mmaRTfntoz5nFrvxCd5ZvYobhQCHmqp19rmiBPshduY2McuwKGOfOSvkm3KGPiqBJGCPkIwGbRWb6JLYoLwPCpdN6v+2UEk39O0/rmbmxQhgwoVIoK/aKjtBlNo7j0wl62GG0B+xQXAF7PcSqVlxbJ13VFOxhlWFkX59ddMSBpRt1LpvbSKSGrAGwQneV02Km43K/RuXeXZoTd8pw/4X62uYF0KsqgfXvmio5yzjoU+HH+hK5xt1u7UXSW02Wn4gLAB05K3rmjd5rcNV2wMAOeOxAhs2YO4MTGj6OEADdcOySkKvROAAdF5zlddisu/Br9f1cXe2P95MtDwv1uVzNcXicfPayTbRdljB/ZmefnevcpCv2L6CxnwnbFBVDnVMgtDw/OCskjZvF+tzdAv3NZP+LQ7LcoI83vxHcuK426Xdp1AGz1fN2OxQUl5Lk8l7L1hq4+W32zk9FL1WE0M5hTJ3Wz3aKM664YECGELAKwVXSWM2XL4qJ1omr2vJ5p8RJvau+JZAV3bGtSpk3pQf0++6wpL+2Vi36981qcTvVu0VnOhl2LCwDlGiV3/nV4dshh568iCaytj6MsZJgzpvWxxRGQ16PhB9cNDbuc6vUAbLlFkq1/5TVKHsl3KavvKc2Qs8yCzd3UqIwc2kkpyPOJjvKNfjh7aNjhUJ4BsER0lrNl6+Ki9ZD52hkdPc0XFCTf2lk7qQqb+LA+xq6Z0d/So+7USd3MbiVZlS6nOk90lnNh9+ICQKNHpZf9eVBWpMhtv5nNZHL75gbarSSL9uyaLTrKCXXrnInLL+odcbu0iwHERec5F8lQXAD4WKXk/ifk812hggbwUk2EzLpyALPaooz0NCduvXFk2OlQrwVQKTrPuUqaX3OXQuYXe9RVCwZnRURnSWW/2dEMf7obwwcViY7yBVWhuP3GUSGnpj4EwDJX3DsXSVNcAMyn0asm5LnKftorTRcdJlUxAAvKg/Tq6f25qlrj12v2lQOiuTne1Q6Hcp/oLG3FGt/ZthPxqfT8OV19DTM6emy7DM/unt4XQpQQdv74rsIXZUyb0sMYNqio1u3SrkLr40pSSLbiAsARj0rP+23/jNDIbPssCEg2d+5oVi75Vi/q9YhbUz5lfFfz4gt61rmc6jgASbVbaDIWFwB2ulU6/cnhOZG+afJkBBHePxJDVdgwL79IzKKMcSOL2RUX92l0OdUxsNFZP6crWYsLAMs9Kpn14pjcSB9ZXiFu2dykjB1RrORme9r1fkcMLuLXzujf7HSqY5EEM8gnkszFBSXkNa9KvvvimNxIb78sb3srCxpY1xhnV09vv0UZg/oV4PqrBwWcDnUCgD3tdb/tLamLCwCUkFd9Krn+pbG5kZ5+eUJCe7ttUyPt3T1H6dY5M+H3VdorFz+cPTTodKjnwYJX2GtLSV9cAKCEvOxTyZxXxubJkbedNekM/66N8llXJvaiYaW9cjH3+yNCTod6IWyyb9S5SIniAq3X3PWp5PpXx+VGxubIbV7b0z3bmkhWlocM6V+YkNufNKaEzZ0zosXpVC8E8FFC7sRiUqa4QOvI61XphX8dnh2Y2Um+ztteDACP7AuRa6/oz5U23LaEEODq6f1iV15aetDpUIcC+LDNbtziUqq4x3zgVunI+/pl1P1YrrBqN4vKgzAUhU0aU9ImD5gOh4J5N44KjxtRvMXlVAcCKGuL27WLVCwuAOzyqHTgnK6+sgWDs6LyxIT2cfeuFmX6tD6B6LoWAAAFdElEQVTE7Tq3ScKMNBfu+fGEUI8u2f9xu7TxABraJqF9pPKv7CGfSodPyXe998b4/FAHeUpgwr17KIqDUWZeMrXXWb881LFDGn59x6RwTrb3jy6Xeg1sfnre2Url4gJAyK/RaSVe9bdLJuZH5LWJEu+2LY3KpLFdlOzMM78ixYRRndn/3DY+7PU45jg05T4AKTtPQThP2a/9qyaGDfba81Uh3/xdzQ5DflsS5sVR2UypaeCP/2P9aR3mpPmd+MF1Q8JdO2cecLu06QB2Jjii5cniflluQGcvH4gYw29Z3+CtCBmi8ySlHAfF6sl5mL/wQ1QdaD7l5w7uX4g51wyOKApZ6HSo9yBFD42/Shb366jO+K0G5w8+ujfgWFQWUOXo2/b+PCiTD2Rx/sDDH5zw6ZrLqWL2lQMig/oVNrmc6kwAa9o5oqXJ4p5cSUBnzxyJmUPmbWzw7miWrxy1JQcFNk4p4E8+u55s3Xn4Sx/r2S0bN39veNihKa+6XOpcAPKi5l8hi3tqhHE+K2byR5+rCjn/vLvFETXl96utzOvpx6xcjd/1wHLCGEeaz4mZl/SNDBvUIeZ0qLMBvCU6o1XJ4p6e/IDOnjA4P//Bnc3u1/aHSdJspSDYhvPy2JKlnxFNU9ilU3vFATzpdKr3ADj1k98UJ4t7Zka36Ozx+pjZ/dfbm7wf1Ml92M/V/f3SMavEh2jUWOl2azcjiU/Fa0uyuGeOALg8aLCFu5r1jPt2NMnnv2dhXI4T9/TLCBa6lAN+jf4EwGLRmexEFvfsaQbjN+qMP7C1WVf/d3eLb229HIFPRSHAhQVu3N4rLVjgUhp9Gr0DwCtIok3c2oss7rlzMs6vCxv8vkNRM+PhPS2+JbURyDms/3JS4MpiL7+1R1rYpZCyNI3+Cq0TT7KwZ0kWt+1QABe36Oz+OOM9nigPuF8/EKZHY6n7u1niVXFVJ48xq8Snc2BNmkbvRYqcL5tosriJMSqgs3kaJZdtaIiZz1WFfCsPRxBPgQ77VIJphW7MLvEFuvlVcI5nPCp9FMAu0dmSiSxuYvkAzGiOs1sVin5vHAiTN2vCzg2N8aQ6lHYpBGOynZje0RM+v8CtxEy+Ot1BHwHwDgA5c5cAsrjtpyRm8uujjF+nEXRcXRczF9eGve8fiaFJt99QXOBSMDnfhW93cAeGZjqdEZNt92v0OYWQ5wEcEZ0v2cniilEEYFpTnF3tUcnY8oAeW3Ek6t3UEFc2N8XRYMFj6kKXgiGZDgzNcsQn5bmihW5ViTO+NE2jLwNYCqBJdMZUIosrngvARJ3xCUGDne9RaP9mnZnrG2Lkk/qY97MWHftCBuraaZKLAih0K+jiVdEnXeOjs53BwZlOxanAjJp8fbpGl1FCVgP4GK3bSUkCyOJaDwXQG60TXJMNzoe4FVpMAGdt1IxUBHWyu0X37A+bSn3cRH2MoTHO0GIwBHSGU/XbqxBkOCgyj71laK1/dvSoRi+/Fu7mU5HnUjxxxltijO9zUrLRq9L30FrSfUjhE9etRhbXPjIAdAfQg3HeI2Dw3ibnhRTIVwjJVCm8Dko8BKAc4KR1hdcXCEAMjnic8aDB0MTAGwDUaYQc8qqkjBCyB63LDcsAhNv/y5POhCxu8jl+x3f+lT+FXIBLanuyuJJkQ6m+WZwk2ZIsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNmQLK4k2ZAsriTZkCyuJNnQ/wdwXxwAxPLZCgAAAABJRU5ErkJggg==\n", "text/plain": [ - "1520" + "
" ] }, - "execution_count": 22, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "len(civic.COORDINATE_TABLE)" + "counts = [summed_counts[x] for x in levels]\n", + "plt.pie(counts, colors=colors, wedgeprops=dict(edgecolor='w'))\n", + "plt.savefig('data/pie.svg')" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1520, 5)" + "6.876070461160556" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "civic.COORDINATE_TABLE.shape" + "# Average number of reported variants / tumor\n", + "df.groupby('key').count().alt.sum() / len(genie_samples)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/civicpy/__version__.py b/civicpy/__version__.py index 7e42eca..c98f45a 100644 --- a/civicpy/__version__.py +++ b/civicpy/__version__.py @@ -1,7 +1,7 @@ __title__ = 'civicpy' __description__ = 'CIViC variant knowledgebase analysis toolkit.' __url__ = 'http://civicpy.org' -__version__ = '0.0.3a1' +__version__ = '0.0.3' # __build__ = 0x021901 __author__ = 'Alex H. Wagner' __author_email__ = 'ahwagner22@gmail.com' diff --git a/requirements_dev.txt b/requirements_dev.txt index 9ba7a5e..2cf07ec 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,2 +1,3 @@ jupyter==1.0.0 -notebook==5.7.7 \ No newline at end of file +notebook==5.7.7 +matplotlib==2.0.2 \ No newline at end of file