Skip to content

Commit

Permalink
Merge pull request #252 from Sage-Bionetworks/develop
Browse files Browse the repository at this point in the history
v6.0.0 release
  • Loading branch information
thomasyu888 authored Mar 13, 2020
2 parents a8286ca + 1e069d2 commit 02f63db
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 47 deletions.
65 changes: 34 additions & 31 deletions genie/__main__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# noqa pylint: disable=line-too-long
"""genie cli"""
import argparse
import logging

import synapseclient

import genie.config
import genie.validate
from .__version__ import __version__

import synapseclient

logger = logging.getLogger('genie')

Expand All @@ -20,36 +25,35 @@ def synapse_login(username=None, password=None):
syn = synapseclient.login(silent=True)
except Exception:
if username is None and password is None:
raise ValueError(
"Please specify --syn_user, --syn_pass to specify your Synapse "
"login. Please view https://docs.synapse.org/articles/client_configuration.html"
"to learn about logging into Synapse via the Python client.")
else:
syn = synapseclient.login(
email=username,
password=password,
silent=True)
return(syn)
raise ValueError("Please specify --syn_user, --syn_pass to specify your Synapse "
"login. Please view https://docs.synapse.org/articles/client_configuration.html"
"to learn about logging into Synapse via the Python client.")
syn = synapseclient.login(email=username,
password=password,
silent=True)
return syn


def build_parser():
import argparse
parser = argparse.ArgumentParser(description='GENIE processing')

parser.add_argument("--syn_user", type=str, help='Synapse username')

parser.add_argument("--syn_pass", type=str, help='Synapse password')

subparsers = parser.add_subparsers(title='commands',
parser.add_argument('-v', '--version', action='version',
version='genie {}'.format(__version__))

subparsers = parser.add_subparsers(title='commands',
description='The following commands are available:',
help='For additional help: "genie <COMMAND> -h"')

parser_validate = subparsers.add_parser('validate', help='Validates GENIE file formats')

parser_validate.add_argument("filepath", type=str, nargs="+",
help='File(s) that you are validating. \
If you validation your clinical files and you have both sample and \
patient files, you must provide both')
help='File(s) that you are validating.'
'If you validation your clinical files and you have both sample and '
'patient files, you must provide both')

parser_validate.add_argument("center", type=str, help='Contributing Centers')

Expand All @@ -63,17 +67,17 @@ def build_parser():

validate_group.add_argument("--filetype", type=str,
help='By default, the validator uses the filename to match '
'the file format. If your filename is incorrectly named, '
'it will be invalid. If you know the file format you are '
'validating, you can ignore the filename validation and skip '
'to file content validation. '
'Note, the filetypes with SP at '
'the end are for special sponsored projects.')
'the file format. If your filename is incorrectly named, '
'it will be invalid. If you know the file format you are '
'validating, you can ignore the filename validation and skip '
'to file content validation. '
'Note, the filetypes with SP at '
'the end are for special sponsored projects.')

validate_group.add_argument("--parentid", type=str, default=None,
help='Synapse id of center input folder. '
'If specified, your valid files will be uploaded '
'to this directory.')
'If specified, your valid files will be uploaded '
'to this directory.')

parser_validate.add_argument("--testing", action='store_true',
help='Put in testing mode')
Expand All @@ -82,17 +86,16 @@ def build_parser():
help='Do not check hugo symbols of fusion and cna file')

parser_validate.set_defaults(func=genie.validate._perform_validate)
return(parser)
return parser


def main():
"""Invoke"""
args = build_parser().parse_args()
syn = synapse_login(args.syn_user, args.syn_pass)
if 'func' in args:
try:
args.func(syn, args)
except Exception:
raise
# func has to match the set_defaults
args.func(syn, args)



if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion genie/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "5.0.0"
__version__ = "6.0.0"
25 changes: 15 additions & 10 deletions genie/assay.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _process(self, df):
df['gene_padding'] = 10

if not process_functions.checkColExist(df, "variant_classifications"):
df['variant_classifications'] = pd.np.nan
df['variant_classifications'] = float('nan')

df['CENTER'] = self.center
return df
Expand Down Expand Up @@ -185,15 +185,20 @@ def _validate(self, assay_info_df):
warning += warn
total_error += error

target_capture_kit = read_group_headers['target_capture_kit']['enum']
warn, error = process_functions.check_col_and_values(
assay_info_df,
'target_capture_kit',
target_capture_kit,
filename="Assay_information.yaml",
required=True)
warning += warn
total_error += error
# target_capture_kit = read_group_headers['target_capture_kit']['enum']
# warn, error = process_functions.check_col_and_values(
# assay_info_df,
# 'target_capture_kit',
# target_capture_kit,
# filename="Assay_information.yaml",
# required=True)
# warning += warn
# total_error += error

if not process_functions.checkColExist(assay_info_df,
"target_capture_kit"):
total_error += ("Assay_information.yaml: "
"Must have target_capture_kit column.\n")

variant_classes = ['Splice_Site', 'Nonsense_Mutation',
'Frame_Shift_Del', 'Frame_Shift_Ins',
Expand Down
37 changes: 37 additions & 0 deletions genie/dashboardTemplate.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -527,4 +527,41 @@ if (!is.null(this_bed)) {
kable(final_matrix,
caption = "Distribution of top 5 most frequently mutated genes per SEQ_ASSAY_ID")
}
```


Each panel's bed file is used to generate a gene panel file used in cBioPortal. All bed files undergo gene symbol remapping during processing.

* `num_genes_remapped`: number of unique genes a panel has after submitted genes are remapped
* `num_genes_submitted`: number of unique genes found in the site's submitted BED file prior to remapping
* `num_expected_genes`: size of the panel as defined in the site's submitted assay_information.yaml file

Ideally, `num_genes_remapped` and `num_expected_genes` should match. Panels that have differences between the two values are shown below. `NA` means that the panel has an invalid assay information file.

```{r genepanel_diff, echo=F}
# Build a per-panel (SEQ_ASSAY_ID) table of gene counts derived from the BED
# data next to the panel size declared in the assay information, and render
# the panels whose counts disagree or whose declared size is missing.
# `&&` is the scalar, short-circuiting operator intended for `if` conditions.
if (!is.null(assay_infodf) && !is.null(this_bed)) {
  # Keep exon rows that are flagged for the panel and carry a symbol
  in_panel = this_bed$Feature_Type == "exon" &
    this_bed$includeInPanel == "True" &
    this_bed$Hugo_Symbol != ""
  gene_panel_bed = this_bed[in_panel, ]

  # Get count of symbols per panel
  symbol_count_per_panel = table(gene_panel_bed$Hugo_Symbol,
                                 gene_panel_bed$SEQ_ASSAY_ID)
  # Get whether or not a panel contains a certain symbol
  symbol_bool_per_panel = symbol_count_per_panel > 0
  # Sum of the columns gets number of unique genes per panel;
  # row names of the resulting data frame are the SEQ_ASSAY_IDs
  number_of_genes = data.frame(colSums(symbol_bool_per_panel))
  colnames(number_of_genes) = "num_genes_remapped"

  # Same counting, but keyed on the `ID` column instead of Hugo_Symbol
  # NOTE(review): presumably `ID` holds the symbol as submitted by the site
  # (before remapping) -- confirm against the BED processing code.
  submitted_count = table(gene_panel_bed$ID, gene_panel_bed$SEQ_ASSAY_ID)
  submitted_bool_per_panel = submitted_count > 0
  num_submitted_genes = colSums(submitted_bool_per_panel)

  number_of_genes$num_genes_submitted = NA
  number_of_genes$num_expected_genes = NA
  number_of_genes[names(num_submitted_genes),
                  "num_genes_submitted"] = num_submitted_genes
  # as.character() guards against SEQ_ASSAY_ID being a factor, in which case
  # row indexing would use the integer level codes instead of the labels.
  number_of_genes[as.character(assay_infodf$SEQ_ASSAY_ID),
                  "num_expected_genes"] = assay_infodf$number_of_genes

  # Rows to report: submitted count differs from the expected count, followed
  # by rows that have no expected count at all. which() drops NA comparisons,
  # so the two index sets cannot overlap.
  # NOTE(review): the accompanying text says remapped vs expected should
  # match, but the comparison here is submitted vs expected -- confirm which
  # one is intended.
  mismatched_rows = which(number_of_genes$num_genes_submitted !=
                            number_of_genes$num_expected_genes)
  missing_expected_rows = which(is.na(number_of_genes$num_expected_genes))
  different_from_expected = number_of_genes[c(mismatched_rows,
                                              missing_expected_rows), ]

  # Must stay the last expression so the chunk prints the table
  kable(different_from_expected,
        caption = "Number of submitted vs expected genes in a bed file")
}
```
5 changes: 3 additions & 2 deletions genie/input_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,10 @@ def check_existing_file_status(validation_status_table, error_tracker_table, ent

validation_statusdf = validation_status_table.asDataFrame()
error_trackerdf = error_tracker_table.asDataFrame()

# This should be outside of the for loop so that it doesn't
# get reset
to_validate = False
for ent in entities:
to_validate = False
# Get the current status and errors from the tables.
current_status = validation_statusdf[validation_statusdf['id'] == ent.id]
current_error = error_trackerdf[error_trackerdf['id'] == ent.id]
Expand Down
3 changes: 0 additions & 3 deletions tests/test_assay.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,6 @@ def test_invalid__validate():
"Please double check your instrument_model column. "
"This column must only be these values: value1, value2, None\n"
"Assay_information.yaml: "
"Please double check your target_capture_kit column. "
"This column must only be these values: value1, value2\n"
"Assay_information.yaml: "
"Please double check your variant_classifications column. "
"This column must only be these values: Splice_Site, "
"Nonsense_Mutation, Frame_Shift_Del, Frame_Shift_Ins, "
Expand Down

0 comments on commit 02f63db

Please sign in to comment.