Skip to content

Commit

Permalink
CWL: support for variant inputs to SV and het calling
Browse files Browse the repository at this point in the history
Correctly pass variant outputs to structural variant calling so they
can be used as inputs to heterogeneity and structural variant callers.
Add code to handle variant inputs from both standard and CWL runs.
  • Loading branch information
chapmanb committed Jul 11, 2018
1 parent 612d9a5 commit ed98597
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 24 deletions.
5 changes: 5 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 1.1.1 (in progress)

- CWL: support for heterogeneity and structural variant callers that make
use of variant inputs.

## 1.1.0 (11 July 2018)

- Germline calls: rename outputs to `samplename-germline` to provide easier
Expand Down
18 changes: 10 additions & 8 deletions bcbio/cwl/defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,15 @@ def _variant_sv(checkpoints):
"seq2c", "simple_sv_annotation", "survivor", "svtools", "svtyper",
"r=3.4.1", "vawk"],
disk={"files": 2.0})]
sv_batch_inputs = [["analysis"], ["genome_build"],
["work_bam_plus", "disc"], ["work_bam_plus", "sr"],
["config", "algorithm", "tools_on"],
["config", "algorithm", "tools_off"],
["config", "algorithm", "svvalidate"], ["regions", "sample_callable"],
["genome_resources", "aliases", "snpeff"], ["reference", "snpeff", "genome_build"],
["sv_coverage_rec"]]
if checkpoints.get("vc"):
sv_batch_inputs.append(["variants", "samples"])
steps = [s("calculate_sv_bins", "multi-combined",
[["align_bam"], ["reference", "fasta", "base"],
["metadata", "batch"], ["metadata", "phenotype"],
Expand Down Expand Up @@ -506,14 +515,7 @@ def _variant_sv(checkpoints):
cwlout("inherit")])],
"bcbio-vc", ["cnvkit"],
disk={"files": 1.5}),
s("batch_for_sv", "multi-batch",
[["analysis"], ["genome_build"],
["work_bam_plus", "disc"], ["work_bam_plus", "sr"],
["config", "algorithm", "tools_on"],
["config", "algorithm", "tools_off"],
["config", "algorithm", "svvalidate"], ["regions", "sample_callable"],
["genome_resources", "aliases", "snpeff"], ["reference", "snpeff", "genome_build"],
["sv_coverage_rec"]],
s("batch_for_sv", "multi-batch", sv_batch_inputs,
[cwlout("sv_batch_rec", "record")],
"bcbio-vc",
unlist=[["config", "algorithm", "svcaller"]]),
Expand Down
22 changes: 20 additions & 2 deletions bcbio/heterogeneity/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"""
from __future__ import print_function
import collections
import os

from bcbio import utils
from bcbio.heterogeneity import bubbletree, phylowgs, theta
Expand All @@ -23,14 +24,31 @@ def _get_calls(data, cnv_only=False):
out[sv["variantcaller"]] = sv
return out

def get_variants(data):
def get_variants(data, include_germline=False):
"""Retrieve set of variant calls to use for heterogeneity analysis.
"""
data = utils.deepish_copy(data)
supported = ["precalled", "vardict", "vardict-java", "vardict-perl",
"strelka2", "mutect2", "freebayes", "mutect"]
if include_germline:
supported.insert(1, "gatk-haplotype")
out = []
# CWL based input
if isinstance(data.get("variants"), dict) and "samples" in data["variants"]:
cur_vs = []
# Unpack single sample list of files
if (isinstance(data["variants"]["samples"], (list, tuple)) and
len(data["variants"]["samples"]) == 1 and isinstance(data["variants"]["samples"][0], (list, tuple))):
data["variants"]["samples"] = data["variants"]["samples"][0]
for fname in data["variants"]["samples"]:
variantcaller = utils.splitext_plus(os.path.basename(fname))[0]
variantcaller = variantcaller.replace(dd.get_sample_name(data) + "-", "")
for batch in dd.get_batches(data):
variantcaller = variantcaller.replace(batch + "-", "")
cur_vs.append({"vrn_file": fname, "variantcaller": variantcaller})
data["variants"] = cur_vs
for v in data.get("variants", []):
if v["variantcaller"] in supported:
if v["variantcaller"] in supported and v.get("vrn_file"):
out.append((supported.index(v["variantcaller"]), v))
out.sort()
return [xs[1] for xs in out]
Expand Down
1 change: 1 addition & 0 deletions bcbio/structural/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def batch_for_sv(samples):
CWL input target -- groups samples into batches and structural variant
callers for parallel processing.
"""
samples = cwlutils.assign_complex_to_samples(samples)
to_process, extras, background = _batch_split_by_sv(samples, "standard")
out = [cwlutils.samples_to_records(xs) for xs in to_process.values()] + extras
return out
Expand Down
19 changes: 9 additions & 10 deletions bcbio/structural/cnvkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import numpy as np
import toolz as tz

from bcbio import utils
from bcbio import heterogeneity, utils
from bcbio.bam import ref
from bcbio.distributed.multi import run_multicore, zeromq_aware_logging
from bcbio.distributed.transaction import file_transaction
Expand Down Expand Up @@ -496,15 +496,14 @@ def _compatible_small_variants(data, items):
VarFile = collections.namedtuple("VarFile", ["name", "sample", "normal"])
supported = set(["vardict", "freebayes", "gatk-haplotype", "strelka2", "vardict"])
out = []
for v in data.get("variants", []):
vrn_file = v.get("vrn_file")
if vrn_file and v.get("variantcaller") in supported:
base, ext = utils.splitext_plus(os.path.basename(vrn_file))
paired = vcfutils.get_paired(items)
if paired:
out.append(VarFile(vrn_file, paired.tumor_name, paired.normal_name))
else:
out.append(VarFile(vrn_file, dd.get_sample_name(data), None))
paired = vcfutils.get_paired(items)
for v in heterogeneity.get_variants(data, include_germline=not paired):
vrn_file = v["vrn_file"]
base, ext = utils.splitext_plus(os.path.basename(vrn_file))
if paired:
out.append(VarFile(vrn_file, paired.tumor_name, paired.normal_name))
else:
out.append(VarFile(vrn_file, dd.get_sample_name(data), None))
return out

def _add_variantcalls_to_output(out, data, items, is_somatic=False):
Expand Down
7 changes: 4 additions & 3 deletions bcbio/variation/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ def _group_validate_samples(samples, vkey, batch_keys):
if data.get(vkey):
is_v = True
for variant in data.get("variants", []):
if variant.get(vkey):
if isinstance(variant, dict) and variant.get(vkey):
is_v = True
if is_v:
for batch_key in batch_keys:
Expand Down Expand Up @@ -561,7 +561,8 @@ def summarize_grading(samples, vkey="validate"):
plot_data = []
plot_files = []
for data in sorted(vitems, key=lambda x: x.get("lane", dd.get_sample_name(x))):
validations = [variant.get(vkey) for variant in data.get("variants", [])]
validations = [variant.get(vkey) for variant in data.get("variants", [])
if isinstance(variant, dict)]
validations = [v for v in validations if v]
if len(validations) == 0 and vkey in data:
validations = [data.get(vkey)]
Expand All @@ -588,7 +589,7 @@ def summarize_grading(samples, vkey="validate"):
if data.get(vkey):
data[vkey]["grading_plots"] = plots
for variant in data.get("variants", []):
if variant.get(vkey):
if isinstance(variant, dict) and variant.get(vkey):
variant[vkey]["grading_plots"] = plots
out.append([data])
return out
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from setuptools import setup, find_packages

version = "1.1.0"
version = "1.1.1a0"

def write_version_py():
version_py = os.path.join(os.path.dirname(__file__), 'bcbio', 'pipeline',
Expand Down

0 comments on commit ed98597

Please sign in to comment.