Skip to content

Commit

Permalink
PURPLE: input heterozygote variants from panel of normals
Browse files Browse the repository at this point in the history
Provide flexible ability to either use heterozygotes at defined positions
or pre-called variants as inputs to PURPLE segmentation and analysis. Uses
GATK4's CNV based approach for extracting read counts and then merges
into PURPLE formats.
  • Loading branch information
chapmanb committed Nov 7, 2018
1 parent ad3da20 commit 8d40ce7
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 9 deletions.
1 change: 1 addition & 0 deletions bcbio/cwl/defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ def _variant_sv(checkpoints):
["config", "algorithm", "svprioritize"],
["config", "algorithm", "svvalidate"], ["regions", "sample_callable"],
["genome_resources", "variation", "gc_profile"],
["genome_resources", "variation", "germline_het_pon"],
["genome_resources", "aliases", "snpeff"], ["reference", "snpeff", "genome_build"],
["sv_coverage_rec"]]
if checkpoints.get("vc"):
Expand Down
1 change: 1 addition & 0 deletions bcbio/pipeline/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def add_required_resources(resources):
required = [["variation", "cosmic"], ["variation", "clinvar"], ["variation", "dbsnp"],
["variation", "lcr"], ["variation", "polyx"],
["variation", "encode_blacklist"], ["variation", "gc_profile"],
["variation", "germline_het_pon"],
["variation", "train_hapmap"], ["variation", "train_indels"],
["variation", "editing"], ["variation", "exac"], ["variation", "esp"],
["variation", "1000g"]]
Expand Down
40 changes: 40 additions & 0 deletions bcbio/structural/gatkcnv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Support for Copy Number Variations (CNVs) with GATK4
https://software.broadinstitute.org/gatk/documentation/article?id=11682
https://gatkforums.broadinstitute.org/dsde/discussion/11683/
"""
import os

import toolz as tz

from bcbio import broad, utils
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.variation import bedutils

def heterogzygote_counts(paired):
"""Provide tumor/normal counts at population heterozyogte sites with CollectAllelicCounts.
"""
work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(paired.tumor_data), "structural", "counts"))
key = "germline_het_pon"
het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
vr = bedutils.population_variant_regions([x for x in [paired.tumor_data, paired.normal_data] if x])
cur_het_bed = bedutils.intersect_two(het_bed, vr, work_dir, paired.tumor_data)
tumor_counts = _run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.tumor_data)
normal_counts = _run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.normal_data)
return tumor_counts, normal_counts

def _run_collect_allelic_counts(pos_file, pos_name, work_dir, data):
"""Counts by alleles for a specific sample and set of positions.
"""
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", "counts"))
out_file = os.path.join(out_dir, "%s-%s-counts.tsv" % (dd.get_sample_name(data), pos_name))
if not utils.file_exists(out_file):
with file_transaction(data, out_file) as tx_out_file:
params = ["-T", "CollectAllelicCounts", "-L", pos_file, "-I", dd.get_align_bam(data),
"-R", dd.get_ref_file(data), "-O", tx_out_file]
num_cores = dd.get_num_cores(data)
memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
broad_runner = broad.runner_from_config(data["config"])
broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale)
return out_file
24 changes: 17 additions & 7 deletions bcbio/structural/purple.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.variation import vcfutils
from bcbio.structural import gatkcnv

def run(items):
paired = vcfutils.get_paired(items)
Expand All @@ -25,7 +26,7 @@ def run(items):
return items
work_dir = _sv_workdir(paired.tumor_data)
from bcbio import heterogeneity
het_file = _amber_het_file(heterogeneity.get_variants(paired.tumor_data), work_dir, paired)
het_file = _amber_het_file("pon", heterogeneity.get_variants(paired.tumor_data), work_dir, paired)
depth_file = _run_cobalt(paired, work_dir)
purple_out = _run_purple(paired, het_file, depth_file, work_dir)
out = []
Expand Down Expand Up @@ -158,20 +159,29 @@ def write_row(self, rec, stats):
stats["normal"]["freq"], _normalize_baf(stats["normal"]["freq"]),
stats["normal"]["depth"]])

def _amber_het_file(vrn_files, work_dir, paired):
def _amber_het_file(method, vrn_files, work_dir, paired):
"""Create file of BAFs in normal heterozygous positions compatible with AMBER.
Two available methods:
- pon -- Use panel of normals with likely heterozygous sites.
- variants -- Use pre-existing variant calls, filtered to likely heterozygotes.
https://github.com/hartwigmedical/hmftools/tree/master/amber
https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
"""
assert vrn_files, "Did not find compatible variant calling files for TitanCNA inputs"
from bcbio.heterogeneity import bubbletree

prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
work_dir, paired, AmberWriter)
amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
utils.symlink_plus(prep_file, out_file)
if method == "variants":
amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
work_dir, paired, AmberWriter)
utils.symlink_plus(prep_file, out_file)
else:
assert method == "pon"
tumor_counts, normal_counts = gatkcnv.heterogzygote_counts(paired)
out_file = _count_files_to_amber(tumor_counts, normal_counts, work_dir, paired.tumor_data)
pcf_file = out_file + ".pcf"
if not utils.file_exists(pcf_file):
with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
Expand Down
2 changes: 1 addition & 1 deletion bcbio/structural/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def calculate_sv_bins(*items):
cnv_group.work_dir, data)
else:
target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file,
size_calc_fn, cnv_group.work_dir, data)
size_calc_fn, cnv_group.work_dir, data)
if not data.get("regions"):
data["regions"] = {}
data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed, "group": str(i)}
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from setuptools import setup, find_packages

version = "1.1.1"
version = "1.1.2a0"

def write_version_py():
version_py = os.path.join(os.path.dirname(__file__), 'bcbio', 'pipeline',
Expand Down

0 comments on commit 8d40ce7

Please sign in to comment.