PURPLE: input heterozygote variants from panel of normals

Provide flexible ability to either use heterozygotes at defined positions or pre-called variants as inputs to PURPLE segmentation and analysis. Uses GATK4's CNV based approach for extracting read counts and then merges into PURPLE formats.
bcbio · Nov 7, 2018 · 8d40ce7 · 8d40ce7
1 parent ad3da20
commit 8d40ce7
Show file tree

Hide file tree

Showing 6 changed files with 61 additions and 9 deletions.
diff --git a/bcbio/cwl/defs.py b/bcbio/cwl/defs.py
@@ -516,6 +516,7 @@ def _variant_sv(checkpoints):
                        ["config", "algorithm", "svprioritize"],
                        ["config", "algorithm", "svvalidate"], ["regions", "sample_callable"],
                        ["genome_resources", "variation", "gc_profile"],
+                       ["genome_resources", "variation", "germline_het_pon"],
                        ["genome_resources", "aliases", "snpeff"], ["reference", "snpeff", "genome_build"],
                        ["sv_coverage_rec"]]
     if checkpoints.get("vc"):

diff --git a/bcbio/pipeline/genome.py b/bcbio/pipeline/genome.py
@@ -46,6 +46,7 @@ def add_required_resources(resources):
     required = [["variation", "cosmic"], ["variation", "clinvar"], ["variation", "dbsnp"],
                 ["variation", "lcr"], ["variation", "polyx"],
                 ["variation", "encode_blacklist"], ["variation", "gc_profile"],
+                ["variation", "germline_het_pon"],
                 ["variation", "train_hapmap"], ["variation", "train_indels"],
                 ["variation", "editing"], ["variation", "exac"], ["variation", "esp"],
                 ["variation", "1000g"]]

diff --git a/bcbio/structural/gatkcnv.py b/bcbio/structural/gatkcnv.py
@@ -0,0 +1,40 @@
+"""Support for Copy Number Variations (CNVs) with GATK4
+
+https://software.broadinstitute.org/gatk/documentation/article?id=11682
+https://gatkforums.broadinstitute.org/dsde/discussion/11683/
+"""
+import os
+
+import toolz as tz
+
+from bcbio import broad, utils
+from bcbio.distributed.transaction import file_transaction
+from bcbio.pipeline import datadict as dd
+from bcbio.variation import bedutils
+
+def heterogzygote_counts(paired):
+    """Provide tumor/normal counts at population heterozyogte sites with CollectAllelicCounts.
+    """
+    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(paired.tumor_data), "structural", "counts"))
+    key = "germline_het_pon"
+    het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
+    vr = bedutils.population_variant_regions([x for x in [paired.tumor_data, paired.normal_data] if x])
+    cur_het_bed = bedutils.intersect_two(het_bed, vr, work_dir, paired.tumor_data)
+    tumor_counts = _run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.tumor_data)
+    normal_counts = _run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.normal_data)
+    return tumor_counts, normal_counts
+
+def _run_collect_allelic_counts(pos_file, pos_name, work_dir, data):
+    """Counts by alleles for a specific sample and set of positions.
+    """
+    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", "counts"))
+    out_file = os.path.join(out_dir, "%s-%s-counts.tsv" % (dd.get_sample_name(data), pos_name))
+    if not utils.file_exists(out_file):
+        with file_transaction(data, out_file) as tx_out_file:
+            params = ["-T", "CollectAllelicCounts", "-L", pos_file, "-I", dd.get_align_bam(data),
+                      "-R", dd.get_ref_file(data), "-O", tx_out_file]
+            num_cores = dd.get_num_cores(data)
+            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
+            broad_runner = broad.runner_from_config(data["config"])
+            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale)
+    return out_file
diff --git a/bcbio/structural/purple.py b/bcbio/structural/purple.py
@@ -16,6 +16,7 @@
 from bcbio.pipeline import datadict as dd
 from bcbio.provenance import do
 from bcbio.variation import vcfutils
+from bcbio.structural import gatkcnv
 
 def run(items):
     paired = vcfutils.get_paired(items)
@@ -25,7 +26,7 @@ def run(items):
         return items
     work_dir = _sv_workdir(paired.tumor_data)
     from bcbio import heterogeneity
-    het_file = _amber_het_file(heterogeneity.get_variants(paired.tumor_data), work_dir, paired)
+    het_file = _amber_het_file("pon", heterogeneity.get_variants(paired.tumor_data), work_dir, paired)
     depth_file = _run_cobalt(paired, work_dir)
     purple_out = _run_purple(paired, het_file, depth_file, work_dir)
     out = []
@@ -158,20 +159,29 @@ def write_row(self, rec, stats):
                                   stats["normal"]["freq"], _normalize_baf(stats["normal"]["freq"]),
                                   stats["normal"]["depth"]])
 
-def _amber_het_file(vrn_files, work_dir, paired):
+def _amber_het_file(method, vrn_files, work_dir, paired):
     """Create file of BAFs in normal heterozygous positions compatible with AMBER.
 
+    Two available methods:
+      - pon -- Use panel of normals with likely heterozygous sites.
+      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.
+
     https://github.com/hartwigmedical/hmftools/tree/master/amber
     https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
     """
     assert vrn_files, "Did not find compatible variant calling files for TitanCNA inputs"
     from bcbio.heterogeneity import bubbletree
 
-    prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
-                                         work_dir, paired, AmberWriter)
-    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
-    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
-    utils.symlink_plus(prep_file, out_file)
+    if method == "variants":
+        amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
+        out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
+        prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
+                                             work_dir, paired, AmberWriter)
+        utils.symlink_plus(prep_file, out_file)
+    else:
+        assert method == "pon"
+        tumor_counts, normal_counts = gatkcnv.heterogzygote_counts(paired)
+        out_file = _count_files_to_amber(tumor_counts, normal_counts, work_dir, paired.tumor_data)
     pcf_file = out_file + ".pcf"
     if not utils.file_exists(pcf_file):
         with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:

diff --git a/bcbio/structural/regions.py b/bcbio/structural/regions.py
@@ -47,7 +47,7 @@ def calculate_sv_bins(*items):
                                                                           cnv_group.work_dir, data)
                 else:
                     target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file,
-                                                                size_calc_fn, cnv_group.work_dir, data)
+                                                                 size_calc_fn, cnv_group.work_dir, data)
                 if not data.get("regions"):
                     data["regions"] = {}
                 data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed, "group": str(i)}

diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 import os
 from setuptools import setup, find_packages
 
-version = "1.1.1"
+version = "1.1.2a0"
 
 def write_version_py():
     version_py = os.path.join(os.path.dirname(__file__), 'bcbio', 'pipeline',