CNV: support background inputs for CNVkit, GATK4 CNV and seq2c
- Allow a pre-computed panel of normals for tumor-only or single-sample CNV
  calling across all methods (see the configuration sketch below).
- CNVkit background preparation fixes for correctly setting target and
  antitarget without overlaps.
- GATK4 CNV extraction of BED regions from panel of normals.
- seq2c support for a panel of normals with combined mapping and coverage
  inputs from the background.
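
A minimal sketch of how the new background input might be declared and looked up, mirroring the `background: cnv_reference` keys and the `get_background_cnv_reference` helper added in `bcbio/pipeline/datadict.py` below. The paths and values are illustrative only; the reference can be a single file shared by all callers or a per-caller mapping:

```python
# Illustrative configuration only: paths are placeholders. The nested
# config -> algorithm -> background -> cnv_reference layout mirrors the lookup
# added in bcbio/pipeline/datadict.py in this commit.
data = {"config": {"algorithm": {"background": {
    "cnv_reference": {"cnvkit": "/panels/background-cnvkit.cnn",
                      "gatk-cnv": "/panels/background-gatk-pon.hdf5",
                      "seq2c": "/panels/background-seq2c.tsv"}}}}}

def get_background_cnv_reference(data, caller):
    # Standalone version of the datadict helper: accept either a plain path
    # (used by every caller) or a dict keyed by caller name.
    out = data.get("config", {}).get("algorithm", {}).get("background", {}).get("cnv_reference")
    if out:
        return out.get(caller) if isinstance(out, dict) else out

print(get_background_cnv_reference(data, "gatk-cnv"))  # /panels/background-gatk-pon.hdf5
```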
chapmanb committed Dec 13, 2018
1 parent 1daf8e1 commit be17357
Showing 8 changed files with 116 additions and 22 deletions.
5 changes: 5 additions & 0 deletions HISTORY.md
@@ -1,3 +1,8 @@
## 1.1.3 (in progress)

- CNV: support background inputs for CNVkit, GATK4 CNV and seq2c. Allows
pre-computed panel of normals for tumor-only or single sample CNV calling.

## 1.1.2 (12 December 2018)

- VarDict low frequency somatic filters: generalize strand and mismatch based
6 changes: 5 additions & 1 deletion bcbio/pipeline/datadict.py
@@ -76,7 +76,6 @@
"realign": {"keys": ['config', 'algorithm', 'realign'], "default": False},
"ensemble": {"keys": ["config", "algorithm", "ensemble"], "default": {}},
"background_variant": {"keys": ["config", "algorithm", "background", "variant"]},
"background_cnv_reference": {"keys": ["config", "algorithm", "background", "cnv_reference"]},
"peakcaller": {"keys": ['config', 'algorithm', 'peakcaller'], "default": []},
"chip_method": {"keys": ['config', 'algorithm', 'chip_method'], "default": "chip"},
"spikein_counts": {"keys": ["spikein_counts"]},
@@ -203,6 +202,11 @@
"cwl_reporting": {"keys": ["config", "algorithm", "cwl_reporting"]},
}

def get_background_cnv_reference(data, caller):
out = tz.get_in(["config", "algorithm", "background", "cnv_reference"], data)
if out:
return out.get(caller) if isinstance(out, dict) else out

def get_batches(data):
batches = get_batch(data)
if batches:
8 changes: 7 additions & 1 deletion bcbio/pipeline/run_info.py
@@ -377,7 +377,13 @@ def _clean_background(data):
elif isinstance(val, dict):
for k, v in val.items():
if k in allowed_keys:
out[k] = _file_to_abs(v, [os.getcwd()])
if isinstance(v, basestring):
out[k] = _file_to_abs(v, [os.getcwd()])
else:
assert isinstance(v, dict)
for ik, iv in v.items():
v[ik] = _file_to_abs(iv, [os.getcwd()])
out[k] = v
else:
errors.append("Unexpected key: %s" % k)
else:
7 changes: 6 additions & 1 deletion bcbio/structural/cnvkit.py
@@ -459,7 +459,12 @@ def targets_w_bins(cnv_file, access_file, target_anti_fn, work_dir, data):
if not os.path.exists(anti_file):
_, anti_bin = target_anti_fn()
with file_transaction(data, anti_file) as tx_out_file:
cmd = [_get_cmd(), "antitarget", "-g", access_file, cnv_file, "-o", tx_out_file,
# Create an access file with the targets removed to avoid overlap;
# CNVkit's antitarget is meant to do this itself but appears to not always
# exclude targets after chromosome 1
tx_access_file = os.path.join(os.path.dirname(tx_out_file), os.path.basename(access_file))
pybedtools.BedTool(access_file).subtract(cnv_file).saveas(tx_access_file)
cmd = [_get_cmd(), "antitarget", "-g", tx_access_file, cnv_file, "-o", tx_out_file,
"--avg-size", str(anti_bin)]
do.run(_prep_cmd(cmd, tx_out_file), "CNVkit antitarget")
return target_file, anti_file
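
A standalone sketch of the overlap fix in `targets_w_bins` above: subtracting the target regions from the access file before running `antitarget` means the resulting antitarget bins cannot overlap targets. The intervals below are toy values; this assumes pybedtools with a working bedtools installation.

```python
import pybedtools

# Toy inputs for illustration: accessible genome regions and capture targets.
access = pybedtools.BedTool("chr1\t0\t1000000\nchr2\t0\t500000\n", from_string=True)
targets = pybedtools.BedTool("chr1\t100000\t105000\nchr2\t200000\t210000\n", from_string=True)

# Carve the targets out of the accessible regions, as the commit does before
# invoking "cnvkit.py antitarget", so antitarget bins cannot cover target bases.
access_no_targets = access.subtract(targets).saveas("access-notargets.bed")
print(access_no_targets)
```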
20 changes: 19 additions & 1 deletion bcbio/structural/gatkcnv.py
@@ -148,6 +148,23 @@ def create_panel_of_normals(items, group_id, work_dir):
_run_with_memory_scaling(params, tx_out_file, items[0])
return out_file

def pon_to_bed(pon_file, out_dir, data):
"""Extract BED intervals from a GATK4 hdf5 panel of normal file.
"""
out_file = os.path.join(out_dir, "%s-intervals.bed" % (utils.splitext_plus(os.path.basename(pon_file))[0]))
if not utils.file_uptodate(out_file, pon_file):
import h5py
with file_transaction(data, out_file) as tx_out_file:
with h5py.File(pon_file, "r") as f:
with open(tx_out_file, "w") as out_handle:
intervals = f["original_data"]["intervals"]
for i in range(len(intervals["transposed_index_start_end"][0])):
chrom = intervals["indexed_contig_names"][intervals["transposed_index_start_end"][0][i]]
start = int(intervals["transposed_index_start_end"][1][i]) - 1
end = int(intervals["transposed_index_start_end"][2][i])
out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
return out_file

def prepare_intervals(data, region_file, work_dir):
"""Prepare interval regions for targeted and gene based regions.
"""
@@ -298,12 +315,13 @@ def _seg_to_vcf(vals):
"""Convert GATK CNV calls seg output to a VCF line.
"""
call_to_cn = {"+": 3, "-": 1}
call_to_type = {"+": "<DUP>", "-": "<DEL>"}
if vals["CALL"] not in ["0"]:
info = ["FOLD_CHANGE_LOG=%s" % vals["MEAN_LOG2_COPY_RATIO"],
"PROBES=%s" % vals["NUM_POINTS_COPY_RATIO"],
"END=%s" % vals["END"],
"CN=%s" % call_to_cn[vals["CALL"]]]
return [vals["CONTIG"], vals["START"], ".", "N", "<CNV>", ".",
return [vals["CONTIG"], vals["START"], ".", "N", call_to_type[vals["CALL"]], ".",
";".join(info), "GT", "0/1"]

def _sv_workdir(data):
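
For reference, a small sketch of reading the interval layout that `pon_to_bed` above depends on. The `original_data/intervals` group and its dataset names are assumptions taken from the code in this commit rather than a documented GATK4 API, and the file name is a placeholder.

```python
import h5py

# Peek at the first few panel-of-normals intervals as 0-based BED records.
with h5py.File("panel_of_normals.hdf5", "r") as f:
    intervals = f["original_data"]["intervals"]
    contigs = intervals["indexed_contig_names"][:]           # lookup table of contig names
    idx_start_end = intervals["transposed_index_start_end"]  # rows: contig index, start, end
    for i in range(min(5, idx_start_end.shape[1])):
        chrom = contigs[int(idx_start_end[0][i])]
        chrom = chrom.decode() if isinstance(chrom, bytes) else chrom
        print("%s\t%d\t%d" % (chrom, int(idx_start_end[1][i]) - 1, int(idx_start_end[2][i])))
```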
26 changes: 18 additions & 8 deletions bcbio/structural/regions.py
@@ -60,16 +60,19 @@ def calculate_sv_bins(*items):
def _calculate_sv_bins_gatk(data, cnv_group, size_calc_fn):
"""Calculate structural variant bins using GATK4 CNV callers region or even intervals approach.
"""
target_bed = gatkcnv.prepare_intervals(data, cnv_group.region_file, cnv_group.work_dir)
if dd.get_background_cnv_reference(data, "gatk-cnv"):
target_bed = gatkcnv.pon_to_bed(dd.get_background_cnv_reference(data, "gatk-cnv"), cnv_group.work_dir, data)
else:
target_bed = gatkcnv.prepare_intervals(data, cnv_group.region_file, cnv_group.work_dir)
gc_annotated_tsv = gatkcnv.annotate_intervals(target_bed, data)
return target_bed, None, gc_annotated_tsv

def _calculate_sv_bins_cnvkit(data, cnv_group, size_calc_fn):
"""Calculate structural variant bins using target/anti-target approach from CNVkit.
"""
from bcbio.structural import cnvkit
if dd.get_background_cnv_reference(data):
target_bed, anti_bed = cnvkit.targets_from_background(dd.get_background_cnv_reference(data),
if dd.get_background_cnv_reference(data, "cnvkit"):
target_bed, anti_bed = cnvkit.targets_from_background(dd.get_background_cnv_reference(data, "cnvkit"),
cnv_group.work_dir, data)
else:
target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file,
@@ -208,9 +211,8 @@ def _calculate_sv_coverage_cnvkit(data, work_dir):
target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
out_anti_file = _add_log2_depth(anti_cov.regions, out_anti_file, data)
return out_target_file, out_anti_file

def _add_log2_depth(in_file, out_file, data):
@@ -228,7 +230,7 @@ def _add_log2_depth(in_file, out_file, data):
# Handle inputs unannotated with gene names
if len(parts) == 5:
chrom, start, end, orig_name, depth = parts
gene_name = "."
gene_name = orig_name if (orig_name in ["Antitarget", "Background"]) else "."
else:
assert len(parts) == 6, parts
chrom, start, end, orig_name, depth, gene_name = parts
@@ -270,7 +272,15 @@ def normalize_sv_coverage(*items):
def _normalize_sv_coverage_gatk(group_id, inputs, backgrounds, work_dir, back_files, out_files):
"""Normalize CNV coverage using panel of normals with GATK's de-noise approaches.
"""
pon = gatkcnv.create_panel_of_normals(backgrounds, group_id, work_dir) if backgrounds else None
input_backs = set(filter(lambda x: x is not None,
[dd.get_background_cnv_reference(d, "gatk-cnv") for d in inputs]))
if input_backs:
assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
pon = list(input_backs)[0]
elif backgrounds:
pon = gatkcnv.create_panel_of_normals(backgrounds, group_id, work_dir)
else:
pon = None
for data in inputs:
work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
dd.get_sample_name(data), "bins"))
@@ -295,7 +305,7 @@ def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_
target_bed = tz.get_in(["depth", "bins", "target"], d)
antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
input_backs = set(filter(lambda x: x is not None,
[dd.get_background_cnv_reference(d) for d in inputs]))
[dd.get_background_cnv_reference(d, "cnvkit") for d in inputs]))
if input_backs:
assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
back_file = list(input_backs)[0]
64 changes: 55 additions & 9 deletions bcbio/structural/seq2c.py
@@ -61,18 +61,28 @@ def run(items):
items = [utils.to_single_data(x) for x in items]
work_dir = _sv_workdir(items[0])

coverage_file = _combine_coverages(items, work_dir)
read_mapping_file = _calculate_mapping_reads(items, work_dir)

normal_names = [dd.get_sample_name(x) for x in items if population.get_affected_status(x) == 1]
input_backs = list(set(filter(lambda x: x is not None,
[dd.get_background_cnv_reference(d, "seq2c") for d in items])))
coverage_file = _combine_coverages(items, work_dir, input_backs)
read_mapping_file = _calculate_mapping_reads(items, work_dir, input_backs)
normal_names = []
if input_backs:
with open(input_backs[0]) as in_handle:
for line in in_handle:
if len(line.split()) == 2:
normal_names.append(line.split()[0])
normal_names += [dd.get_sample_name(x) for x in items if population.get_affected_status(x) == 1]
seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
items = _split_cnv(items, seq2c_calls_file, read_mapping_file, coverage_file)
return items

def prep_seq2c_bed(data):
"""Selecting the bed file, cleaning, and properly annotating for Seq2C
"""
bed_file = regions.get_sv_bed(data)
if dd.get_background_cnv_reference(data, "seq2c"):
bed_file = _background_to_bed(dd.get_background_cnv_reference(data, "seq2c"), data)
else:
bed_file = regions.get_sv_bed(data)
if bed_file:
bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
else:
@@ -103,6 +113,26 @@ def prep_seq2c_bed(data):

return ready_file

def _background_to_bed(back_file, data):
"""Convert a seq2c background file with calls into BED regions for coverage.
seq2c background files are a concatenation of mapping and sample_coverages from
potentially multiple samples. We use the coverage information from the first
sample to translate into BED.
"""
out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep")),
"%s-regions.bed" % utils.splitext_plus(os.path.basename(back_file))[0])
if not utils.file_exists(out_file):
with file_transaction(data, out_file) as tx_out_file:
with open(back_file) as in_handle:
with open(tx_out_file, "w") as out_handle:
sample = in_handle.readline().split("\t")[0]
for line in in_handle:
if line.startswith(sample) and len(line.split()) >= 5:
_, gene, chrom, start, end = line.split()[:5]
out_handle.write("%s\n" % ("\t".join([chrom, str(int(start) - 1), end, gene])))
return out_file

def _get_seq2c_options(data):
"""Get adjustable, through resources, or default options for seq2c.
"""
@@ -135,7 +165,7 @@ def _call_cnv(items, work_dir, read_mapping_file, coverage_file, control_sample_
with file_transaction(items[0], output_fpath) as tx_out_file:
export = utils.local_path_export()
cmd = ("{export} {cov2lr} -a {cov2lr_opt} {read_mapping_file} {coverage_file} | " +
"{lr2gene} {lr2gene_opt} > {output_fpath}")
"{lr2gene} {lr2gene_opt} > {tx_out_file}")
do.run(cmd.format(**locals()), "Seq2C CNV calling")
return output_fpath

@@ -268,8 +298,10 @@ def _depth_to_seq2cov(input_fpath, output_fpath, sample_name):
out.write('\t'.join(fs) + '\n')
return output_fpath

def _combine_coverages(items, work_dir):
def _combine_coverages(items, work_dir, input_backs=None):
"""Combine coverage cnns calculated for individual inputs into single file.
Optionally moves over pre-calculated coverage samples from a background file.
"""
out_file = os.path.join(work_dir, "sample_coverages.txt")
if not utils.file_exists(out_file):
@@ -282,10 +314,18 @@ def _combine_coverages(items, work_dir):
cov_file = svouts[0]["coverage"]
with open(cov_file) as cov_f:
out_f.write(cov_f.read())
if input_backs:
for input_back in input_backs:
with open(input_back) as in_handle:
for line in in_handle:
if len(line.split()) >= 4:
out_f.write(line)
return out_file

def _calculate_mapping_reads(items, work_dir):
def _calculate_mapping_reads(items, work_dir, input_backs=None):
"""Calculate read counts from samtools idxstats for each sample.
Optionally moves over pre-calculated mapping counts from a background file.
"""
out_file = os.path.join(work_dir, "mapping_reads.txt")
if not utils.file_exists(out_file):
@@ -299,7 +339,13 @@ def _calculate_mapping_reads(items, work_dir):
lines.append("%s\t%s" % (dd.get_sample_name(data), count))
with file_transaction(items[0], out_file) as tx_out_file:
with open(tx_out_file, "w") as out_handle:
out_handle.write("\n".join(lines))
out_handle.write("\n".join(lines) + "\n")
if input_backs:
for input_back in input_backs:
with open(input_back) as in_handle:
for line in in_handle:
if len(line.split()) == 2:
out_handle.write(line)
return out_file

# def _count_mapped_reads(data, work_dir, bed_file, bam_file):
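
For illustration, the seq2c changes above treat the background file as a concatenation of two record types: read-mapping lines with exactly two whitespace-separated fields (sample, mapped read count) and per-region coverage lines with four or more fields (sample, gene, chrom, start, end, ...). Below is a minimal sketch of splitting such a file; the helper and file name are hypothetical, not part of bcbio.

```python
def split_seq2c_background(back_file):
    """Split a combined seq2c background file into mapping and coverage lines.

    Hypothetical helper: mirrors how run(), _combine_coverages() and
    _calculate_mapping_reads() in this commit distinguish the two record
    types by the number of whitespace-separated fields.
    """
    mapping, coverage = [], []
    with open(back_file) as in_handle:
        for line in in_handle:
            fields = line.split()
            if len(fields) == 2:      # sample<TAB>mapped_read_count
                mapping.append(line)
            elif len(fields) >= 4:    # sample, gene, chrom, start, end, ...
                coverage.append(line)
    return mapping, coverage

mapping_lines, coverage_lines = split_seq2c_background("background-seq2c.tsv")
```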
2 changes: 1 addition & 1 deletion setup.py
@@ -4,7 +4,7 @@
import os
from setuptools import setup, find_packages

version = "1.1.2"
version = "1.1.3a0"

def write_version_py():
version_py = os.path.join(os.path.dirname(__file__), 'bcbio', 'pipeline',
