diff --git a/HISTORY.md b/HISTORY.md index f532532d7..f0dd02cda 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,8 @@ +## 1.1.3 (in progress) + +- CNV: support background inputs for CNVkit, GATK4 CNV and seq2c. Allows + pre-computed panel of normals for tumor-only or single sample CNV calling. + ## 1.1.2 (12 December 2018) - VarDict low frequency somatic filters: generalize strand and mismatch based diff --git a/bcbio/pipeline/datadict.py b/bcbio/pipeline/datadict.py index 7146d637d..b2ffa980c 100644 --- a/bcbio/pipeline/datadict.py +++ b/bcbio/pipeline/datadict.py @@ -76,7 +76,6 @@ "realign": {"keys": ['config', 'algorithm', 'realign'], "default": False}, "ensemble": {"keys": ["config", "algorithm", "ensemble"], "default": {}}, "background_variant": {"keys": ["config", "algorithm", "background", "variant"]}, - "background_cnv_reference": {"keys": ["config", "algorithm", "background", "cnv_reference"]}, "peakcaller": {"keys": ['config', 'algorithm', 'peakcaller'], "default": []}, "chip_method": {"keys": ['config', 'algorithm', 'chip_method'], "default": "chip"}, "spikein_counts": {"keys": ["spikein_counts"]}, @@ -203,6 +202,11 @@ "cwl_reporting": {"keys": ["config", "algorithm", "cwl_reporting"]}, } +def get_background_cnv_reference(data, caller): + out = tz.get_in(["config", "algorithm", "background", "cnv_reference"], data) + if out: + return out.get(caller) if isinstance(out, dict) else out + def get_batches(data): batches = get_batch(data) if batches: diff --git a/bcbio/pipeline/run_info.py b/bcbio/pipeline/run_info.py index 7abbd960c..8718d10aa 100644 --- a/bcbio/pipeline/run_info.py +++ b/bcbio/pipeline/run_info.py @@ -377,7 +377,13 @@ def _clean_background(data): elif isinstance(val, dict): for k, v in val.items(): if k in allowed_keys: - out[k] = _file_to_abs(v, [os.getcwd()]) + if isinstance(v, basestring): + out[k] = _file_to_abs(v, [os.getcwd()]) + else: + assert isinstance(v, dict) + for ik, iv in v.items(): + v[ik] = _file_to_abs(iv, [os.getcwd()]) + out[k] = v else: errors.append("Unexpected key: %s" % k) else: diff --git a/bcbio/structural/cnvkit.py b/bcbio/structural/cnvkit.py index 040cfadbd..c064559a2 100644 --- a/bcbio/structural/cnvkit.py +++ b/bcbio/structural/cnvkit.py @@ -459,7 +459,12 @@ def targets_w_bins(cnv_file, access_file, target_anti_fn, work_dir, data): if not os.path.exists(anti_file): _, anti_bin = target_anti_fn() with file_transaction(data, anti_file) as tx_out_file: - cmd = [_get_cmd(), "antitarget", "-g", access_file, cnv_file, "-o", tx_out_file, + # Create access file without targets to avoid overlap + # antitarget in cnvkit is meant to do this but appears to not always happen + # after chromosome 1 + tx_access_file = os.path.join(os.path.dirname(tx_out_file), os.path.basename(access_file)) + pybedtools.BedTool(access_file).subtract(cnv_file).saveas(tx_access_file) + cmd = [_get_cmd(), "antitarget", "-g", tx_access_file, cnv_file, "-o", tx_out_file, "--avg-size", str(anti_bin)] do.run(_prep_cmd(cmd, tx_out_file), "CNVkit antitarget") return target_file, anti_file diff --git a/bcbio/structural/gatkcnv.py b/bcbio/structural/gatkcnv.py index f81325872..328bd64c6 100644 --- a/bcbio/structural/gatkcnv.py +++ b/bcbio/structural/gatkcnv.py @@ -148,6 +148,23 @@ def create_panel_of_normals(items, group_id, work_dir): _run_with_memory_scaling(params, tx_out_file, items[0]) return out_file +def pon_to_bed(pon_file, out_dir, data): + """Extract BED intervals from a GATK4 hdf5 panel of normal file. + """ + out_file = os.path.join(out_dir, "%s-intervals.bed" % (utils.splitext_plus(os.path.basename(pon_file))[0])) + if not utils.file_uptodate(out_file, pon_file): + import h5py + with file_transaction(data, out_file) as tx_out_file: + with h5py.File(pon_file, "r") as f: + with open(tx_out_file, "w") as out_handle: + intervals = f["original_data"]["intervals"] + for i in range(len(intervals["transposed_index_start_end"][0])): + chrom = intervals["indexed_contig_names"][intervals["transposed_index_start_end"][0][i]] + start = int(intervals["transposed_index_start_end"][1][i]) - 1 + end = int(intervals["transposed_index_start_end"][2][i]) + out_handle.write("%s\t%s\t%s\n" % (chrom, start, end)) + return out_file + def prepare_intervals(data, region_file, work_dir): """Prepare interval regions for targeted and gene based regions. """ @@ -298,12 +315,13 @@ def _seg_to_vcf(vals): """Convert GATK CNV calls seg output to a VCF line. """ call_to_cn = {"+": 3, "-": 1} + call_to_type = {"+": "", "-": ""} if vals["CALL"] not in ["0"]: info = ["FOLD_CHANGE_LOG=%s" % vals["MEAN_LOG2_COPY_RATIO"], "PROBES=%s" % vals["NUM_POINTS_COPY_RATIO"], "END=%s" % vals["END"], "CN=%s" % call_to_cn[vals["CALL"]]] - return [vals["CONTIG"], vals["START"], ".", "N", "", ".", + return [vals["CONTIG"], vals["START"], ".", "N", call_to_type[vals["CALL"]], ".", ";".join(info), "GT", "0/1"] def _sv_workdir(data): diff --git a/bcbio/structural/regions.py b/bcbio/structural/regions.py index cbc04ba04..cb9b59047 100644 --- a/bcbio/structural/regions.py +++ b/bcbio/structural/regions.py @@ -60,7 +60,10 @@ def calculate_sv_bins(*items): def _calculate_sv_bins_gatk(data, cnv_group, size_calc_fn): """Calculate structural variant bins using GATK4 CNV callers region or even intervals approach. """ - target_bed = gatkcnv.prepare_intervals(data, cnv_group.region_file, cnv_group.work_dir) + if dd.get_background_cnv_reference(data, "gatk-cnv"): + target_bed = gatkcnv.pon_to_bed(dd.get_background_cnv_reference(data, "gatk-cnv"), cnv_group.work_dir, data) + else: + target_bed = gatkcnv.prepare_intervals(data, cnv_group.region_file, cnv_group.work_dir) gc_annotated_tsv = gatkcnv.annotate_intervals(target_bed, data) return target_bed, None, gc_annotated_tsv @@ -68,8 +71,8 @@ def _calculate_sv_bins_cnvkit(data, cnv_group, size_calc_fn): """Calculate structural variant bins using target/anti-target approach from CNVkit. """ from bcbio.structural import cnvkit - if dd.get_background_cnv_reference(data): - target_bed, anti_bed = cnvkit.targets_from_background(dd.get_background_cnv_reference(data), + if dd.get_background_cnv_reference(data, "cnvkit"): + target_bed, anti_bed = cnvkit.targets_from_background(dd.get_background_cnv_reference(data, "cnvkit"), cnv_group.work_dir, data) else: target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file, @@ -208,9 +211,8 @@ def _calculate_sv_coverage_cnvkit(data, work_dir): target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data)) anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data)) target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0) - anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0) out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data) - out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data) + out_anti_file = _add_log2_depth(anti_cov.regions, out_anti_file, data) return out_target_file, out_anti_file def _add_log2_depth(in_file, out_file, data): @@ -228,7 +230,7 @@ def _add_log2_depth(in_file, out_file, data): # Handle inputs unannotated with gene names if len(parts) == 5: chrom, start, end, orig_name, depth = parts - gene_name = "." + gene_name = orig_name if (orig_name in ["Antitarget", "Background"]) else "." else: assert len(parts) == 6, parts chrom, start, end, orig_name, depth, gene_name = parts @@ -270,7 +272,15 @@ def normalize_sv_coverage(*items): def _normalize_sv_coverage_gatk(group_id, inputs, backgrounds, work_dir, back_files, out_files): """Normalize CNV coverage using panel of normals with GATK's de-noise approaches. """ - pon = gatkcnv.create_panel_of_normals(backgrounds, group_id, work_dir) if backgrounds else None + input_backs = set(filter(lambda x: x is not None, + [dd.get_background_cnv_reference(d, "gatk-cnv") for d in inputs])) + if input_backs: + assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs) + pon = list(input_backs)[0] + elif backgrounds: + pon = gatkcnv.create_panel_of_normals(backgrounds, group_id, work_dir) + else: + pon = None for data in inputs: work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins")) @@ -295,7 +305,7 @@ def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_ target_bed = tz.get_in(["depth", "bins", "target"], d) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d) input_backs = set(filter(lambda x: x is not None, - [dd.get_background_cnv_reference(d) for d in inputs])) + [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs])) if input_backs: assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs) back_file = list(input_backs)[0] diff --git a/bcbio/structural/seq2c.py b/bcbio/structural/seq2c.py index 593232a46..b4a95c2e6 100644 --- a/bcbio/structural/seq2c.py +++ b/bcbio/structural/seq2c.py @@ -61,10 +61,17 @@ def run(items): items = [utils.to_single_data(x) for x in items] work_dir = _sv_workdir(items[0]) - coverage_file = _combine_coverages(items, work_dir) - read_mapping_file = _calculate_mapping_reads(items, work_dir) - - normal_names = [dd.get_sample_name(x) for x in items if population.get_affected_status(x) == 1] + input_backs = list(set(filter(lambda x: x is not None, + [dd.get_background_cnv_reference(d, "seq2c") for d in items]))) + coverage_file = _combine_coverages(items, work_dir, input_backs) + read_mapping_file = _calculate_mapping_reads(items, work_dir, input_backs) + normal_names = [] + if input_backs: + with open(input_backs[0]) as in_handle: + for line in in_handle: + if len(line.split()) == 2: + normal_names.append(line.split()[0]) + normal_names += [dd.get_sample_name(x) for x in items if population.get_affected_status(x) == 1] seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names) items = _split_cnv(items, seq2c_calls_file, read_mapping_file, coverage_file) return items @@ -72,7 +79,10 @@ def run(items): def prep_seq2c_bed(data): """Selecting the bed file, cleaning, and properly annotating for Seq2C """ - bed_file = regions.get_sv_bed(data) + if dd.get_background_cnv_reference(data, "seq2c"): + bed_file = _background_to_bed(dd.get_background_cnv_reference(data, "seq2c"), data) + else: + bed_file = regions.get_sv_bed(data) if bed_file: bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-") else: @@ -103,6 +113,26 @@ def prep_seq2c_bed(data): return ready_file +def _background_to_bed(back_file, data): + """Convert a seq2c background file with calls into BED regions for coverage. + + seq2c background files are a concatenation of mapping and sample_coverages from + potentially multiple samples. We use the coverage information from the first + sample to translate into BED. + """ + out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep")), + "%s-regions.bed" % utils.splitext_plus(os.path.basename(back_file))[0]) + if not utils.file_exists(out_file): + with file_transaction(data, out_file) as tx_out_file: + with open(back_file) as in_handle: + with open(tx_out_file, "w") as out_handle: + sample = in_handle.readline().split("\t")[0] + for line in in_handle: + if line.startswith(sample) and len(line.split()) >= 5: + _, gene, chrom, start, end = line.split()[:5] + out_handle.write("%s\n" % ("\t".join([chrom, str(int(start) - 1), end, gene]))) + return out_file + def _get_seq2c_options(data): """Get adjustable, through resources, or default options for seq2c. """ @@ -135,7 +165,7 @@ def _call_cnv(items, work_dir, read_mapping_file, coverage_file, control_sample_ with file_transaction(items[0], output_fpath) as tx_out_file: export = utils.local_path_export() cmd = ("{export} {cov2lr} -a {cov2lr_opt} {read_mapping_file} {coverage_file} | " + - "{lr2gene} {lr2gene_opt} > {output_fpath}") + "{lr2gene} {lr2gene_opt} > {tx_out_file}") do.run(cmd.format(**locals()), "Seq2C CNV calling") return output_fpath @@ -268,8 +298,10 @@ def _depth_to_seq2cov(input_fpath, output_fpath, sample_name): out.write('\t'.join(fs) + '\n') return output_fpath -def _combine_coverages(items, work_dir): +def _combine_coverages(items, work_dir, input_backs=None): """Combine coverage cnns calculated for individual inputs into single file. + + Optionally moves over pre-calculated coverage samples from a background file. """ out_file = os.path.join(work_dir, "sample_coverages.txt") if not utils.file_exists(out_file): @@ -282,10 +314,18 @@ def _combine_coverages(items, work_dir): cov_file = svouts[0]["coverage"] with open(cov_file) as cov_f: out_f.write(cov_f.read()) + if input_backs: + for input_back in input_backs: + with open(input_back) as in_handle: + for line in in_handle: + if len(line.split()) >= 4: + out_f.write(line) return out_file -def _calculate_mapping_reads(items, work_dir): +def _calculate_mapping_reads(items, work_dir, input_backs=None): """Calculate read counts from samtools idxstats for each sample. + + Optionally moves over pre-calculated mapping counts from a background file. """ out_file = os.path.join(work_dir, "mapping_reads.txt") if not utils.file_exists(out_file): @@ -299,7 +339,13 @@ def _calculate_mapping_reads(items, work_dir): lines.append("%s\t%s" % (dd.get_sample_name(data), count)) with file_transaction(items[0], out_file) as tx_out_file: with open(tx_out_file, "w") as out_handle: - out_handle.write("\n".join(lines)) + out_handle.write("\n".join(lines) + "\n") + if input_backs: + for input_back in input_backs: + with open(input_back) as in_handle: + for line in in_handle: + if len(line.split()) == 2: + out_handle.write(line) return out_file # def _count_mapped_reads(data, work_dir, bed_file, bam_file): diff --git a/setup.py b/setup.py index 94101149d..d369a1ccc 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import os from setuptools import setup, find_packages -version = "1.1.2" +version = "1.1.3a0" def write_version_py(): version_py = os.path.join(os.path.dirname(__file__), 'bcbio', 'pipeline',