diff --git a/bcbio/cwl/defs.py b/bcbio/cwl/defs.py index 22c3860f7..26f88c079 100644 --- a/bcbio/cwl/defs.py +++ b/bcbio/cwl/defs.py @@ -516,6 +516,7 @@ def _variant_sv(checkpoints): ["config", "algorithm", "svprioritize"], ["config", "algorithm", "svvalidate"], ["regions", "sample_callable"], ["genome_resources", "variation", "gc_profile"], + ["genome_resources", "variation", "germline_het_pon"], ["genome_resources", "aliases", "snpeff"], ["reference", "snpeff", "genome_build"], ["sv_coverage_rec"]] if checkpoints.get("vc"): diff --git a/bcbio/pipeline/genome.py b/bcbio/pipeline/genome.py index 99b75ed8c..ba7b8b732 100644 --- a/bcbio/pipeline/genome.py +++ b/bcbio/pipeline/genome.py @@ -46,6 +46,7 @@ def add_required_resources(resources): required = [["variation", "cosmic"], ["variation", "clinvar"], ["variation", "dbsnp"], ["variation", "lcr"], ["variation", "polyx"], ["variation", "encode_blacklist"], ["variation", "gc_profile"], + ["variation", "germline_het_pon"], ["variation", "train_hapmap"], ["variation", "train_indels"], ["variation", "editing"], ["variation", "exac"], ["variation", "esp"], ["variation", "1000g"]] diff --git a/bcbio/structural/gatkcnv.py b/bcbio/structural/gatkcnv.py new file mode 100644 index 000000000..c1b617fe3 --- /dev/null +++ b/bcbio/structural/gatkcnv.py @@ -0,0 +1,40 @@ +"""Support for Copy Number Variations (CNVs) with GATK4 + +https://software.broadinstitute.org/gatk/documentation/article?id=11682 +https://gatkforums.broadinstitute.org/dsde/discussion/11683/ +""" +import os + +import toolz as tz + +from bcbio import broad, utils +from bcbio.distributed.transaction import file_transaction +from bcbio.pipeline import datadict as dd +from bcbio.variation import bedutils + +def heterogzygote_counts(paired): + """Provide tumor/normal counts at population heterozyogte sites with CollectAllelicCounts. + """ + work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(paired.tumor_data), "structural", "counts")) + key = "germline_het_pon" + het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data) + vr = bedutils.population_variant_regions([x for x in [paired.tumor_data, paired.normal_data] if x]) + cur_het_bed = bedutils.intersect_two(het_bed, vr, work_dir, paired.tumor_data) + tumor_counts = _run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.tumor_data) + normal_counts = _run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.normal_data) + return tumor_counts, normal_counts + +def _run_collect_allelic_counts(pos_file, pos_name, work_dir, data): + """Counts by alleles for a specific sample and set of positions. + """ + out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", "counts")) + out_file = os.path.join(out_dir, "%s-%s-counts.tsv" % (dd.get_sample_name(data), pos_name)) + if not utils.file_exists(out_file): + with file_transaction(data, out_file) as tx_out_file: + params = ["-T", "CollectAllelicCounts", "-L", pos_file, "-I", dd.get_align_bam(data), + "-R", dd.get_ref_file(data), "-O", tx_out_file] + num_cores = dd.get_num_cores(data) + memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None + broad_runner = broad.runner_from_config(data["config"]) + broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale) + return out_file diff --git a/bcbio/structural/purple.py b/bcbio/structural/purple.py index c01ae89f5..8daed5bfa 100644 --- a/bcbio/structural/purple.py +++ b/bcbio/structural/purple.py @@ -16,6 +16,7 @@ from bcbio.pipeline import datadict as dd from bcbio.provenance import do from bcbio.variation import vcfutils +from bcbio.structural import gatkcnv def run(items): paired = vcfutils.get_paired(items) @@ -25,7 +26,7 @@ def run(items): return items work_dir = _sv_workdir(paired.tumor_data) from bcbio import heterogeneity - het_file = _amber_het_file(heterogeneity.get_variants(paired.tumor_data), work_dir, paired) + het_file = _amber_het_file("pon", heterogeneity.get_variants(paired.tumor_data), work_dir, paired) depth_file = _run_cobalt(paired, work_dir) purple_out = _run_purple(paired, het_file, depth_file, work_dir) out = [] @@ -158,20 +159,29 @@ def write_row(self, rec, stats): stats["normal"]["freq"], _normalize_baf(stats["normal"]["freq"]), stats["normal"]["depth"]]) -def _amber_het_file(vrn_files, work_dir, paired): +def _amber_het_file(method, vrn_files, work_dir, paired): """Create file of BAFs in normal heterozygous positions compatible with AMBER. + Two available methods: + - pon -- Use panel of normals with likely heterozygous sites. + - variants -- Use pre-existing variant calls, filtered to likely heterozygotes. + https://github.com/hartwigmedical/hmftools/tree/master/amber https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf """ assert vrn_files, "Did not find compatible variant calling files for TitanCNA inputs" from bcbio.heterogeneity import bubbletree - prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"], - work_dir, paired, AmberWriter) - amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber")) - out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data)) - utils.symlink_plus(prep_file, out_file) + if method == "variants": + amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber")) + out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data)) + prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"], + work_dir, paired, AmberWriter) + utils.symlink_plus(prep_file, out_file) + else: + assert method == "pon" + tumor_counts, normal_counts = gatkcnv.heterogzygote_counts(paired) + out_file = _count_files_to_amber(tumor_counts, normal_counts, work_dir, paired.tumor_data) pcf_file = out_file + ".pcf" if not utils.file_exists(pcf_file): with file_transaction(paired.tumor_data, pcf_file) as tx_out_file: diff --git a/bcbio/structural/regions.py b/bcbio/structural/regions.py index 063093224..2010a60a0 100644 --- a/bcbio/structural/regions.py +++ b/bcbio/structural/regions.py @@ -47,7 +47,7 @@ def calculate_sv_bins(*items): cnv_group.work_dir, data) else: target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file, - size_calc_fn, cnv_group.work_dir, data) + size_calc_fn, cnv_group.work_dir, data) if not data.get("regions"): data["regions"] = {} data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed, "group": str(i)} diff --git a/setup.py b/setup.py index c4433db91..dbbeeea12 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import os from setuptools import setup, find_packages -version = "1.1.1" +version = "1.1.2a0" def write_version_py(): version_py = os.path.join(os.path.dirname(__file__), 'bcbio', 'pipeline',