From 8a466ca72c81c60418e14189a69d94b3a87ea0e4 Mon Sep 17 00:00:00 2001 From: priesgof Date: Mon, 21 Jun 2021 22:02:34 +0200 Subject: [PATCH 1/3] filter out variant calls with ambiguous bases --- bin/assembly_variant_caller.py | 14 +++++++++++--- bin/test_assembly_variant_caller.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/bin/assembly_variant_caller.py b/bin/assembly_variant_caller.py index 36c42d1..a020d59 100755 --- a/bin/assembly_variant_caller.py +++ b/bin/assembly_variant_caller.py @@ -6,6 +6,8 @@ from Bio.Align import PairwiseAlignment from typing import List +from Bio.Alphabet import IUPAC + CHROMOSOME = "MN908947.3" @@ -57,7 +59,7 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]: # deletion if ref_start - prev_ref_end <= 50: # skips deletions longer than 50 bp ref = reference[prev_ref_end - 1: ref_start] - if 'N' not in ref: # do not call deletions with Ns + if not any(self._is_ambiguous_base(r) for r in ref): # do not call deletions with Ns variants.append(Variant( position=prev_ref_end - 1, reference=ref, @@ -67,7 +69,8 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]: if alt_start - prev_alt_end <= 50: # skips insertions longer than 50 bp ref = reference[prev_ref_end - 1] alt = alternate[prev_alt_end:alt_start] - if ref != 'N' and 'N' not in alt: # do not call insertions with Ns + # do not call insertions with ambiguous bases + if not self._is_ambiguous_base(ref) and not any(self._is_ambiguous_base(a) for a in alt): variants.append(Variant( position=prev_ref_end - 1, reference=ref, @@ -77,7 +80,8 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]: for pos, ref, alt in zip( range(ref_start, ref_end), reference[ref_start: ref_end], alternate[alt_start: alt_end]): # contiguous SNVs are reported separately - if ref != alt and ref != 'N' and alt != 'N': # do not call SNVs on Ns + # do not call insertions with ambiguous bases + if ref != alt and not self._is_ambiguous_base(ref) and not self._is_ambiguous_base(alt): variants.append(Variant(position=pos, reference=ref, alternate=alt)) prev_ref_end = ref_end @@ -85,6 +89,10 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]: return variants + def _is_ambiguous_base(self, base): + return base not in IUPAC.unambiguous_dna.letters + + def write_vcf(mutations, output_vcf): with open(output_vcf, "w") as vcf_out: diff --git a/bin/test_assembly_variant_caller.py b/bin/test_assembly_variant_caller.py index 4cb1b12..2f87600 100644 --- a/bin/test_assembly_variant_caller.py +++ b/bin/test_assembly_variant_caller.py @@ -49,3 +49,17 @@ def test_assembly_variant_caller(self): self.assertEqual(snv.alternate, "GT") self.assertEqual(snv.position, 9) + def test_ambiguous_bases(self): + caller = AssemblyVariantCaller() + # no mutations + variants = caller.call_variants(sequence="ACGTACGT", reference="ACGTACGT") + self.assertEqual(len(variants), 0) + # ambiguous calls + variants = caller.call_variants(sequence="ACGTWCGT", reference="ACGTACGT") + self.assertEqual(len(variants), 0) + variants = caller.call_variants(sequence="ACGTNCGT", reference="ACGTACGT") + self.assertEqual(len(variants), 0) + variants = caller.call_variants(sequence="ACGTAAAZRCCCCGT", reference="ACGTACGT") + self.assertEqual(len(variants), 0) + + From c6683a5bfe9bccd30de0407bc264dd0a90806745 Mon Sep 17 00:00:00 2001 From: priesgof Date: Mon, 21 Jun 2021 22:51:47 +0200 Subject: [PATCH 2/3] upgrade biopython to 1.79 tp avoid a memory leak --- bin/assembly_variant_caller.py | 3 +-- environment.yml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/assembly_variant_caller.py b/bin/assembly_variant_caller.py index a020d59..31792f0 100755 --- a/bin/assembly_variant_caller.py +++ b/bin/assembly_variant_caller.py @@ -6,7 +6,6 @@ from Bio.Align import PairwiseAlignment from typing import List -from Bio.Alphabet import IUPAC CHROMOSOME = "MN908947.3" @@ -90,7 +89,7 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]: return variants def _is_ambiguous_base(self, base): - return base not in IUPAC.unambiguous_dna.letters + return base not in "ACGT" diff --git a/environment.yml b/environment.yml index 9d0e4ea..0cf8b2a 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ channels: - defaults dependencies: - conda-forge::python=3.8.5 - - conda-forge::biopython=1.76 + - conda-forge::biopython=1.79 - bioconda::nextflow=21.04.0 - bioconda::bcftools=1.12 - bioconda::lofreq=2.1.5 From a6000910a3c153948f679992f7854afeaee7e638 Mon Sep 17 00:00:00 2001 From: priesgof Date: Tue, 22 Jun 2021 20:27:08 +0200 Subject: [PATCH 3/3] implement variant normalization within pipeline to avoid another call to nextflow --- README.md | 22 ---------------------- bin/assembly_variant_caller.py | 11 +++++------ environment.yml | 3 ++- main.nf | 27 +++++++++++++++------------ nextflow.config | 1 - 5 files changed, 22 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index d1bfa15..97fff6b 100644 --- a/README.md +++ b/README.md @@ -114,25 +114,3 @@ Output: - Wilm, A., Aw, P. P. K., Bertrand, D., Yeo, G. H. T., Ong, S. H., Wong, C. H., Khor, C. C., Petric, R., Hibberd, M. L., & Nagarajan, N. (2012). LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Research, 40(22), 11189–11201. https://doi.org/10.1093/nar/gks918 - Grubaugh, N. D., Gangavarapu, K., Quick, J., Matteson, N. L., De Jesus, J. G., Main, B. J., Tan, A. L., Paul, L. M., Brackney, D. E., Grewal, S., Gurfield, N., Van Rompay, K. K. A., Isern, S., Michael, S. F., Coffey, L. L., Loman, N. J., & Andersen, K. G. (2019). An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar. Genome Biology, 20(1), 8. https://doi.org/10.1186/s13059-018-1618-7 - Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560 - - -## Known issues - -The variant normalization workflow may appear as corrupt when calling to it concurrently. -A workaround to this situation is to clone the tronflow dependencies and let covigator pipeline know of the location of the `main.nf` files. - -For instance: -``` -cd /covigator/dependencies -git clone --branch v1.4.1 https://github.com/TRON-Bioinformatics/tronflow-bwa.git -git clone --branch v1.5.0 https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing.git -git clone --branch v1.1.1 https://github.com/TRON-Bioinformatics/tronflow-variant-normalization.git -``` - -And then use the following parameters: -``` -nextflow run tron-bioinformatics/covigator-ngs-pipeline -profile conda \ ---tronflow_bwa /covigator/dependencies/tronflow-bwa/main.nf \ ---tronflow_bam_preprocessing /covigator/dependencies/tronflow-bam-preprocessing/main.nf \ ---tronflow_variant_normalization /covigator/dependencies/tronflow-variant-normalization/main.nf ... -``` \ No newline at end of file diff --git a/bin/assembly_variant_caller.py b/bin/assembly_variant_caller.py index 31792f0..104c11a 100755 --- a/bin/assembly_variant_caller.py +++ b/bin/assembly_variant_caller.py @@ -18,7 +18,7 @@ class Variant: def to_vcf_line(self): # transform 0-based position to 1-based position - return CHROMOSOME, str(self.position + 1), ".", self.reference, self.alternate, ".", "PASS", "." + return [CHROMOSOME, str(self.position + 1), ".", self.reference, self.alternate, ".", "PASS", "."] class AssemblyVariantCaller: @@ -57,7 +57,7 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]: if prev_ref_end is not None and prev_ref_end != ref_start: # deletion if ref_start - prev_ref_end <= 50: # skips deletions longer than 50 bp - ref = reference[prev_ref_end - 1: ref_start] + ref = str(reference[prev_ref_end - 1: ref_start]) if not any(self._is_ambiguous_base(r) for r in ref): # do not call deletions with Ns variants.append(Variant( position=prev_ref_end - 1, @@ -67,7 +67,7 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]: # insertion if alt_start - prev_alt_end <= 50: # skips insertions longer than 50 bp ref = reference[prev_ref_end - 1] - alt = alternate[prev_alt_end:alt_start] + alt = str(alternate[prev_alt_end:alt_start]) # do not call insertions with ambiguous bases if not self._is_ambiguous_base(ref) and not any(self._is_ambiguous_base(a) for a in alt): variants.append(Variant( @@ -92,7 +92,6 @@ def _is_ambiguous_base(self, base): return base not in "ACGT" - def write_vcf(mutations, output_vcf): with open(output_vcf, "w") as vcf_out: header = ( @@ -103,8 +102,8 @@ def write_vcf(mutations, output_vcf): ) for row in header: vcf_out.write(row + "\n") - for row in mutations: - vcf_out.write("\t".join(row.to_vcf_line()) + "\n") + for mutation in mutations: + vcf_out.write("\t".join(mutation.to_vcf_line()) + "\n") def main(): diff --git a/environment.yml b/environment.yml index 0cf8b2a..1165c7d 100644 --- a/environment.yml +++ b/environment.yml @@ -14,4 +14,5 @@ dependencies: - bioconda::gatk4=4.2.0.0 - bioconda::ivar=1.3.1 - bioconda::fastp=0.20.1 - - bioconda::snpeff=5.0 \ No newline at end of file + - bioconda::snpeff=5.0 + - bioconda::vt=0.57721 \ No newline at end of file diff --git a/main.nf b/main.nf index 0471e9e..dc4486a 100755 --- a/main.nf +++ b/main.nf @@ -411,20 +411,23 @@ process variantNormalization { set name, file(vcf) from vcfs_to_normalize output: - set name, file("${vcf.baseName}.normalized.vcf") into normalized_vcf_files + set name, file("${vcf.baseName}.normalized.vcf") into normalized_vcf_files + script: + """ + # initial sort of the VCF + bcftools sort ${vcf} | \ + + # checks reference genome, decompose multiallelics, trim and left align indels + bcftools norm --multiallelics -any --check-ref e --fasta-ref ${params.reference} \ + --old-rec-tag OLD_CLUMPED - | \ + + # decompose complex variants + vt decompose_blocksub -a -p - | \ + + # remove duplicates after normalisation + bcftools norm --rm-dup exact -o ${vcf.baseName}.normalized.vcf - """ - # --input_files needs to be forced, otherwise it is inherited from profile in tests - nextflow run ${params.tronflow_variant_normalization} \ - --input_vcf ${vcf} \ - --input_files false \ - --output . \ - --reference ${reference} \ - -profile ${workflow.profile} \ - -work-dir ${workflow.workDir} - - mv ${vcf.baseName}/${vcf.baseName}.normalized.vcf ${vcf.baseName}.normalized.vcf - """ } process variantAnnotation { diff --git a/nextflow.config b/nextflow.config index 3529bfe..dbbdb2a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -6,7 +6,6 @@ params.tronflow_bwa = "tron-bioinformatics/tronflow-bwa -r v1.4.1" params.tronflow_bam_preprocessing = "tron-bioinformatics/tronflow-bam-preprocessing -r v1.5.0" -params.tronflow_variant_normalization = "tron-bioinformatics/tronflow-variant-normalization -r v1.1.1" params.reference = "$baseDir/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa" params.gff = "$baseDir/reference/Sars_cov_2.ASM985889v3.101.gff3"