Skip to content

Commit

Permalink
Merge branch 'support-assemblies' into 'master'
Browse files Browse the repository at this point in the history
Remove ambiguous variant calls + upgrade biopython library

See merge request tron/covigator-ngs-pipeline!6
  • Loading branch information
Pablo Riesgo Ferreiro committed Jun 22, 2021
2 parents 4020625 + a600091 commit 7488ef0
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 45 deletions.
22 changes: 0 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,25 +114,3 @@ Output:
- Wilm, A., Aw, P. P. K., Bertrand, D., Yeo, G. H. T., Ong, S. H., Wong, C. H., Khor, C. C., Petric, R., Hibberd, M. L., & Nagarajan, N. (2012). LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Research, 40(22), 11189–11201. https://doi.org/10.1093/nar/gks918
- Grubaugh, N. D., Gangavarapu, K., Quick, J., Matteson, N. L., De Jesus, J. G., Main, B. J., Tan, A. L., Paul, L. M., Brackney, D. E., Grewal, S., Gurfield, N., Van Rompay, K. K. A., Isern, S., Michael, S. F., Coffey, L. L., Loman, N. J., & Andersen, K. G. (2019). An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar. Genome Biology, 20(1), 8. https://doi.org/10.1186/s13059-018-1618-7
- Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560


## Known issues

The variant normalization workflow may appear as corrupt when calling to it concurrently.
A workaround to this situation is to clone the tronflow dependencies and let covigator pipeline know of the location of the `main.nf` files.

For instance:
```
cd /covigator/dependencies
git clone --branch v1.4.1 https://github.com/TRON-Bioinformatics/tronflow-bwa.git
git clone --branch v1.5.0 https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing.git
git clone --branch v1.1.1 https://github.com/TRON-Bioinformatics/tronflow-variant-normalization.git
```

And then use the following parameters:
```
nextflow run tron-bioinformatics/covigator-ngs-pipeline -profile conda \
--tronflow_bwa /covigator/dependencies/tronflow-bwa/main.nf \
--tronflow_bam_preprocessing /covigator/dependencies/tronflow-bam-preprocessing/main.nf \
--tronflow_variant_normalization /covigator/dependencies/tronflow-variant-normalization/main.nf ...
```
22 changes: 14 additions & 8 deletions bin/assembly_variant_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from Bio.Align import PairwiseAlignment
from typing import List


CHROMOSOME = "MN908947.3"


Expand All @@ -17,7 +18,7 @@ class Variant:

def to_vcf_line(self):
# transform 0-based position to 1-based position
return CHROMOSOME, str(self.position + 1), ".", self.reference, self.alternate, ".", "PASS", "."
return [CHROMOSOME, str(self.position + 1), ".", self.reference, self.alternate, ".", "PASS", "."]


class AssemblyVariantCaller:
Expand Down Expand Up @@ -56,8 +57,8 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]:
if prev_ref_end is not None and prev_ref_end != ref_start:
# deletion
if ref_start - prev_ref_end <= 50: # skips deletions longer than 50 bp
ref = reference[prev_ref_end - 1: ref_start]
if 'N' not in ref: # do not call deletions with Ns
ref = str(reference[prev_ref_end - 1: ref_start])
if not any(self._is_ambiguous_base(r) for r in ref): # do not call deletions with Ns
variants.append(Variant(
position=prev_ref_end - 1,
reference=ref,
Expand All @@ -66,8 +67,9 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]:
# insertion
if alt_start - prev_alt_end <= 50: # skips insertions longer than 50 bp
ref = reference[prev_ref_end - 1]
alt = alternate[prev_alt_end:alt_start]
if ref != 'N' and 'N' not in alt: # do not call insertions with Ns
alt = str(alternate[prev_alt_end:alt_start])
# do not call insertions with ambiguous bases
if not self._is_ambiguous_base(ref) and not any(self._is_ambiguous_base(a) for a in alt):
variants.append(Variant(
position=prev_ref_end - 1,
reference=ref,
Expand All @@ -77,14 +79,18 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]:
for pos, ref, alt in zip(
range(ref_start, ref_end), reference[ref_start: ref_end], alternate[alt_start: alt_end]):
# contiguous SNVs are reported separately
if ref != alt and ref != 'N' and alt != 'N': # do not call SNVs on Ns
# do not call insertions with ambiguous bases
if ref != alt and not self._is_ambiguous_base(ref) and not self._is_ambiguous_base(alt):
variants.append(Variant(position=pos, reference=ref, alternate=alt))

prev_ref_end = ref_end
prev_alt_end = alt_end

return variants

def _is_ambiguous_base(self, base):
return base not in "ACGT"


def write_vcf(mutations, output_vcf):
with open(output_vcf, "w") as vcf_out:
Expand All @@ -96,8 +102,8 @@ def write_vcf(mutations, output_vcf):
)
for row in header:
vcf_out.write(row + "\n")
for row in mutations:
vcf_out.write("\t".join(row.to_vcf_line()) + "\n")
for mutation in mutations:
vcf_out.write("\t".join(mutation.to_vcf_line()) + "\n")


def main():
Expand Down
14 changes: 14 additions & 0 deletions bin/test_assembly_variant_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,17 @@ def test_assembly_variant_caller(self):
self.assertEqual(snv.alternate, "GT")
self.assertEqual(snv.position, 9)

def test_ambiguous_bases(self):
caller = AssemblyVariantCaller()
# no mutations
variants = caller.call_variants(sequence="ACGTACGT", reference="ACGTACGT")
self.assertEqual(len(variants), 0)
# ambiguous calls
variants = caller.call_variants(sequence="ACGTWCGT", reference="ACGTACGT")
self.assertEqual(len(variants), 0)
variants = caller.call_variants(sequence="ACGTNCGT", reference="ACGTACGT")
self.assertEqual(len(variants), 0)
variants = caller.call_variants(sequence="ACGTAAAZRCCCCGT", reference="ACGTACGT")
self.assertEqual(len(variants), 0)


5 changes: 3 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@ channels:
- defaults
dependencies:
- conda-forge::python=3.8.5
- conda-forge::biopython=1.76
- conda-forge::biopython=1.79
- bioconda::nextflow=21.04.0
- bioconda::bcftools=1.12
- bioconda::lofreq=2.1.5
- bioconda::gatk4=4.2.0.0
- bioconda::ivar=1.3.1
- bioconda::fastp=0.20.1
- bioconda::snpeff=5.0
- bioconda::snpeff=5.0
- bioconda::vt=0.57721
27 changes: 15 additions & 12 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -411,20 +411,23 @@ process variantNormalization {
set name, file(vcf) from vcfs_to_normalize

output:
set name, file("${vcf.baseName}.normalized.vcf") into normalized_vcf_files
set name, file("${vcf.baseName}.normalized.vcf") into normalized_vcf_files

script:
"""
# initial sort of the VCF
bcftools sort ${vcf} | \
# checks reference genome, decompose multiallelics, trim and left align indels
bcftools norm --multiallelics -any --check-ref e --fasta-ref ${params.reference} \
--old-rec-tag OLD_CLUMPED - | \
# decompose complex variants
vt decompose_blocksub -a -p - | \
# remove duplicates after normalisation
bcftools norm --rm-dup exact -o ${vcf.baseName}.normalized.vcf -
"""
# --input_files needs to be forced, otherwise it is inherited from profile in tests
nextflow run ${params.tronflow_variant_normalization} \
--input_vcf ${vcf} \
--input_files false \
--output . \
--reference ${reference} \
-profile ${workflow.profile} \
-work-dir ${workflow.workDir}
mv ${vcf.baseName}/${vcf.baseName}.normalized.vcf ${vcf.baseName}.normalized.vcf
"""
}

process variantAnnotation {
Expand Down
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

params.tronflow_bwa = "tron-bioinformatics/tronflow-bwa -r v1.4.1"
params.tronflow_bam_preprocessing = "tron-bioinformatics/tronflow-bam-preprocessing -r v1.5.0"
params.tronflow_variant_normalization = "tron-bioinformatics/tronflow-variant-normalization -r v1.1.1"

params.reference = "$baseDir/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa"
params.gff = "$baseDir/reference/Sars_cov_2.ASM985889v3.101.gff3"
Expand Down

0 comments on commit 7488ef0

Please sign in to comment.