diff --git a/main.nf b/main.nf index 93beb95..34fd95f 100755 --- a/main.nf +++ b/main.nf @@ -13,6 +13,11 @@ include { VARIANT_ANNOTATION; VARIANT_SARSCOV2_ANNOTATION; VARIANT_VAF_ANNOTATION; VAFATOR } from './modules/06_variant_annotation' include { PANGOLIN_LINEAGE; VCF2FASTA } from './modules/07_lineage_annotation' include { BGZIP } from './modules/08_compress_vcf' +include { FASTQC; NANOPLOT } from './modules/09_ont_qc' +include { NANOFILT; PORECHOP; CHOPPER; PORECHOP_ABI; MINIMAP2 } from './modules/10_ont_preprocess' +include { OPTIMUS_NO_PRIME; PRIMER_SOFTCLIP } from './modules/11_ont_primer_removal' +include { NANOCALLER; CLAIR3 } from './modules/12_ont_variant_calling' + params.help= false @@ -60,6 +65,8 @@ params.input_fastqs_list = false params.input_fastas_list = false params.input_vcfs_list = false params.input_bams_list = false +params.input_ont = false + if (params.help) { log.info params.help_message @@ -239,42 +246,55 @@ workflow { bam_files = ALIGNMENT_PAIRED_END.out } else { - READ_TRIMMING_SINGLE_END(input_fastqs) - ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference) - bam_files = ALIGNMENT_SINGLE_END.out - } - BAM_PREPROCESSING(bam_files, reference) - preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams - - if (primers) { - PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) - preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam + if (params.input_ont) { + FASTQC(input_fastqs) + PORECHOP(input_fastqs) + CHOPPER(PORECHOP.out.fq) + NANOPLOT(CHOPPER.out.fq) + input_bams = MINIMAP2(CHOPPER.out.fq).bam + } + else { + READ_TRIMMING_SINGLE_END(input_fastqs) + ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference) + bam_files = ALIGNMENT_SINGLE_END.out } - COVERAGE_ANALYSIS(preprocessed_bams) + if (params.input_ont) { + preprocessed_bams = PRIMER_SOFTCLIP(OPTIMUS_NO_PRIME(input_bams).prediction) + } + else { + BAM_PREPROCESSING(bam_files, reference) + preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams + + if (primers) { + PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) + preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam + } + COVERAGE_ANALYSIS(preprocessed_bams) // variant calling vcfs_to_normalize = null - if (!params.skip_bcftools) { + if (!params.skip_bcftools && !params.input_ont) { VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out) } - if (!params.skip_lofreq) { + if (!params.skip_lofreq && !params.input_ont) { VARIANT_CALLING_LOFREQ(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out) } - if (!params.skip_gatk) { + if (!params.skip_gatk && !params.input_ont) { VARIANT_CALLING_GATK(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out) } - if (!params.skip_ivar && gff) { + if (!params.skip_ivar && gff && !params.input_ont) { VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff) IVAR2VCF(VARIANT_CALLING_IVAR.out, reference) vcfs_to_normalize = vcfs_to_normalize == null? IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out) } + } else if (input_fastas) { if (!params.skip_pangolin) { @@ -313,6 +333,10 @@ workflow { } if (preprocessed_bams) { + if (params.input_ont) { + CLAIR3(preprocessed_bams.bam) + NANOCALLER(preprocessed_bams.bam) + } // we can only add technical annotations when we have the reads VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0)) VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf) diff --git a/modules/13_ont_annotate.nf b/modules/13_ont_annotate.nf deleted file mode 100755 index 63b32b1..0000000 --- a/modules/13_ont_annotate.nf +++ /dev/null @@ -1,169 +0,0 @@ -process VAFATOR { - cpus 1 - memory "3 GB" - tag "${sampleId}" - publishDir "${params.outDir}/${sampleId}/variant_calls/${caller}", mode: "copy" - - conda (params.enable_conda ? "bioconda::vafator=1.2.5" : null) - - input: - tuple val(sampleId), path(bam), path(bai), path(vcf), val(caller) - - output: - tuple val(sampleId), path("${sampleId}_${caller}_vaf.vcf"), val(caller), emit: annotated_vcf - - script: - mq_param = params.vafator_min_mapping_quality != false ? "--mapping-quality " + params.vafator_min_mapping_quality : "" - bq_param = params.vafator_min_base_quality != false ? "--base-call-quality " + params.vafator_min_base_quality : "" - """ - vafator \ - --input-vcf ${vcf} \ - --output-vcf ${sampleId}_${caller}_vaf.vcf \ - --bam vafator ${bam} ${mq_param} ${bq_param} - """ -} - -process SNPEFF { - cpus 1 - memory params.memory - publishDir "${params.outDir}/${sampleId}/variant_calls/${caller}", mode: "copy" - tag "${sampleId}" - - conda (params.enable_conda ? "bioconda::snpeff=5.0 bioconda::samtools=1.16.1" : null) - - input: - tuple val(sampleId), path(vcf), val(caller) - val(snpeff_data) - val(snpeff_config) - val(snpeff_organism) - - output: - tuple val(sampleId), path("${sampleId}_${caller}_snpeff.vcf.gz"), emit: annotated_vcf - path("${sampleId}_${caller}_snpeff.vcf.gz.tbi") - - """ - # for some reason the snpEff.config file needs to be in the folder where snpeff runs... - cp ${snpeff_config} . - - snpEff eff -Xmx${params.memory} -dataDir ${snpeff_data} \ - -noStats -no-downstream -no-upstream -no-intergenic -no-intron -onlyProtein -hgvs1LetterAa -noShiftHgvs \ - ${snpeff_organism} ${vcf} | bgzip -c > ${sampleId}_${caller}_snpeff.vcf.gz - - tabix -p vcf ${sampleId}_${caller}_snpeff.vcf.gz - """ -} - - -process VARIANT_VAF_ANNOTATION { - cpus 1 - memory "3 GB" - publishDir "${params.outDir}", mode: "copy" - tag "${sampleId}" - - conda (params.enable_conda ? "conda-forge::gsl=2.7 bioconda::bcftools=1.14" : null) - - input: - tuple val(sampleId), path(vcf) - - output: - tuple val(sampleId), path("${sampleId}.lowfreq_subclonal.vcf"), emit: vaf_annotated - - """ - tabix -p vcf ${vcf} - - # annotates low frequency and subclonal variants - bcftools view -Ob ${vcf} | \ - bcftools filter \ - --exclude 'INFO/vafator_af < ${params.low_frequency_variant_threshold}' \ - --soft-filter LOW_FREQUENCY - | \ - bcftools filter \ - --exclude 'INFO/vafator_af >= ${params.low_frequency_variant_threshold} && INFO/vafator_af < ${params.subclonal_variant_threshold}' \ - --soft-filter SUBCLONAL \ - --output-type v - | \ - bcftools filter \ - --exclude 'INFO/vafator_af >= ${params.subclonal_variant_threshold} && INFO/vafator_af < ${params.lq_clonal_variant_threshold}' \ - --soft-filter LOW_QUALITY_CLONAL \ - --output-type v - > ${sampleId}.lowfreq_subclonal.vcf - """ -} - -/** -Add SARS-CoV-2 specific annotations: conservation, Pfam domains and problematic sites. -Also, according to problematic sites described in DeMaio et al. (2020) we filter out any variants at the beginning and -end of the genome. -*/ -process VARIANT_SARSCOV2_ANNOTATION { - cpus 1 - memory "3 GB" - publishDir "${params.output}", mode: "copy" - tag "${sampleId}" - - conda (params.enable_conda ? "conda-forge::gsl=2.7 bioconda::bcftools=1.14" : null) - - input: - tuple val(sampleId), path(vcf) - - output: - tuple val(sampleId), path("${sampleId}.annotated_sarscov2.vcf"), emit: annotated_vcfs - - """ - bcftools annotate \ - --annotations ${params.conservation_sarscov2} \ - --header-lines ${params.conservation_sarscov2_header} \ - -c CHROM,FROM,TO,CONS_HMM_SARS_COV_2 \ - --output-type z ${vcf} | \ - bcftools annotate \ - --annotations ${params.conservation_sarbecovirus} \ - --header-lines ${params.conservation_sarbecovirus_header} \ - -c CHROM,FROM,TO,CONS_HMM_SARBECOVIRUS \ - --output-type z - | \ - bcftools annotate \ - --annotations ${params.conservation_vertebrate} \ - --header-lines ${params.conservation_vertebrate_header} \ - -c CHROM,FROM,TO,CONS_HMM_VERTEBRATE_COV \ - --output-type z - | \ - bcftools annotate \ - --annotations ${params.pfam_names} \ - --header-lines ${params.pfam_names_header} \ - -c CHROM,FROM,TO,PFAM_NAME \ - --output-type z - | \ - bcftools annotate \ - --annotations ${params.pfam_descriptions} \ - --header-lines ${params.pfam_descriptions_header} \ - -c CHROM,FROM,TO,PFAM_DESCRIPTION - | \ - bcftools filter \ - --exclude 'POS <= 55 | POS >= 29804' \ - --output-type z - > annotated_sarscov2.vcf.gz - - tabix -p vcf annotated_sarscov2.vcf.gz - - bcftools annotate \ - --annotations ${params.problematic_sites} \ - --columns INFO/problematic:=FILTER annotated_sarscov2.vcf.gz > ${sampleId}.annotated_sarscov2.vcf - """ -} - -process PANGOLIN { - cpus 1 - memory "3 GB" - publishDir "${params.outDir}", mode: "copy" - tag "${sampleId}" - - conda (params.enable_conda ? "bioconda::pangolin=4.1.2" : null) - - input: - tuple val(sampleId), path(fasta) - - output: - tuple val(name), path("*pangolin.csv") - - """ - mkdir tmp - #--decompress-model - pangolin \ - ${fasta} \ - --outfile ${sampleId}.pangolin.csv \ - --tempdir ./tmp \ - --threads 1 - """ -} \ No newline at end of file