From 9950a69dca3a2ad71e5b8404c4b9bc997fc006e1 Mon Sep 17 00:00:00 2001
From: AitorOP <aitor.pesetaop@gmail.com>
Date: Fri, 16 Aug 2024 09:08:38 +0200
Subject: [PATCH] Friederike requests

---
 README.md                                     |  4 +-
 conf/modules/lofreq.config                    |  1 +
 docs/output.md                                | 35 +++++++++---------
 nextflow_schema.json                          |  2 +-
 .../main.nf                                   |  3 ++
 tests/config/pytesttags.yml                   | 12 ++++++
 tests/test_lofreq.yml                         | 37 +++++++++++++++++++
 7 files changed, 74 insertions(+), 20 deletions(-)
 create mode 100644 tests/test_lofreq.yml
diff --git a/README.md b/README.md
index fe4daca409..9fdbb504c8 100644
--- a/README.md
+++ b/README.md
@@ -42,9 +42,9 @@ It's listed on [Elixir - Tools and Data Services Registry](https://bio.tools/nf-
 Depending on the options and samples provided, the pipeline can currently perform the following:
 
 - Form consensus reads from UMI sequences (`fgbio`)
-- Sequencing quality control and trimming (enabled by `--trim_fastq`) (`FastQC`, `fastp`,`bedtools`)
+- Sequencing quality control and trimming (enabled by `--trim_fastq`) (`FastQC`, `fastp`)
 - Map Reads to Reference (`BWA-mem`, `BWA-mem2`, `dragmap` or `Sentieon BWA-mem`)
-- Process BAM file (`GATK MarkDuplicates`, `GATK BaseRecalibrator` and `GATK ApplyBQSR` or `Sentieon LocusCollector` and `Sentieon Dedup`)
+- Process BAM file (`GATK MarkDuplicates`, `bedtools`, `GATK BaseRecalibrator` and `GATK ApplyBQSR` or `Sentieon LocusCollector` and `Sentieon Dedup`)
 - Summarise alignment statistics (`samtools stats`, `mosdepth`)
 - Variant calling (enabled by `--tools`, see [compatibility](https://nf-co.re/sarek/latest/docs/usage#which-variant-calling-tool-is-implemented-for-which-data-type)):
   - `ASCAT`
diff --git a/conf/modules/lofreq.config b/conf/modules/lofreq.config
index 107a8a75fe..21007e3bf0 100644
--- a/conf/modules/lofreq.config
+++ b/conf/modules/lofreq.config
@@ -23,6 +23,7 @@ process {
                 path: { "${params.outdir}/variant_calling/lofreq/${meta.id}/" },
                 pattern: "*{vcf.gz,vcf.gz.tbi}"
             ]
+            max_cpus: 4 // NOTE(review): placed after the closing `]`, so this is not a map entry; `name: value` is not an assignment here and `max_cpus` does not look like a process directive — presumably `cpus = 4` was intended, confirm
         }
     }
 }
diff --git a/docs/output.md b/docs/output.md
index 63c5d124d9..0b467167c2 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -16,12 +16,12 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
     - [Trim adapters](#trim-adapters)
     - [Split FastQ files](#split-fastq-files)
     - [UMI consensus](#umi-consensus)
-    - [Bedtools](#bedtools)
   - [Map to Reference](#map-to-reference)
     - [BWA](#bwa)
     - [BWA-mem2](#bwa-mem2)
     - [DragMap](#dragmap)
     - [Sentieon BWA mem](#sentieon-bwa-mem)
+    - [Bedtools](#bedtools)
   - [Mark Duplicates](#mark-duplicates)
     - [GATK MarkDuplicates (Spark)](#gatk-markduplicates-spark)
   - [Sentieon LocusCollector and Dedup](#sentieon-locuscollector-and-dedup)
@@ -160,22 +160,6 @@ These files are intermediate and by default not placed in the output-folder kept
 
 </details>
 
-#### Bedtools
-
-[Bedtools](https://github.com/arq5x/bedtools2) utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. The most widely-used tools enable genome arithmetic. Bedtools allows one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF.
-While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line.
-
-<details markdown="1">
-<summary>Output files for all samples</summary>
-
-**Output directory: `{outdir}/reports/bedtools/`**
-
-- `<sample>.bed`
-  - New .bed file with the news changes.
-  </details>
-
-</details>
-
 ### Map to Reference
 
 #### BWA
@@ -213,6 +197,23 @@ The alignment files (BAM or CRAM) produced by the chosen aligner are not publish
   - BAM file and index
   </details>
 
+#### Bedtools
+
+[Bedtools](https://github.com/arq5x/bedtools2) utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. The most widely-used tools enable genome arithmetic. Bedtools allows one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF.
+While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line.
+
+<details markdown="1">
+<summary>Output files for all samples</summary>
+
+**Output directory: `{outdir}/reports/bedtools/`**
+
+- `<sample>.bed`
+  - When applying bedtools sort to a .bed file, the lines are reordered so that the genomic regions are in ascending order according to their position in the   genome.
+  - When applying bedtools merge, overlapping or adjacent regions are combined into one, reducing redundancy and creating longer intervals that cover all the original regions.
+  </details>
+
+</details>
+
 ### Mark Duplicates
 
 During duplicate marking, read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, causing the marked pairs to be ignored by default during the variant discovery process.
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 83a2eb6825..a0e4e104de 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -112,7 +112,7 @@
                     "fa_icon": "fas fa-toolbox",
                     "description": "Tools to use for duplicate marking, variant calling and/or for annotation.",
                     "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka, Lofreq\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively), and bcftools annotate (needs `--bcftools_annotation`).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.",
-                    "pattern": "^((ascat|bcfann|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|sentieon_dnascope|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|lofreq|ngscheckmate|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(?<!,)$"
+                    "pattern": "^((ascat|bcfann|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|lofreq|sentieon_dnascope|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|ngscheckmate|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(?<!,)$"
                 },
                 "skip_tools": {
                     "type": "string",
diff --git a/subworkflows/local/bam_variant_calling_tumor_only_lofreq/main.nf b/subworkflows/local/bam_variant_calling_tumor_only_lofreq/main.nf
index 509ada601f..dc655ee4cd 100644
--- a/subworkflows/local/bam_variant_calling_tumor_only_lofreq/main.nf
+++ b/subworkflows/local/bam_variant_calling_tumor_only_lofreq/main.nf
@@ -33,7 +33,10 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_LOFREQ {
 
     vcf = Channel.empty().mix(LOFREQ.out.vcf)
         .map{ meta, vcf -> [ meta + [ variantcaller:'lofreq' ], vcf ] }
+
     versions = versions.mix(LOFREQ.out.versions)
+    versions = versions.mix(SORT_INTERVALS.out.versions)
+    versions = versions.mix(MERGE_INTERVALS.out.versions)
 
     emit:
     vcf
diff --git a/tests/config/pytesttags.yml b/tests/config/pytesttags.yml
index 63c96d73a9..a547e8337a 100644
--- a/tests/config/pytesttags.yml
+++ b/tests/config/pytesttags.yml
@@ -536,6 +536,18 @@ strelka_bp:
   - tests/csv/3.0/recalibrated_somatic.csv
   - tests/test_strelka_bp.yml
 
+## lofreq
+lofreq:
+  - conf/modules/lofreq.config
+  - modules/nf-core/bedtools/merge/**
+  - modules/nf-core/bedtools/sort/**
+  - modules/nf-core/lofreq/callparallel/**
+  - modules/nf-core/mosdepth/**
+  - subworkflows/local/bam_variant_calling_tumor_only_all/**
+  - subworkflows/local/bam_variant_calling_tumor_only_lofreq/**
+  - tests/csv/3.0/recalibrated_tumoronly.csv
+  - tests/test_lofreq.yml
+
 ## tiddit
 tiddit:
   - conf/modules/tiddit.config
diff --git a/tests/test_lofreq.yml b/tests/test_lofreq.yml
new file mode 100644
index 0000000000..49ab8e206f
--- /dev/null
+++ b/tests/test_lofreq.yml
@@ -0,0 +1,33 @@
+- name: Run variant calling on tumor only sample with lofreq
+  command: nextflow run main.nf -profile test,tools_tumoronly --tools lofreq --outdir results
+  tags:
+    - lofreq
+    - tumor_only
+    - variant_calling
+  files:
+    - path: results/csv/variantcalled.csv
+      md5sum: 5cce88d8a0961c51e15120c6cffc1de4
+    - path: results/csv/mapped.csv
+      md5sum: 85c4d7e1fed217509c3f5c9cbd93539f
+    - path: results/csv/recalibrated.csv
+      md5sum: 4251894dfed507f5b4a59b97cdea68cf
+    - path: results/multiqc
+    - path: results/sort
+    - path: results/merge
+    # md5sum omitted: conda changes md5sums for test
+    - path: results/reports/bcftools/lofreq/sample2/sample2.bcftools_stats.txt
+    - path: results/reports/vcftools/lofreq/sample2/sample2.FILTER.summary
+      md5sum: be7ff84cf917483f02a6ae28edae999d
+    # md5sum omitted: conda changes md5sums for test
+    - path: results/reports/vcftools/lofreq/sample2/sample2.TsTv.qual
+    - path: results/reports/samtools/sample2/sample2.recal.cram.stats
+      md5sum: 345e7084e5dda88fe368894d19ee50de
+    # md5sum omitted: conda changes md5sums for test
+    - path: results/reports/samtools/sample2/sample2.sorted.cram.stats
+    # md5sum omitted: binary changes md5sums on reruns
+    - path: results/variant_calling/lofreq/sample2/sample2.vcf.gz
+    - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt
+    - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt
+    - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt
+    - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz
+    - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi