Merge pull request #32 from sanger-tol/cov

add vecscreen test set on ci, checkin coverage related scripts, updat…
sanger-tol · Oct 12, 2023 · 06db313 · 06db313
2 parents a8bdffd + b994dae
commit 06db313
Show file tree

Hide file tree

Showing 6 changed files with 105 additions and 1 deletion.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -74,6 +74,11 @@ jobs:
           mkdir diamond
           wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd
 
+      - name: Download the vecscreen test data
+        run: |
+          mkdir vecscreen
+          wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna -O vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
+
       - name: Run pipeline with test data
         # TODO nf-core: You can customise CI pipeline run tests as required
         # For example: adding multiple test runs with different parameters

diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml
@@ -16,6 +16,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
 fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
 diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
 diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
+vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
 seqkit:
   sliding: 6000
   window: 100000
diff --git a/assets/test.yaml b/assets/test.yaml
@@ -13,6 +13,9 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu
 ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
 busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages
 fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
+vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database/adaptors_for_screening_euks.fa
+diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
+diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/proteins2/nr.dmnd
 seqkit:
   sliding: 6000
   window: 100000
diff --git a/bin/sam_to_sorted_indexed_bam.py b/bin/sam_to_sorted_indexed_bam.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+"""
+Script conversion of .sam file with mapped reads to sorted and indexed .bam file
+"""
+# MIT License
+#
+# Copyright (c) 2020-2021 Genome Research Ltd.
+#
+# Author: Eerik Aunin ([email protected])
+#
+# This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import general_purpose_functions as gpf
+import argparse
+
+
+def main(sam_file, threads, fasta_path):
+    sam_file_extensionless_path = sam_file.split(".sam")[0]
+    bam_file = sam_file_extensionless_path + ".bam"
+    sorted_bam_file = sam_file_extensionless_path + "_sorted.bam"
+    sam_to_bam_command = None
+    if fasta_path == "":
+        sam_to_bam_command = "samtools view -Sb -@ {} {} > {}".format(threads, sam_file, bam_file)
+    else:
+        sam_to_bam_command = "samtools view -T {} -Sb -@ {} {} > {}".format(fasta_path, threads, sam_file, bam_file)
+    gpf.run_system_command(sam_to_bam_command)
+    sort_bam_command = "samtools sort -@ {} {} -o {}".format(threads, bam_file, sorted_bam_file)
+    gpf.run_system_command(sort_bam_command)
+    index_bam_command = "samtools index " + sorted_bam_file
+    gpf.run_system_command(index_bam_command)
+    remove_sam_command = "rm " + sam_file
+    gpf.run_system_command(remove_sam_command)
+    remove_unsorted_bam_command = "rm " + bam_file
+    gpf.run_system_command(remove_unsorted_bam_command)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("sam_file", type=str, help="Path to input SAM file")
+    parser.add_argument("--threads", type=int, help="Number of threads (default: 1)", default=1)
+    parser.add_argument(
+        "--fasta_path",
+        type=str,
+        help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped",
+        default="",
+    )
+    args = parser.parse_args()
+    main(args.sam_file, args.threads, args.fasta_path)
diff --git a/bin/samtools_depth_average_coverage.py b/bin/samtools_depth_average_coverage.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+"""
+Script for finding the average coverage of each scaffold in a SAMtools depth output file
+"""
+
+import sys
+import general_purpose_functions as gpf
+
+in_path = sys.argv[1]
+in_data = gpf.ll(in_path)
+scaffs_dict = dict()
+
+for line in in_data:
+    split_line = line.split()
+    scaff_name = split_line[0]
+    coverage = int(split_line[2])
+    if scaff_name in scaffs_dict:
+        scaffs_dict[scaff_name]["coverage_sum"] += coverage
+        scaffs_dict[scaff_name]["scaff_len"] += 1
+    else:
+        scaffs_dict[scaff_name] = dict()
+        scaffs_dict[scaff_name]["coverage_sum"] = coverage
+        scaffs_dict[scaff_name]["scaff_len"] = 1
+
+for scaff_name in scaffs_dict:
+    average_coverage = scaffs_dict[scaff_name]["coverage_sum"] / scaffs_dict[scaff_name]["scaff_len"]
+    print(scaff_name + "," + str(average_coverage))
diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf
@@ -35,9 +35,10 @@ workflow YAML_INPUT {
                 ncbi_taxonomy_path:                             ( data.ncbi_taxonomy_path               )
                 ncbi_rankedlineage_path:                        ( data.ncbi_rankedlineage_path          )
                 busco_lineages_folder:                          ( data.busco_lineages_folder            )
-                seqkit_values                       :           ( data.seqkit                           )
+                seqkit_values:                                  ( data.seqkit                           )
                 diamond_uniprot_database_path:                  ( data.diamond_uniprot_database_path    )
                 diamond_nr_database_path:                       ( data.diamond_nr_database_path         )
+                vecscreen_database_path:                        ( data.vecscreen_database_path          )
 
         }
         .set{ group }
@@ -82,6 +83,7 @@ workflow YAML_INPUT {
     fcs_gx_database_path             = group.fcs_gx_database_path
     diamond_uniprot_database_path    = group.diamond_uniprot_database_path
     diamond_nr_database_path         = group.diamond_nr_database_path
+    vecscreen_database_path          = group.vecscreen_database_path
     seqkit_sliding                   = seqkit.sliding_value
     seqkit_window                    = seqkit.window_value
     versions                         = ch_versions.ifEmpty(null)