Skip to content

Commit

Permalink
Merge pull request #32 from sanger-tol/cov
Browse files Browse the repository at this point in the history
add vecscreen test set on ci, checkin coverage related scripts, updat…
  • Loading branch information
yumisims authored Oct 12, 2023
2 parents a8bdffd + b994dae commit 06db313
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 1 deletion.
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ jobs:
mkdir diamond
wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd
- name: Download the vecscreen test data
run: |
mkdir vecscreen
wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna -O vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
Expand Down
1 change: 1 addition & 0 deletions assets/github_testing/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
seqkit:
sliding: 6000
window: 100000
3 changes: 3 additions & 0 deletions assets/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database/adaptors_for_screening_euks.fa
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/proteins2/nr.dmnd
seqkit:
sliding: 6000
window: 100000
66 changes: 66 additions & 0 deletions bin/sam_to_sorted_indexed_bam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env python3
"""
Script conversion of .sam file with mapped reads to sorted and indexed .bam file
"""
# MIT License
#
# Copyright (c) 2020-2021 Genome Research Ltd.
#
# Author: Eerik Aunin ([email protected])
#
# This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import general_purpose_functions as gpf
import argparse


def main(sam_file, threads, fasta_path):
sam_file_extensionless_path = sam_file.split(".sam")[0]
bam_file = sam_file_extensionless_path + ".bam"
sorted_bam_file = sam_file_extensionless_path + "_sorted.bam"
sam_to_bam_command = None
if fasta_path == "":
sam_to_bam_command = "samtools view -Sb -@ {} {} > {}".format(threads, sam_file, bam_file)
else:
sam_to_bam_command = "samtools view -T {} -Sb -@ {} {} > {}".format(fasta_path, threads, sam_file, bam_file)
gpf.run_system_command(sam_to_bam_command)
sort_bam_command = "samtools sort -@ {} {} -o {}".format(threads, bam_file, sorted_bam_file)
gpf.run_system_command(sort_bam_command)
index_bam_command = "samtools index " + sorted_bam_file
gpf.run_system_command(index_bam_command)
remove_sam_command = "rm " + sam_file
gpf.run_system_command(remove_sam_command)
remove_unsorted_bam_command = "rm " + bam_file
gpf.run_system_command(remove_unsorted_bam_command)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("sam_file", type=str, help="Path to input SAM file")
parser.add_argument("--threads", type=int, help="Number of threads (default: 1)", default=1)
parser.add_argument(
"--fasta_path",
type=str,
help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped",
default="",
)
args = parser.parse_args()
main(args.sam_file, args.threads, args.fasta_path)
27 changes: 27 additions & 0 deletions bin/samtools_depth_average_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""
Script for finding the average coverage of each scaffold in a SAMtools depth output file
"""

import sys
import general_purpose_functions as gpf

in_path = sys.argv[1]
in_data = gpf.ll(in_path)
scaffs_dict = dict()

for line in in_data:
split_line = line.split()
scaff_name = split_line[0]
coverage = int(split_line[2])
if scaff_name in scaffs_dict:
scaffs_dict[scaff_name]["coverage_sum"] += coverage
scaffs_dict[scaff_name]["scaff_len"] += 1
else:
scaffs_dict[scaff_name] = dict()
scaffs_dict[scaff_name]["coverage_sum"] = coverage
scaffs_dict[scaff_name]["scaff_len"] = 1

for scaff_name in scaffs_dict:
average_coverage = scaffs_dict[scaff_name]["coverage_sum"] / scaffs_dict[scaff_name]["scaff_len"]
print(scaff_name + "," + str(average_coverage))
4 changes: 3 additions & 1 deletion subworkflows/local/yaml_input.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ workflow YAML_INPUT {
ncbi_taxonomy_path: ( data.ncbi_taxonomy_path )
ncbi_rankedlineage_path: ( data.ncbi_rankedlineage_path )
busco_lineages_folder: ( data.busco_lineages_folder )
seqkit_values : ( data.seqkit )
seqkit_values: ( data.seqkit )
diamond_uniprot_database_path: ( data.diamond_uniprot_database_path )
diamond_nr_database_path: ( data.diamond_nr_database_path )
vecscreen_database_path: ( data.vecscreen_database_path )

}
.set{ group }
Expand Down Expand Up @@ -82,6 +83,7 @@ workflow YAML_INPUT {
fcs_gx_database_path = group.fcs_gx_database_path
diamond_uniprot_database_path = group.diamond_uniprot_database_path
diamond_nr_database_path = group.diamond_nr_database_path
vecscreen_database_path = group.vecscreen_database_path
seqkit_sliding = seqkit.sliding_value
seqkit_window = seqkit.window_value
versions = ch_versions.ifEmpty(null)
Expand Down

0 comments on commit 06db313

Please sign in to comment.