Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dp24 barcodes #34

Merged
merged 26 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion assets/github_testing/test.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
assembly_title: asccTinyTest
pacbio_multiplexing_barcode_names: ""
pacbio_barcodes: /home/runner/work/ascc/ascc/assets/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc1008_BAK8A_OA,bc1009_BAK8A_OA"
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved
pacbio_reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio/
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
Expand Down
3 changes: 2 additions & 1 deletion assets/test.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
assembly_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
assembly_title: asccTinyTest
pacbio_multiplexing_barcode_names: something
pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved
pacbio_multiplexing_barcode_names: "bc1008_BAK8A_OA,bc1009_BAK8A_OA"
pacbio_reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
Expand Down
59 changes: 35 additions & 24 deletions bin/pacbio_barcode_check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
#!/usr/bin/env python3

"""
Notes: Forces sys.exit(1) to kill pipeline
Pacbio Barcode Check
------------------------
Looks for Pacbio barcodes in ref and data.
If supplied barcodes aren't in data then pipeline dies.
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved

Originally written by Eerik Aunin @eeaunin
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved

Expand All @@ -21,7 +26,9 @@ def detect_barcodes_from_read_file_names(barcodes_fasta_path, pacbio_read_files)
barcodes_fasta_data = gpf.l(barcodes_fasta_path)
barcode_names = [n.split(">")[1] for n in barcodes_fasta_data if n.startswith(">")]
if len(barcode_names) == 0:
print("NO BARCODES, KILL PIPELINE")
sys.stderr.write(
f"Failed to read PacBio multiplexing barcode names from the specified file {barcodes_fasta_data}\n"
)
sys.exit(1)
detected_barcodes = list()
for barcode_name in barcode_names:
Expand All @@ -40,44 +47,48 @@ def check_if_barcodes_exist_in_barcodes_fasta(barcodes_list, barcodes_fasta_path
barcode_names_in_fasta = [n.split(">")[1] for n in barcodes_fasta_data if n.startswith(">")]
for barcode in barcodes_list:
if barcode not in barcode_names_in_fasta:
# sys.stderr.write(f"The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})\n")
print("NO BARCODES, KILL PIPELINE")
sys.stderr.write(
f"The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})\n"
)
sys.exit(1)

# If this print statement is reached, all user-supplied codes are present.
print("BARCODES FOUND\n")
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved


def main(barcodes_fasta_path, pacbio_read_files, pacbio_multiplexing_barcode_names):
pacbio_read_files = pacbio_read_files.split(",")

barcodes_list = []
if pacbio_multiplexing_barcode_names != "NA":
barcodes_list = pacbio_multiplexing_barcode_names.split(",")

current_script_dir = os.path.dirname(sys.argv[0])
if len(pacbio_multiplexing_barcode_names) > 0:
barcodes_list = pacbio_multiplexing_barcode_names.strip("[").strip("]").split(",")

if barcodes_fasta_path is None:
barcodes_fasta_path = f"{current_script_dir}/third_party_files/pacbio_barcode_screen/pacbio_adaptors.fa"
else:
if os.path.isfile(barcodes_fasta_path) is False:
print("NO BARCODES, KILL PIPELINE")
sys.exit(1)
if os.path.isfile(barcodes_fasta_path) is False:
sys.stderr.write(
"FASTA file with PacBio multiplexing barcode sequences ({barcodes_fasta_path}) was not found\n"
)
sys.exit(1)

if barcodes_list == []:
if len(barcodes_list) == 0:
barcodes_list = detect_barcodes_from_read_file_names(barcodes_fasta_path, pacbio_read_files)

if len(barcodes_list) == 0:
print("NO BARCODES, KILL PIPELINE")
sys.exit(1)
sys.stderr.write(
"Skipping the PacBio barcodes check, as no barcodes were specified by the user and no barcodes were found in PacBio read file names\n"
)
sys.exit(0)

check_if_barcodes_exist_in_barcodes_fasta(
barcodes_list, barcodes_fasta_path
) # This is a TRUE | FALSE check, if FALSE kill pipeline.
print("BARCODES FOUND!")
check_if_barcodes_exist_in_barcodes_fasta(barcodes_list, barcodes_fasta_path)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("barcode_fasta", type=str, help="Pacbio Barcode FASTA file")
parser.add_argument("pacbio_reads", type=str, help="Pacbio Read FASTA.gz files")
parser.add_argument("multiplex_name", type=str, help="Pacbio Multiplex Barcode Name")
parser.add_argument("-b", "--barcode_fasta", type=str, help="Pacbio Barcode FASTA file")

parser.add_argument("-p", "--pacbio_reads", type=str, help="Pacbio Read FASTA.gz files")

parser.add_argument("-m", "--multiplex_name", type=str, help="Pacbio Multiplex Barcode Name")

parser.add_argument("-v", action="version", version="1.0")
args = parser.parse_args()
main(args.barcode_fasta, args.pacbio_reads, args.multiplex_name)
Expand Down
4 changes: 4 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ process {
ext.args = 'nucleotide'
}

withName: BLAST_MAKEBLASTDB {
ext.args = { "-dbtype nucl" }
}

withName: BLAST_BLASTN {
ext.args = { "-outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1e-25 -dust yes -lcase_masking" }
}
Expand Down
23 changes: 12 additions & 11 deletions modules/local/check_barcode.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,23 @@ process CHECK_BARCODE {
'biocontainers/python:3.9' }"

input:
tuple val(meta) , path(barcodes)
tuple val(meta2) , path(pacbio_dir)
tuple val(meta3) , path(multiplex_csv)
tuple val(meta) , path(pacbio_dir)
path barcodes
val multiplex_csv

output:
stdout , emit: debarcoded
env OUTPUT , emit: result
path "versions.yml" , emit: versions

script:
def prefix = task.ext.prefix ?: "${meta.id}"
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def args = task.ext.args ?: ''
"""
pacbio_barcode_check.py \\
${barcode_fasta} \\
${pacbio_dir} \\
${multiplex_csv}
OUTPUT=\$(\\
pacbio_barcode_check.py \\
-b ${barcodes} \\
-p ${pacbio_dir} \\
-m ${multiplex_csv})

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -34,7 +35,7 @@ process CHECK_BARCODE {

stub:
"""
echo "BARCODES FOUND!"
OUTPUT="BARCODES FOUND"

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
31 changes: 16 additions & 15 deletions modules/local/filter_barcode.nf
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -2,43 +2,44 @@ process FILTER_BARCODE {
tag "${meta.id}"
label 'process_low'

conda "conda-forge::python=3.9"
conda "conda-forge::python=3.9 conda-forge::biopython=1.78"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/python:3.9' :
'biocontainers/python:3.9' }"
'https://depot.galaxyproject.org/singularity/biopython:1.78' :
'biocontainers/biopython:1.78' }"

input:
tuple val(meta), path(fasta)
tuple val(meta2), path(barcodes)
tuple val(meta) , path(fasta)
tuple val(meta2), path(blast_data)
val barcodes

output:
tuple val(meta), path( "*txt" ) , emit: debarcoded
path "versions.yml" , emit: versions
tuple val(meta), path( "*filtered.txt" ) , emit: debarcoded
path "versions.yml" , emit: versions

script:
def prefix = task.ext.prefix ?: "${meta.id}"
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}.debarcoded" // args is for `pacbio_multiplexing_barcodes_check_${meta.barcode}.txt`
def prefix = task.ext.prefix ?: "${meta.id}"
"""
filter_barcode_blast_results.py \\
--input ${fasta} \\
--barcodes ${barcodes} \\
--barcode ${barcodes} \\
--blast ${blast_data} \\
--output ${prefix}.txt
--output ${prefix}_${barcodes}_filtered.txt

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
biopython: \$(python3 -c 'import Bio; print(Bio.__version__)')
filter_barcode_blast_results: \$(filter_barcode_blast_results.py -v)
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}.debarcoded" // args is for `pacbio_multiplexing_barcodes_check_${meta.barcode}.txt`
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def barcodes = "bc1008_BAK8A_OA"
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved
"""
touch ${prefix}.txt
touch ${prefix}-${barcodes}-filtered.txt

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
2 changes: 1 addition & 1 deletion modules/local/gc_content.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ process GC_CONTENT {

output:
tuple val(meta), path( "*-gc.txt" ) , emit: txt
path "versions.yml" , emit: versions
path "versions.yml" , emit: versions

script:
def prefix = task.ext.prefix ?: "${meta.id}"
Expand Down
1 change: 1 addition & 0 deletions modules/local/get_lineage_for_kraken.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ process GET_LINEAGE_FOR_KRAKEN {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
pandas: \$(python3 -c 'import pandas; print(pandas.__version__)')
general_purpose_functions.py: \$(general_purpose_functions.py --version | cut -d' ' -f2)
get_lineage_for_kraken_results.py: \$(get_lineage_for_kraken_results.py --version | cut -d' ' -f2)
END_VERSIONS
Expand Down
1 change: 0 additions & 1 deletion subworkflows/local/extract_nt_blast.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
// MODULE IMPORT BLOCK
include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main'

include { SEQKIT_SLIDING } from '../../modules/nf-core/seqkit/sliding/main'
include { BLAST_CHUNK_TO_FULL } from '../../modules/local/blast_chunk_to_full'
include { REFORMAT_FULL_OUTFMT6 } from '../../modules/local/reformat_full_outfmt6'
Expand Down
4 changes: 1 addition & 3 deletions subworkflows/local/extract_tiara_hits.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
include { TIARA_TIARA } from '../../modules/nf-core/tiara/tiara/main'



workflow EXTRACT_TIARA_HITS {

take:
Expand All @@ -19,4 +17,4 @@ workflow EXTRACT_TIARA_HITS {
ch_tiara = TIARA_TIARA.out.classifications
versions = ch_versions.ifEmpty(null)

}
}
1 change: 1 addition & 0 deletions subworkflows/local/generate_genome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ include { GET_LARGEST_SCAFF } from '../../modules/local/get_largest_scaff'
workflow GENERATE_GENOME {
take:
to_chromsize // tuple [[meta.id], file]
barcodes

main:
ch_versions = Channel.empty()
Expand Down
102 changes: 102 additions & 0 deletions subworkflows/local/pacbio_barcode_check.nf
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
//
// PACBIO_BARCODE_CHECK IDENTIFIES LOCATIONS OF BARCODE SEQUENCES IN THE INPUT ASSEMBLY
//

//
// MODULE IMPORT BLOCK
//
include { CHECK_BARCODE } from '../../modules/local/check_barcode'
include { BLAST_MAKEBLASTDB } from '../../modules/nf-core/blast/makeblastdb'
include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn'
include { FILTER_BARCODE } from '../../modules/local/filter_barcode'

//
// SUBWORKFLOW: RUNS CHECK_BARCODE ON THE PACBIO DATA, THEN BUILDS A BLAST DB
// FROM THE BARCODE FILE, BLASTS THE REFERENCE AGAINST IT AND
// EMITS ONE FILTERED BLAST RESULT PER BARCODE NAME
//
workflow PACBIO_BARCODE_CHECK {
take:
reference_tuple // tuple [[meta.id], reference ]
pacbio_tuple // tuple [[meta.id], pacbio-files]
barcodes // tuple [[meta.id], barcode-file]
barcode_multiplex // val (csv-list-string), split on ',' further down

main:
// Collects the versions output of every module called in this subworkflow
ch_versions = Channel.empty()

//
// MODULE: CHECK FOR KNOWN BARCODES IN SAMPLE DATA
// ONLY THE FILE ELEMENT (it[1]) OF THE BARCODES TUPLE IS PASSED ON
//
CHECK_BARCODE (
pacbio_tuple,
barcodes.map{it[1]},
barcode_multiplex
)
ch_versions = ch_versions.mix(CHECK_BARCODE.out.versions)

//
// LOGIC: IN CASE THE PIPELINE MANAGES TO CONTINUE AFTER FAILING CHECK_BARCODE
DLBPointon marked this conversation as resolved.
Show resolved Hide resolved
// HERE WE ENSURE THE REST OF THE SUBWORKFLOW DOES NOT RUN
// THE RESULT STRING IS ROUTED TO `valid` WHEN IT CONTAINS 'BARCODES FOUND'
// AND TO `invalid` WHEN IT CONTAINS 'FAILED'
// NOTE(review): `invalid` IS NOT READ ANYWHERE IN THIS WORKFLOW, AND THE EMITTER OF
// 'FAILED' IS NOT VISIBLE HERE - CONFIRM WHAT CHECK_BARCODE RETURNS ON FAILURE
//
CHECK_BARCODE.out.result
.branch {
valid : it.toString().contains('BARCODES FOUND')
invalid : it.toString().contains('FAILED')
}
.set { gatekeeping }

//
// LOGIC: ENSURE THE VALID CHANNEL IS MIXED WITH THE BARCODES CHANNEL
// ACTS AS A GATEKEEPER FOR THE FLOW
// EACH VALID RESULT STRING IS COMBINED WITH THE [meta, file] BARCODES TUPLE
// AND ONLY THE FILE IS KEPT, SO ch_new_barcodes DEPENDS ON A VALID RESULT
//
gatekeeping.valid
.combine( barcodes )
.map {str, meta, file ->
file
}
.set {ch_new_barcodes}

//
// MODULE: GENERATE BLAST DB FROM THE GATED BARCODE FILE
//
BLAST_MAKEBLASTDB (
ch_new_barcodes
)
ch_versions = ch_versions.mix(BLAST_MAKEBLASTDB.out.versions)

//
// MODULE: RUN BLAST WITH THE REFERENCE AGAINST THE BARCODE BLAST DB
//
BLAST_BLASTN (
reference_tuple,
BLAST_MAKEBLASTDB.out.db
)
ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions)

//
// LOGIC: FOR I (MAPPED TO OTHER CHANNELS) IN CSV LIST RUN FILTER BLAST
// THE CSV STRING IS SPLIT ON ',' AND FLATTENED TO ONE ITEM PER BARCODE NAME,
// EACH NAME IS COMBINED WITH THE REFERENCE TUPLE AND THE BLAST OUTPUT,
// THEN multiMap SPLITS EVERY COMBINATION INTO THREE CHANNELS
//
barcode_multiplex
.map { it ->
tuple( it.split(',') )
}
.flatten()
.combine( reference_tuple )
.combine( BLAST_BLASTN.out.txt )
.multiMap { code, ref_meta, ref, blast_meta, blast ->
barcodes: code
reference: tuple( ref_meta, ref )
blastdata: tuple( blast_meta, blast )
}
.set {testing}

//
// MODULE: CREATE A FILTERED BLAST OUTPUT PER BARCODE
//
FILTER_BARCODE (
testing.reference,
testing.blastdata,
testing.barcodes
)
ch_versions = ch_versions.mix(FILTER_BARCODE.out.versions)

emit:
filtered = FILTER_BARCODE.out.debarcoded // `debarcoded` output of FILTER_BARCODE
versions = ch_versions.ifEmpty(null) // emits null if no versions were collected
}
Loading