Skip to content

Commit

Permalink
fix: testing fixes for all use cases of input, add bclconvert docker
Browse files Browse the repository at this point in the history
  • Loading branch information
Ryan Routsong committed Dec 15, 2023
1 parent 83bdaf5 commit 019b201
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 95 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ The **`./weave`** pipeline is composed of two sub commands to setup and run the
**weave**'s common use is to gauge the quality of reads for potential downstream analysis. Since bioinformatic analysis requires robust and accurate data to draw scientific conclusions, this helps save time and resources when it comes to analyzing the voluminous amount of sequencing data that is collected routinely.

Several of the applications that **weave** uses to visualize and report quality metrics are:
- [Kraken](https://github.com/DerrickWood/kraken2)<sup>71</sup>, kmer analysis
- [Kraken](https://github.com/DerrickWood/kraken2)<sup>7</sup>, kmer analysis
- [Kaiju](https://bioinformatics-centre.github.io/kaiju/)<sup>4</sup>, kmer analysis
- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), fastq statistics
- [fastp](https://github.com/OpenGene/fastp)<sup>6</sup>, fastq adapter removal (trimming)
Expand Down
File renamed without changes.
9 changes: 9 additions & 0 deletions docker/bclconvert/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
FROM ubuntu:23.04
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get -qq update; apt-get -qq install pstack gdb wget vim curl alien
WORKDIR "/"
RUN wget -q https://hpc.nih.gov/~OpenOmics/weave/bcl-convert-4.2.4-2.el7.x86_64.rpm
RUN alien --scripts bcl-convert-4.2.4-2.el7.x86_64.rpm
RUN dpkg -i bcl-convert_4.2.4-3_amd64.deb
RUN mkdir /work; chmod -R 777 /work
WORKDIR "/work"
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ The **`./weave`** pipeline is composed of two sub commands to setup and run the
**weave**'s common use is to gauge the quality of reads for potential downstream analysis. Since bioinformatic analysis requires robust and accurate data to draw scientific conclusions, this helps save time and resources when it comes to analyzing the voluminous amount of sequencing data that is collected routinely.

Several of the applications that **weave** uses to visualize and report quality metrics are:
- [Kraken](https://github.com/DerrickWood/kraken2)<sup>71</sup>, kmer analysis
- [Kraken](https://github.com/DerrickWood/kraken2)<sup>7</sup>, kmer analysis
- [Kaiju](https://bioinformatics-centre.github.io/kaiju/)<sup>4</sup>, kmer analysis
- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), fastq statistics
- [fastp](https://github.com/OpenGene/fastp)<sup>6</sup>, fastq adapter removal (trimming)
Expand Down
14 changes: 9 additions & 5 deletions scripts/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ def get_all_seq_dirs(top_dir, server):


def check_if_demuxed(data_dir):
is_demuxed = False
do_demuxed = True
if Path(data_dir, 'Analysis').exists():
if list(Path(data_dir, 'Analysis').rglob('*.fastq*')):
is_demuxed = True
return is_demuxed
do_demuxed = False
return do_demuxed


def valid_run_output(output_directory, dry_run=False):
Expand All @@ -59,7 +59,11 @@ def runid2samplesheet(runid, top_dir=DIRECTORY_CONFIGS['bigsky']['seq']):
ss_path = Path(top_dir, runid)
if not ss_path.exists():
raise FileNotFoundError(f"Run directory does not exist: {ss_path}")
if Path(ss_path, f"SampleSheet_{runid}.txt").exists():
if Path(ss_path, f"SampleSheet.txt").exists():
ss_path = Path(ss_path, f"SampleSheet.txt")
elif Path(ss_path, f"SampleSheet.csv").exists():
ss_path = Path(ss_path, f"SampleSheet.csv")
elif Path(ss_path, f"SampleSheet_{runid}.txt").exists():
ss_path = Path(ss_path, f"SampleSheet_{runid}.txt")
elif Path(ss_path, f"SampleSheet_{runid}.csv").exists():
ss_path = Path(ss_path, f"SampleSheet_{runid}.csv")
Expand Down Expand Up @@ -149,7 +153,7 @@ def get_run_directories(runids, seq_dir=None, sheetname=None):
this_run_info = dict(run_id=rid)

if Path(run_p, 'SampleSheet.csv').exists():
sheet = parse_samplesheet(Path(run_p, 'SampleSheet.csv').absolute())
sheet = Path(run_p, 'SampleSheet.csv').absolute()
elif Path(run_p, f'SampleSheet_{rid}.csv').exists():
sheet = Path(run_p, f'SampleSheet_{rid}.csv').absolute()
elif Path(run_p, f'SampleSheet_{rid}.csv').exists():
Expand Down
20 changes: 8 additions & 12 deletions weave
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,21 @@ def run(args):
"Please file issue if this message is blocking: https://github.com/OpenOmics/weave/issues")
pairs = ['1', '2'] if sample_sheet.is_paired_end else ['1']


# ~~~ general run configuration ~~~
exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet))
exec_config['run_ids'].append(rundir.name)
exec_config['demux_input_dir'].append(rundir.absolute())
exec_config['sids'].append([x['sid'] for x in sample_list])
exec_config['project'].append(project_list[0])
exec_config['rnums'].append(pairs)

# ~~~ demultiplexing configuration ~~~
bcls = [x for x in Path(rundir).rglob('*.bcl.*') if not 'tmp' in str(x)]
if not bcls:
bcls = [x for x in Path(rundir).rglob('*.cbcl') if not 'tmp' in str(x)]
exec_config['sample_sheet'].append(str(sample_sheet.path))
exec_config['bcl_files'].append(bcls)
exec_config['demux_data'].append(files.check_if_demuxed(rundir))

# ~~~ QC/QA configuration ~~~
sample_sheet = run_infos['samplesheet']
exec_config['sample_sheet'].append(str(sample_sheet.path.absolute()))
exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet))
exec_config['run_ids'].append(rundir.name)
exec_config['demux_input_dir'].append(rundir.absolute())
exec_config['sids'].append([x['sid'] for x in sample_list])
exec_config['project'].append(project_list[0])
exec_config['rnums'].append(pairs)
exec_config['samples'].append(sample_list)

# ~~~ output verification ~~~
Expand Down Expand Up @@ -85,7 +81,7 @@ if __name__ == '__main__':
parser_run.add_argument('-d', '--dry-run', action='store_true',
help='Dry run the demultiplexing workflow')
parser_run.add_argument('-n', '--noqc', action='store_false',
help='Dry run the demultiplexing workflow')
help='Do not run the QC/QA portion of the workflow (Default is on)')
parser_run.add_argument('--sheetname', metavar='Sample Sheet Filename',
help='Name of the sample sheet file to look for (default is SampleSheet.csv)')
parser_run.add_argument('-l', '--local', action='store_true',
Expand Down
22 changes: 8 additions & 14 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,59 +28,53 @@ qa_qc_outputs = flatten(
[
# ~~ fastqc on untrimmed reads ~~
expand(
"{out_dir}/{rid}/{project}/{sids}/fastqc_untrimmed/{sids}_R{rnum}_001_fastqc.zip",
"{out_dir}/{project}/{sids}/fastqc_untrimmed/{sids}_R{rnum}_001_fastqc.zip",
out_dir=config["out_to"],
project=config["project"],
rid=config["run_ids"],
sids=config["sids"],
rnum=config["rnums"],
),
# ~~ fastqc on trimmed reads ~~
expand(
"{out_dir}/{rid}/{project}/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnum}_fastqc.zip",
"{out_dir}/{project}/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnum}_fastqc.zip",
out_dir=config["out_to"],
sids=config["sids"],
project=config["project"],
rid=config["run_ids"],
rnum=config["rnums"],
),
# ~~ fastp trimming metrics ~~
expand(
"{out_dir}/{rid}/{project}/{sids}/fastp/{sids}_trimmed_R{rnum}.fastq.gz",
"{out_dir}/{project}/{sids}/fastp/{sids}_trimmed_R{rnum}.fastq.gz",
out_dir=config["out_to"],
sids=config["sids"],
project=config["project"],
rid=config["run_ids"],
rnum=config["rnums"],
),
# ~~ fastq screen ~~
expand(
"{out_dir}/{rid}/{project}/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.html",
"{out_dir}/{project}/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.html",
out_dir=config["out_to"],
sids=config["sids"],
rnum=config["rnums"],
rid=config["run_ids"],
project=config["project"],
),
# kraken2
expand(
"{out_dir}/{rid}/{project}/{sids}/kraken/{sids}.tsv",
"{out_dir}/{project}/{sids}/kraken/{sids}.tsv",
out_dir=config["out_to"],
sids=config["sids"],
project=config["project"],
rid=config["run_ids"],
),
# kaiju
expand(
"{out_dir}/{rid}/{project}/{sids}/kaiju/{sids}.tsv",
"{out_dir}/{project}/{sids}/kaiju/{sids}.tsv",
out_dir=config["out_to"],
sids=config["sids"],
project=config["project"],
rid=config["run_ids"],
),
# multiqc
expand(
"{out_dir}/{rid}/{project}/multiqc/Run-{rid}-Project-{project}_multiqc_report.html",
"{out_dir}/{project}/multiqc/Run-{rid}-Project-{project}_multiqc_report.html",
out_dir=config["out_to"],
project=config["project"],
rid=config["run_ids"],
Expand Down Expand Up @@ -116,7 +110,7 @@ dragen_linker_outputs = flatten(
]
)

if not config['demux_data']:
if config['demux_data']:
if config['bclconvert']:
all_outputs = bclconvert_outputs
else:
Expand Down
30 changes: 15 additions & 15 deletions workflow/demux.smk
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ rule bcl2fastq:
params:
out_dir = config["out_to"] + "/demux",
container: config["resources"]["sif"] + "bcl2fastq.sif",
log: config["out_to"] + "/logs/bcl2fastq/" + config["run_ids"] + "_" + config["project"] + ".log",
threads: 26
resources:
mem_mb = "32g",
mem_mb = "32G",
slurm_partition = "quick",
runtime = 60*4,
tasks = 1,
Expand All @@ -49,7 +50,6 @@ rule bcl2fastq:
--fastq-compression-level 9 \
--no-lane-splitting \
-o {params.out_dir}
find . > .fqlist
touch {output.breadcrumb}
"""

Expand Down Expand Up @@ -81,7 +81,7 @@ rule bclconvert:
input:
run_dir = config['demux_input_dir'],
binary_base_calls = expand("{files}", files=config['bcl_files'] if config['bclconvert'] else demux_noop_args),
samplesheets = expand("{run}/SampleSheet.csv", run=config['demux_input_dir'] if config['bclconvert'] else demux_noop_args),
samplesheet = expand("{ss}", ss=config['sample_sheet'] if config['bclconvert'] else demux_noop_args),
runinfo = expand("{run}/RunInfo.xml", run=config['demux_input_dir'] if config['bclconvert'] else demux_noop_args),
params:
out_dir = config["out_to"] + "/demux/",
Expand All @@ -95,7 +95,6 @@ rule bclconvert:
top_unknown = expand("{out_to}/demux/Reports/Top_Unknown_Barcodes.csv", **demux_expand_args if config['bclconvert'] else demux_noop_args),
breadcrumb = expand("{out_to}/demux/.BC_DEMUX_COMPLETE", **demux_expand_args if config['bclconvert'] else demux_noop_args),
container: config["resources"]["sif"] + "weave_bclconvert_0.0.3.sif",
log: config["out_to"] + "/logs/bclconvert/" + config["run_ids"] + "_" + config["project"] + ".log",
threads: 75
resources: mem_mb = int(64e3)
shell:
Expand All @@ -104,6 +103,7 @@ rule bclconvert:
--bcl-input-directory {input.run_dir} \
--force \
--output-directory {params.out_dir} \
--sample-sheet {input.samplesheet} \
--fastq-gzip-compression-level 9 \
--bcl-sampleproject-subdirectories true \
--bcl-num-conversion-threads 24 \
Expand All @@ -117,18 +117,18 @@ rule bclconvert:

rule fastq_linker_from_dragen:
input:
read1 = expand(config["demux_input_dir"] + "/Analysis/1/Data/fastq/{full_sid}_R1_001.fastq.gz", full_sid=config["sids"]),
read2 = expand(config["demux_input_dir"] + "/Analysis/1/Data/fastq/{full_sid}_R2_001.fastq.gz", full_sid=config["sids"]),
adapter_metrics = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Adapter_Metrics.csv",
qual_metrics = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Quality_Metrics.csv",
demux_stats = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Demultiplex_Stats.csv",
read1 = expand(config["demux_input_dir"] + "/Analysis/1/Data/fastq/{full_sid}_R1_001.fastq.gz", full_sid=config["sids"]) if not config['demux_data'] else [],
read2 = expand(config["demux_input_dir"] + "/Analysis/1/Data/fastq/{full_sid}_R2_001.fastq.gz", full_sid=config["sids"]) if not config['demux_data'] else [],
adapter_metrics = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Adapter_Metrics.csv" if not config['demux_data'] else [],
qual_metrics = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Quality_Metrics.csv" if not config['demux_data'] else [],
demux_stats = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Demultiplex_Stats.csv" if not config['demux_data'] else [],
output:
out_read1 = expand(config["out_to"] + "/demux/" + config["project"] + "/{full_sid}_R1_dragen.fastq.gz", full_sid=config["sids"]),
out_read2 = expand(config["out_to"] + "/demux/" + config["project"] + "/{full_sid}_R2_dragen.fastq.gz", full_sid=config["sids"]),
breadcrumb = expand(config["out_to"] + "/demux/.breadcrumb/{full_sid}", full_sid=config["sids"]),
adapter_metrics_out = config["out_to"] + "/demux/dragen_reports/Adapter_Metrics.csv",
qual_metrics_out = config["out_to"] + "/demux/dragen_reports/Quality_Metrics.csv",
demux_stats_out = config["out_to"] + "/demux/dragen_reports/Demultiplex_Stats.csv",
out_read1 = expand(config["out_to"] + "/demux/" + config["project"] + "/{full_sid}_R1_dragen.fastq.gz", full_sid=config["sids"]) if not config['demux_data'] else [],
out_read2 = expand(config["out_to"] + "/demux/" + config["project"] + "/{full_sid}_R2_dragen.fastq.gz", full_sid=config["sids"]) if not config['demux_data'] else [],
breadcrumb = expand(config["out_to"] + "/demux/.breadcrumb/{full_sid}", full_sid=config["sids"]) if not config['demux_data'] else [],
adapter_metrics_out = config["out_to"] + "/demux/dragen_reports/Adapter_Metrics.csv" if not config['demux_data'] else [],
qual_metrics_out = config["out_to"] + "/demux/dragen_reports/Quality_Metrics.csv" if not config['demux_data'] else [],
demux_stats_out = config["out_to"] + "/demux/dragen_reports/Demultiplex_Stats.csv" if not config['demux_data'] else [],
run:
demux_dir = Path(config["out_to"], 'demux').resolve()
bc_dir = Path(demux_dir, '.breadcrumb').resolve()
Expand Down
Loading

0 comments on commit 019b201

Please sign in to comment.