Skip to content

Commit

Permalink
Merge pull request #1 from TRON-Bioinformatics/support-bams
Browse files Browse the repository at this point in the history
Support bams
  • Loading branch information
Pablo Riesgo-Ferreiro authored Mar 25, 2024
2 parents 099c6d7 + 2c08937 commit 8a52bb9
Show file tree
Hide file tree
Showing 21 changed files with 1,788 additions and 61 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
/workspace.xml
work
.nextflow.log*
report.html
timeline.html
report.html*
timeline.html*
trace.txt*
trace.txt
dag.dot
.nextflow
Expand Down
27 changes: 20 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,21 @@ Nextflow (Di Tommaso, 2017) pipeline for HLA typing using HLA-HD (Kawaguchi, 201

## How to run it

Prepare an input table with the FASTQs for each sample with three tab-separated columns **without a header**
Prepare an input table with the FASTQs for each sample with three tab-separated columns **without a header** using `--input_fastqs`.

| Sample name | FASTQ1 | FASTQ2 |
|----------------------|---------------------------------|------------------------------|
| sample_1 | /path/to/sample_1.1.fq | /path/to/sample_1.2.fq |
| sample_2 | /path/to/sample_2.1.fq | /path/to/sample_2.2.fq |
| Sample name | FASTQ1 | FASTQ2 |
|----------------------|---------------------------|------------------------------|
| sample_1 | /path/to/sample_1.1.fq.gz | /path/to/sample_1.2.fq.gz |
| sample_2 | /path/to/sample_2.1.fq.gz | /path/to/sample_2.2.fq.gz |

Alternatively, provide a table with BAM files using `--input_bams`.

| Sample name | BAM |
|----------------------|-----------------------|
| sample_1 | /path/to/sample_1.bam |
| sample_2 | /path/to/sample_2.bam |

BAM files should be indexed.

Run as indicated below.

Expand All @@ -34,22 +42,27 @@ Usage:
nextflow run main.nf --input_files input_files --output output_folder
Input:
* input_files: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
* input_fastqs: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
The input file does not have header!
Example input file:
name1 fastq1.fq.gz fastq2.fq.gz
name2 fastq1.fq.gz fastq2.fq.gz
* input_bams: the path to a tab-separated values file containing in each row the sample name and BAM
The input file does not have header!
Example input file:
name1 name1.bam
name2 name2.bam
* output: output folder where results will be stored
Optional input:
* reference: the reference genome to use (default: hg38, possible values: hg38 or hg19)
* read_length: the read length (default: 50)
* hlahd_folder: the HLA-HD folder (default: /code/hlahd.1.2.0.1)
* bowtie2_folder: the bowtie2 folder (default: /code/bowtie/2.3.4.3)
* bowtie2_module: the module to load with bowtie2
* ld_library_path: the value to set in LD_LIBRARY_PATH
* cpus: the number of CPUs per sample (default: 15)
* memory: the amount of memory per sample (default: 30g)
```


Expand Down
76 changes: 29 additions & 47 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

nextflow.enable.dsl = 2

include { BAM2FASTQ } from './modules/00_bam2fastq'
include { HLAHD } from './modules/01_hlahd'


def helpMessage() {
log.info params.help_message
Expand All @@ -13,63 +16,42 @@ if (params.help) {
}

// checks required inputs
if (params.input_files) {
if (params.input_fastqs) {
Channel
.fromPath(params.input_files)
.fromPath(params.input_fastqs)
.splitCsv(header: ['name', 'fastq1', 'fastq2'], sep: "\t")
.map{ row-> tuple(row.name, file(row.fastq1), file(row.fastq2)) }
.set { input_files }
.set { input_fastqs }
} else if (params.input_bams) {
Channel
.fromPath(params.input_bams)
.splitCsv(header: ['name', 'bam'], sep: "\t")
.map{ row-> tuple(row.name, file(row.bam)) }
.set { input_bams }
} else {
exit 1, "Input file not specified!"
exit 1, "Provide either --input_fastqs or --input_bams"
}


process HLAHD {
cpus params.cpus
memory params.memory
tag "${name}"
publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
module params.bowtie2_module

input:
tuple val(name), val(fastq1), val(fastq2)

output:
tuple val("${name}"), path("*final*")

script:
"""
mkdir temp
# HLA-HD wants its own binaries and bowtie2 in path
export PATH=${params.hlahd_folder}/bin/:${params.bowtie2_folder}:\$PATH
export LD_LIBRARY_PATH=${params.ld_library_path}
if (params.reference == 'hg38') {
contigs = "$baseDir/references/contigs_hla_reads_hg38.bed"
}
else if (params.reference == 'hg19') {
contigs = "$baseDir/references/contigs_hla_reads_hg19.bed"
}
else {
log.error "--reference only supports hg38 or hg19"
exit 1
}

zcat ${fastq1} > input_fastq1.fastq
zcat ${fastq2} > input_fastq2.fastq

# HLA HD does not accept gzipped fastq files, unzip them first
hlahd.sh \
-m ${params.read_length} \
-t ${task.cpus} \
-f ${params.hlahd_folder}/freq_data/ \
input_fastq1.fastq \
input_fastq2.fastq \
${params.hlahd_folder}/HLA_gene.split.txt \
${params.hlahd_folder}/dictionary/ \
${name} \
temp

# moves the final result to base folder
mv temp/${name}/result/* .
workflow {

# deletes temp folder
rm -rf temp
rm -f input_fastq1.fastq
rm -f input_fastq2.fastq
"""
}
if (input_bams) {
BAM2FASTQ(input_bams, contigs)
input_fastqs =BAM2FASTQ.out
}

workflow {
HLAHD(input_files)
HLAHD(input_fastqs)
}
42 changes: 42 additions & 0 deletions modules/00_bam2fastq.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
process BAM2FASTQ {
cpus params.cpus
memory params.memory
tag "${name}"
publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
module params.bowtie2_module

conda (params.enable_conda ? "bioconda::samtools=1.18 bioconda::gatk4=4.2.5.0" : null)

input:
tuple val(name), val(bam)
val(contigs)

output:
tuple val("${name}"), path("${name}.hla.1.fastq.gz"), path("${name}.hla.2.fastq.gz")

script:
"""
samtools index $bam
# gets reads in the provided regions
# only non duplicated reads
samtools view \
-b \
-L ${contigs} \
-F 1024 \
${bam} > ${name}.mhc.bam
# Extract unmap reads
samtools view -b -f 4 $bam > ${name}.unmap.bam
#Merge bam files
samtools merge -o ${name}.merge.bam ${name}.unmap.bam ${name}.mhc.bam
#Create fastq
gatk SamToFastq --VALIDATION_STRINGENCY SILENT -I ${name}.merge.bam -F ${name}.hlatmp.1.fastq -F2 ${name}.hlatmp.2.fastq
#Change fastq ID
cat ${name}.hlatmp.1.fastq |awk '{if(NR%4 == 1){O=\$0;gsub("/1"," 1",O);print O}else{print \$0}}' | gzip > ${name}.hla.1.fastq.gz
cat ${name}.hlatmp.2.fastq |awk '{if(NR%4 == 1){O=\$0;gsub("/2"," 2",O);print O}else{print \$0}}' | gzip > ${name}.hla.2.fastq.gz
"""
}
45 changes: 45 additions & 0 deletions modules/01_hlahd.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
process HLAHD {
cpus params.cpus
memory params.memory
tag "${name}"
publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
module params.bowtie2_module

input:
tuple val(name), val(fastq1), val(fastq2)

output:
tuple val("${name}"), path("*final*")

script:
"""
mkdir temp
# HLA-HD wants its own binaries and bowtie2 in path
export PATH=${params.hlahd_folder}/bin/:${params.bowtie2_folder}:\$PATH
export LD_LIBRARY_PATH=${params.ld_library_path}
zcat ${fastq1} > input_fastq1.fastq
zcat ${fastq2} > input_fastq2.fastq
# HLA HD does not accept gzipped fastq files, unzip them first
hlahd.sh \
-m ${params.read_length} \
-t ${task.cpus} \
-f ${params.hlahd_folder}/freq_data/ \
input_fastq1.fastq \
input_fastq2.fastq \
${params.hlahd_folder}/HLA_gene.split.txt \
${params.hlahd_folder}/dictionary/ \
${name} \
temp
# moves the final result to base folder
mv temp/${name}/result/* .
# deletes temp folder
rm -rf temp
rm -f input_fastq1.fastq
rm -f input_fastq2.fastq
"""
}
30 changes: 25 additions & 5 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,21 @@ params.bowtie2_folder = "/code/bowtie/2.3.4.3"
params.bowtie2_module = "bioinf/bowtie2/2.3.4.3"
params.ld_library_path = "/usr/local/lib64/"
params.read_length = 50
params.reference = 'hg38'

profiles {
conda {
params.enable_conda = true
}
debug { process.beforeScript = 'echo $HOSTNAME' }
test {
params.cpus = 1
params.memory = "3g"
timeline.enabled = false
report.enabled = false
trace.enabled = false
dag.enabled = false
}
}

// Export this variable to prevent local Python libraries from conflicting with those in the container
Expand All @@ -27,7 +39,7 @@ env {
// Capture exit codes from upstream processes when piping
process.shell = ['/bin/bash', '-euo', 'pipefail']

VERSION = '0.1.0'
VERSION = '0.2.0'

manifest {
name = 'TRON-Bioinformatics/tronflow-hlahd'
Expand All @@ -40,20 +52,28 @@ manifest {
}

params.help_message = """
TronFlow HLA-HD v${VERSION}
nextflow run tron-bioinformatics/tronflow-hla-hd --help
Launching `main.nf` [intergalactic_shannon] - revision: e707c77d7b
Usage:
nextflow run main.nf --input_files input_files --output output_folder
Input:
* input_files: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
* input_fastqs: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
The input file does not have header!
Example input file:
name1 fastq1.fq.gz fastq2.fq.gz
name2 fastq1.fq.gz fastq2.fq.gz
* input_bams: the path to a tab-separated values file containing in each row the sample name and BAM
The input file does not have header!
Example input file:
name1 fastq1.fq.gz fastq2.fq.gz
name2 fastq1.fq.gz fastq2.fq.gz
name1 name1.bam
name2 name2.bam
* output: output folder where results will be stored
Optional input:
* reference: the reference genome to use (default: hg38, possible values: hg38 or hg19)
* read_length: the read length (default: 50)
* hlahd_folder: the HLA-HD folder (default: /code/hlahd.1.2.0.1)
* bowtie2_folder: the bowtie2 folder (default: /code/bowtie/2.3.4.3)
Expand Down
8 changes: 8 additions & 0 deletions references/contigs_hla_reads_hg19.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
chr6 28477797 33448354
chr6_apd_hap1 1 4622290
chr6_cox_hap2 1 4795371
chr6_dbb_hap3 1 4610396
chr6_mann_hap4 1 4683263
chr6_mcf_hap5 1 4833398
chr6_qbl_hap6 1 4611984
chr6_ssto_hap7 1 4928567
Loading

0 comments on commit 8a52bb9

Please sign in to comment.