forked from jasteen/nextflow-workflows
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexome_stats_only.nf
267 lines (226 loc) · 8.67 KB
/
exome_stats_only.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/usr/bin/env nextflow
// Required Inputs
refFolder = file("/projects/vh83/reference/genomes/b37/bwa_0.7.12_index/")
inputDirectory = file('./fastqs')
panel_int = file('/projects/vh83/reference/IDT_exome_panel/AML_AMLv3_Targets_Standard.b37.interval_list')
padded_int = file('/projects/vh83/reference/IDT_exome_panel/AML_AMLv3_Targets_Standard.b37.interval_list')
panel_bed = file('/projects/vh83/reference/IDT_exome_panel/AML_AMLv3_Targets_Standard.b37.bed')
padded_bed = file('/projects/vh83/reference/IDT_exome_panel/AML_AMLv3_Targets_Standard.b37.bed')
tmp_dir = file('/scratch/vh83/tmp/')
// Getting Reference Files
refBase = "$refFolder/human_g1k_v37_decoy"
ref = file("${refBase}.fasta")
refDict = file("${refBase}.dict")
refFai = file("${refBase}.fasta.fai")
millsIndels = file("/projects/vh83/reference/genomes/b37/accessory_files/Mills_and_1000G_gold_standard.indels.b37.vcf")
knownIndels = file("/projects/vh83/reference/genomes/b37/accessory_files/1000G_phase1.indels.b37.vcf")
dbSNP = file("/projects/vh83/reference/genomes/b37/accessory_files/dbsnp_138.b37.vcf")
/*
mills_grch37: reference/Mills_and_1000G_gold_standard.indels.b37.vcf
one_k_g_grch37_indels: reference/1000G_phase1.indels.b37.vcf
one_k_g_snps: reference/1000G_omni2.5.b37.vcf
one_k_g_highconf_snps: reference/1000G_phase1.snps.high_confidence.b37.vcf
one_k_g_indels: reference/1000G_phase1.indels.b37.vcf
hapmap: reference/hapmap_3.3.b37.vcf
interval_grch37: reference/Broad.human.exome.b37.interval_list
dbsnp_grch37: reference/dbsnp_138.b37.vcf
CEU_mergeGvcf: reference/CEU_mergeGvcf.vcf
FIN_mergeGvcf: reference/FIN_mergeGvcf.vcf
GBR_mergeGvcf: reference/GBR_mergeGvcf.vcf
*/
// Tools
picardJar = '~/picard.jar'
gatkJar = '/usr/local/gatk/3.7/bin/GenomeAnalysisTK.jar'
bwaModule = 'bwa/0.7.17-gcc5'
samtoolsModule = 'samtools/1.9'
gatkModule = 'gatk/3.7'
rModule = 'R/3.5.1'
fgbioJar = '/usr/local/fgbio/0.9.0/target/fgbio-0.9.0-17cb5fb-SNAPSHOT.jar'
// Global Resource Configuration Options
globalExecutor = 'slurm'
globalStageInMode = 'symlink'
globalCores = 1
bwaCores = 12
vardictCores = 4
globalMemoryS = '6 GB'
globalMemoryM = '32 GB'
globalMemoryL = '64 GB'
globalTimeS = '8m'
globalTimeM = '1h'
globalTimeL = '24h'
globalQueueS = 'short'
globalQueueL = 'comp'
// Creating channel from input directory
//create channel flat because we want to join it later, and the tuple makes that more annoying than I want it to be
ch_inputFiles = Channel.fromFilePairs("$inputDirectory/*R{1,2}.fastq.gz", flat: true)
process createUnmappedBam {
publishDir path: './output/intermediate', mode: 'copy'
input:
set baseName, file(R1), file(R2) from ch_inputFiles
output:
set baseName, file("${baseName}.unmapped.bam") into ch_unmappedBams
publishDir path: './output/intermediate', mode: 'copy'
cache 'lenient'
executor globalExecutor
stageInMode globalStageInMode
cpus 1
module 'fgbio'
module 'java'
memory globalMemoryM
time '3h'
queue globalQueueL
script:
"""
java -Xmx30g -Djava.io.tmpdir=$tmp_dir -XX:+AggressiveOpts -XX:+AggressiveHeap \
-jar $fgbioJar FastqToBam --input $R1 $R2 --output "${baseName}.unmapped.bam" --read-structures +T +T \
--sample "${baseName}" --read-group-id "${baseName}" --library A --platform illumina --sort true
"""
}
process markAdaptors {
publishDir path: './output/intermediate', mode: 'copy'
input:
set baseName, file(bam) from ch_unmappedBams
output:
set baseName, file("${baseName}.unmapped.marked.bam"),
file("${baseName}.unmapped.marked_metrics.tsv") into ch_markedBams
cache 'lenient'
executor globalExecutor
stageInMode globalStageInMode
cpus 1
module 'java'
memory globalMemoryM
time '3h'
queue globalQueueL
script:
"""
java -Dpicard.useLegacyParser=false -Xmx30g -jar $picardJar MarkIlluminaAdapters \
-INPUT $bam \
-OUTPUT "${baseName}.unmapped.marked.bam" \
-METRICS "${baseName}.unmapped.marked_metrics.tsv"
"""
}
process alignBwa {
input:
set baseName, file(bam), file(metrics) from ch_markedBams
output:
set baseName, file("${baseName}.mapped.bam") into ch_mappedBams
publishDir path: './output/intermediate', mode: 'copy'
cache 'lenient'
executor globalExecutor
stageInMode globalStageInMode
module bwaModule
module 'samtools'
module 'picard'
cpus bwaCores
memory globalMemoryM
time globalTimeL
queue globalQueueL
script:
"""
set -o pipefail
java -Dpicard.useLegacyParser=false -Xmx6G -jar $picardJar SamToFastq \
-I "$bam" \
-FASTQ '/dev/stdout' -CLIPPING_ATTRIBUTE XT -CLIPPING_ACTION 2 \
-INTERLEAVE true -NON_PF true -TMP_DIR "$tmp_dir" | \
bwa mem -M -t ${task.cpus} -p $ref /dev/stdin | \
java -Dpicard.useLegacyParser=false -Xmx6G -jar $picardJar MergeBamAlignment \
-ALIGNED_BAM '/dev/stdin' -UNMAPPED_BAM "$bam" \
-OUTPUT "${baseName}.mapped.bam" -R "$ref" -ADD_MATE_CIGAR true \
-CLIP_ADAPTERS false -MAX_INSERTIONS_OR_DELETIONS '-1' \
-PRIMARY_ALIGNMENT_STRATEGY MostDistant -SO queryname -ATTRIBUTES_TO_RETAIN XS -TMP_DIR "$tmp_dir"
"""
}
process markDuplicatesPicard {
input:
set baseName, bam from ch_mappedBams
output:
set baseName, file("${baseName}.mapped.marked.bam") into ch_markedBamFiles
set baseName, file("${baseName}.markduplicates.metrics") into ch_metrics
publishDir path: './output/metrics/markduplicates', mode: 'copy'
cache 'lenient'
executor globalExecutor
stageInMode globalStageInMode
cpus 1
memory globalMemoryM
time globalTimeL
queue globalQueueL
// TODO: CLEAR_DT=false option in GATK pipeline but not supported by
// this version of picard.
// ADD_PG_TAG_TO_READS=false also not supported.
"""
java -Dpicard.useLegacyParser=false -Xmx32G -jar $picardJar MarkDuplicates \
-INPUT $bam \
-OUTPUT ${baseName}.mapped.marked.bam \
-METRICS_FILE ${baseName}.markduplicates.metrics \
-VALIDATION_STRINGENCY SILENT \
-OPTICAL_DUPLICATE_PIXEL_DISTANCE 2500 \
-ASSUME_SORT_ORDER queryname
"""
}
process sortBam {
input:
set baseName, file(markedBam) from ch_markedBamFiles
output:
set baseName, file("${baseName}.mapped.marked.sorted.bam") into ch_sortedBamFiles
cache 'lenient'
executor globalExecutor
stageInMode globalStageInMode
cpus 1
memory globalMemoryS
time globalTimeL
queue globalQueueL
"""
java -Djava.io.tmpdir=$tmp_dir -Dpicard.useLegacyParser=false -Xmx4g -jar $picardJar SortSam \
-INPUT $markedBam \
-OUTPUT ${baseName}.mapped.marked.sorted.bam \
-SORT_ORDER coordinate \
-CREATE_INDEX false \
-CREATE_MD5_FILE false \
-MAX_RECORDS_IN_RAM 300000
"""
}
process indexBam {
input:
set baseName, file(bam) from ch_sortedBamFiles
output:
set baseName, file(bam), file("${baseName}.mapped.marked.sorted.bam.bai") into ch_forHSMetrics
publishDir path: './output/intermediate', mode: 'copy'
cache 'lenient'
executor globalExecutor
stageInMode globalStageInMode
module 'samtools'
cpus globalCores
memory globalMemoryM
time globalTimeL
queue globalQueueL
script:
"""
samtools index $bam ${baseName}.mapped.marked.sorted.bam.bai
"""
}
process collectHSMetrics {
input:
set sample, file(bam), file(bai) from ch_forHSMetrics
output:
set sample, file("*.HSmetrics.txt"), file("*.perbase.txt"), file("*.pertarget.txt") into ch_metrics_unused2
publishDir path: './output/metrics/coverage', mode: 'copy'
executor globalExecutor
stageInMode globalStageInMode
cpus 1
memory globalMemoryM
time globalTimeL
queue globalQueueL
script:
"""
module purge
module load R/3.5.1
java -Dpicard.useLegacyParser=false -Xmx6G -jar ${picardJar} CollectHsMetrics \
-I ${bam} \
-O "${sample}.HSmetrics.txt" \
-R ${ref} \
-BI $panel_int \
-TI $padded_int \
--PER_BASE_COVERAGE "${sample}.perbase.txt" \
--PER_TARGET_COVERAGE "${sample}.pertarget.txt"
"""
}