// FastQtoGVCF.norealign.q: 2650 lines (2207 loc), 88.5 KB
/**
* FastQ file to Genomic VCF file.
*
* This Scala script runs through GATK's Queue framework to run the alignment process in a multi-core and multi-node friendly manner.
*
* It takes a file containing a list of alignment requests, matches them to lines in the sample description file and passes those lines through alignment and on to variant calling.
*
* Command line options are:
* -a or --alignments_file [file_path]: file containing alignment requests.
* -d or --sample_description_file [file_path]: file containing sample file paths and known information such as capture platform, gender, pedigree, etc. Defaults to the settings file entry when not specified.
* -r or --reference_file [file_path]: genomic reference file to use. Defaults to the settings file entry when not specified.
* -nt or --number_of_threads [integer]: number of cpu threads to use. This is multiplicative for memory use as each thread has its own memory (RAM) block. Defaults to 4 when not specified.
* -nct or --number_of_core_threads [integer]: number of cpu cores to use. In theory this should allow processes on a single core to share the same memory (RAM) block, reducing the overall memory consumed by the process. For now, this option is tied to -nt, as most processes that accept -nt cannot use -nct and vice versa, so having to specify both would be pointless.
* -nsc or --number_of_scatters [integer]: number of sub-jobs to allow processes to scatter to, dividing up large jobs into smaller segments and running them in parallel instead of sequentially. Note that only some functions can be scattered, and the total cost of all scattered jobs is greater than that of the single job, because the data must first be parcelled out to each process and then re-combined once the runs finish. Also note that splitting a job up does not mean the pieces will ALL run simultaneously; the scheduler works on however many jobs it is allowed to at a given instant. For now, using -nt will result in some degree of scatter-gather processing for functions that cannot use -nt or -nsc. Defaults to 1 when not specified and uses the -nt number to generate a reduced scatter count.
* -bwa or --bwa_path [file_path]: command path for the Burrows-Wheeler Aligner. Defaults to the settings file entry when not specified.
* -qm or --qualimap_path [file_path]: command path for QualiMap. Defaults to the settings file entry when not specified.
* -xp or --allow_cross_platform: takes no arguments. This allows the merging of libraries from different exome capture platforms, which is usually not permitted. When this is specified, quality-mapping will be done at the library level instead of the individual level to give the best result, which means multiple quality-mapping runs. Defaults to not allowing cross-capture-platform merging at the library level when not specified.
* -mp or --mitochondira_ploidy [integer]: specifies the ploidy at which to process mitochondrial data. Anything less than 2 means mitochondria are excluded. Defaults to 0 if not specified.
* -s or --file_folder_structure [folder|single]: specifies whether to have all your output in a single individual's folder or subdivided into dna, library and run folders to separate out the input and output files. Essentially replaces the folder separator / with a _ if anything other than 'folder' is set. Defaults to 'folder' if not specified.
*
* It takes various settings from a global settings file located at [/home/clinical_genetics/Resources/QSettings]
* This includes:
* Genomic regions (PAR1, True X, Y, etc)
* Reference file for the human genome.
* Sample description file.
* Common variants file used for finger-printing runs and samples.
* Capture platform paths.
* Burrows Wheeler Aligner command path.
* QualiMap command path.
* Several pipeline control command paths.
* Mills, HapMap and DBSNP file paths.
* Additional known targets file paths which will ideally build over time.
*
* Prior to running the primary alignment phase (BWA alignment on the fastq file, Picard Sort Sam on that output and then Picard Clean Sam on the sorted output) it will verify sample description file data.
* It will:
* Check the settings file for any unspecified configurations required.
* Build a list of capture platforms and their gender-specific boundaries.
* Separate jobs list by individual, library and runs.
* Attempt to locate trimmed files for the given sample.
* Verify sample description data for all samples for an individual (gender/capture platform consistency).
*
* After primary alignment of each run it will verify that all aligned, sorted and cleaned samples listed for a given individual are actually from the same individual. If the genotype concordance between any runs for an individual is inconsistent, the pipeline for this individual will break and can be reviewed/restarted later once the samples have been verified, swapped or discarded.
*
* Once all runs are aligned and verified to be the same individual, the runs will be merged into a single library (GATK Realigner Target Creator, GATK Indel Realigner and finally Picard Mark Duplicates).
*
* If the capture platform is unknown, or there are multiple capture platforms across the various libraries for this individual, QualiMap will run per library.
*
* If the capture platform is unknown for any sample or if cross-platform merging is not permitted, the pipeline for this individual will break at this point to be resumed later.
*
* If cross-platform merging is permitted and all platforms are known, QualiMap will still run per library and the libraries will continue on.
*
*
*/
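// Illustrative invocation of this Queue script (a minimal sketch; the Queue jar path,
// output folder and request file names below are assumptions, not values taken from
// this repository):
//
//   java -jar Queue.jar \
//     -S FastQtoGVCF.norealign.q \
//     -c settings.txt \
//     -a align.txt \
//     -b /path/to/output \
//     -nt 8 -nsc 4 \
//     -run
//
// Without -run, Queue performs a dry run and only reports the jobs it would dispatch.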
import org.broadinstitute.gatk.utils.commandline._
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.extensions.picard._
import java.io._
//import scala.collection.JavaConversions._
class FastQtoGVCF extends org.broadinstitute.gatk.queue.QScript {
qscript =>
// File containing sequence(s) to be processed.
// File should have 1 request per line with tab or space separated fields.
@Input(
doc="File containing alignment requests to lookup and process.",
shortName="a",
fullName="alignments_file",
required=false
)
var alignmentsFile: File = "align.txt"
@Argument(
doc="Sample Descriptiosn file containing file paths",
shortName="d",
fullName="sample_description_file",
required=false
)
var sampleDescriptionFile: File = _
// Reference to use for primary alignment and indel realignment.
// BWA doesn't like it when the reference ends in .fasta, so strip that suffix off for BWA runs.
@Argument(
doc="FastA reference file to align against",
shortName="r",
fullName="reference_file",
required=false
)
var referenceFile: File = _
// Base output path for files to be created at.
@Argument(
doc="File folder location to output final data to. Some intermediate steps may occur on the system's local storage in your home folder/queueTMP",
shortName="b",
fullName="base_path",
required=true
)
var basePath: File = _
@Input(
doc="Settings file location",
shortName="c",
fullName="config_file",
required=true
)
var settingsFile: File = "settings.txt"
// How many threads can the tools use?
// This will be passed to BWA and GATK tools.
@Argument(
doc="Integer number of threads to allocate to processes.",
shortName="nt",
fullName="number_of_threads",
required=false
)
var numThreads: Int = 4
// How many scatter jobs can be generated?
// This will be passed to GATK tools that support scatter/gather.
@Argument(
doc="Integer number of jobs to scatter across nodes/cpus.",
shortName="nsc",
fullName="number_of_scatters",
required=false
)
var numScatters: Int = 1
// Path to bwa executable.
// Should we pad /bwa on the end if it's missing?
@Argument(
doc="System path to bwa command.",
shortName="bwa",
fullName="bwa_path",
required=false
)
var bwaCmd: File = _
// Path to qualimap executable.
@Argument(
doc="System path to qualimap command.",
shortName="qm",
fullName="qualimap_path",
required=false
)
var qualimapCmd: File = _
@Argument(
doc="Allow different capture platforms to merge.",
shortName="xp",
fullName="allow_cross_platform",
required=false
)
val crossPlatform: Boolean = false
@Argument(
doc="HaplotypeCaller include mitochondria with the given ploidy. If less than 2 then effectively not specified as mitochondria will be excluded.",
shortName="mp",
fullName="mitochondira_ploidy",
required=false
)
var mitoPloidy: Int = 0
@Argument(
doc="File Structure. 'single' to operate in individual's folder. Anything else will create subfolders for dna, library and runs",
shortName="s",
fullName="file_folder_structure",
required=false
)
var fileStructure: String = "folder"
val DEBUG: Boolean = false
// Capture platforms enumeration
// Collected list of capture platforms linked to their .bed file.
var capturePath: File = null
var capPlatforms: Map[String,File] = Map() // PLATFORM -> FILEPATH.bed
var capData: Map[String,File] = Map() // PLATFORM -> FILEPATH.bed
//var capBounds: Map[String,String] = Map() // PLATFORM -> x,x,...
// Environment
//val tmpDir:String = System.getenv("HOME") + "/.queueTmp/"
//////////////////
// Base Configs //
//////////////////
//val settingsFile: File = "settings.txt"
// Generally known Indel sites to examine.
var mills: File = null
var hapmap: File = null
var dbsnp: File = null
// Localized list of known sites to examine.
var knownIndels: Seq[File] = Nil
// Process Files
// Stage 1: Primary Alignment, sort and clean bam
val paBWAAligned: File = "paAligned" //"all"
val paSortedSam: File = "paSorted" //"allsort"
val paCleanedSam: File = "paCleaned" //"allsortclean"
// Stage 2: Per-sample fingerprinting for Identity validation.
val idcHaplotyped: File = "idcHaplotyped"
val idcSelected: File = "idcSelected"
val idcPrinted: File = "idcFingerprint"
val idcGenotyped: File = "idcGenotyped"
val idcMatched: File = "idcMatched"
val idcMatchFail: File = "idcMatch_FAILED"
// Stage 3: Merge runs to library.
val m2lTargetInts: File = "mrTarget"
val m2lIndelRealign:File = "mrRealigned"
val m2lMarkDups: File = "mrMarkDuped"
val m2lQualiMapped: File = "mrQualiMap"
// Stage 4: Merge libraries to individual
val m2iQualiMapped: File = "mlQualiMap"
val m2iTargetInts: File = "mlTarget"
val m2iIndelRealign:File = "mlRealigned"
val m2iBQSR1: File = "BQSR_FirstPass"
val m2iBQSR2: File = "BQSR_SecondPass"
val m2iACoverants: File = "AnalyzeCovariates"
val m2iPrintReads: File = "PrintReads"
// Stage 5: Gender validation
val gvMatched: File = "gvMatched"
val gvDepth: File = "gvDepthOfCoverage"
// Stage 6: Fingerprinting
val fpHaplotyped: File = "fpHaplotyped"
val fpSelected: File = "fpSelected"
val fpPrinted: File = "FingerPrint"
// Stage 7: Variant calling
val vcAutosomeOut: File = "vcAutosome"
val vcPar1XOut: File = "vcParOneX"
val vcTrueXOut: File = "vcTrueX"
val vcPar2XOut: File = "vcParTwoX"
val vcFullYOut: File = "vcFullY"
val vcGLOut: File = "vcGL"
val vcMitoOut: File = "vcMitochondria"
val vcCatVar: File = "vcCatVarred"
// Shell scripts holders
var fingerPrintCmd: File = null
var sampleMatchCmd: File = null
var bridgeLibraryCmd: File = null
var genderMatchCmd: File = null
// File Breaker
// "/" for folder sub-division
// "_" for underscore (single directory) sub-division
var fileBreak: File = null // "/" // " + fileBreak + "
// Grabbed once on first *.fastq.gz -> *_trimmed.fastq.gz
// Re-parsed for additional files.
// Saves re-walking over the WHOLE trimmed folder... which is big and slow!
var trimmedFastQBlob: List[File] = Nil
// Common variants for sample comparison
var commonVariants: File = null
var par1XRegion: String = null
var trueXRegion: String = null
var par2XRegion: String = null
var trueYRegion: String = null
// Define the FastQdescriptions headers.
// This is for reference and debugging purposes only.
var fqHead: Seq[String] = Nil
/*Seq(
"Individual ID", // 0
"DNA Number", // 1
"Library Number", // 2
"Run Number", // 3
"Sequence Platform", // 4
"Filename1", // 5
"Filename2", // 6
"Directory Location", // 7
"Project Name", // 8
"Exome chip", // 9
"Target Coverage", // 10
"Date sequenced", // 11
"Family ID", // 12
"Individual ID", // 13
"Father ID", // 14
"Mother ID", // 15
"Gender", // 16
"Affection Status", // 17
"Any Other Comments", // 18
"Status" // 19
)*/
// Frequently used header entry positions.
var fqID: Int = 0 // 0
var fqDNA: Int = 0 // 1
var fqLib: Int = 0 // 2
var fqRun: Int = 0 // 3
var fqCap: Int = 0 // 9
var fqGen: Int = 0 // 16
var fqChr: Int = 0 // 16
// Where it all begins //
def script = {
PrintLog("----------\nFastQ to Genomic VCF\n----------\n", "AlignReport.txt")
// If it exists, read settings file
if(
settingsFile.exists &&
settingsFile.canRead
) {
grepConfigs(settingsFile)
PrintLog("INFORMA\tSettings imported:" +
"\n\tAlignment request file: " + alignmentsFile +
"\n\tSample description file: " + sampleDescriptionFile +
"\n\tReference sequence file: " + referenceFile +
"\n\tBase output folder path: " + basePath +
"\n\tNumber of threads: " + numThreads +
"\n\tNumber of scatters: " + numScatters +
"\n\tPath to BWA command: " + bwaCmd +
"\n\tPath to Qualimap command: " + qualimapCmd +
"\n\tCross platform merging is " + (if (!crossPlatform) "NOT ") + "permitted." +
"\n\tMitochondria is " + (if (mitoPloidy > 2) "included at a ploidy of " + mitoPloidy else "NOT included") + "." +
"\n\tFile structur is: " + fileStructure, "AlignReport.txt")
if (knownIndels.size != knownIndels.distinct.size)
PrintLog("WARNING\tKnown indels has duplicate entries!", "AlignReport.txt")
grepPlatforms(capturePath)
grepSource(capturePath)
//grepPlatformBoundries(capturePath)
// Check for missing sex chromosome tolerance boundaries.
for (platforms <- capPlatforms.keys)
//if (!capBoundries.contains(platforms))
if (!capData.contains(platforms))
PrintLog("WARNING\tCapture platform " + platforms + " does not have gender bounds defined.", "AlignReport.txt")
PrintLog("----------\n", "AlignReport.txt")
// Set file generation mode.
// Single: all files in single folder.
// Anything else separates files by folder structure.
// A folder for the individual is made either way.
fileBreak = if (fileStructure.toLowerCase == "single") "_" else "/"
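// For illustration (made-up DNA/library/run values), a run's output path then becomes:
//   folder mode: <basePath>/<chip>/<individual>/DNA_1/Library_1/Run_1/<file>
//   single mode: <basePath>/<chip>/<individual>/DNA_1_Library_1_Run_1_<file>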
// Get alignment requests from request file.
// Match alignment requests against entries in the FastQ Description file (FQDF).
// Split samples by individual.
grepIndividuals(grepFastQLines(grepSamples(alignmentsFile)))
} else {
PrintLog("\nFAILURE\tSettings file " + settingsFile + " does not exist!\n", "AlignReport.txt")
}
}
/**
* Read core settings from file.
*
* Takes settings file location.
* Parses entries to set default paths, files and modes.
*
**/
def grepConfigs(configFile: File) {
val setupLines = getFileLines(configFile)
// Parse lines in settings file.
for (line <- setupLines) {
if (
line.length > 0 && // Line has content
!line.startsWith("#") // Line doesn't start with a # for comments.
) {
// Splits based on any clump of white-space
val pairs = line.replaceAll("\"","").split("\\s+").toList
val head: String = pairs(0).toLowerCase
val param: String = pairs(1)
println("INFORMA\tSetting " + head + ": " + param)
if (head == "par1xregion")
par1XRegion = param
else if (head == "truexregion")
trueXRegion = param
else if (head == "par2xregion")
par2XRegion = param
else if (head == "trueyregion")
trueYRegion = param
else if (
head == "basepath" &&
basePath == null
) basePath = param
else if (new File(param).exists) {
// Display incoming values.
if (
head == "reference" &&
referenceFile == null
) referenceFile = param
else if (
head == "sampledesc" &&
sampleDescriptionFile == null
) sampleDescriptionFile = param
else if (
head == "common" &&
commonVariants == null
) commonVariants = param
else if (
head == "capturepath" &&
capturePath == null
) capturePath = param
else if (
head == "bwacmd" &&
bwaCmd == null
) bwaCmd = param
else if (
head == "qualimapcmd" &&
qualimapCmd == null
) qualimapCmd = param
else if (head == "samplematchcmd")
sampleMatchCmd = param
else if (head == "bridgelibcmd")
bridgeLibraryCmd = param
else if (head == "fingerprintcmd")
fingerPrintCmd = param
else if (head == "gendermatchcmd")
genderMatchCmd = param
else if (head == "known")
knownIndels :+= param
else if (head == "mills")
mills = param
else if (head == "hapmap")
hapmap = param
else if (head == "dbsnp")
dbsnp = param
} else {
PrintLog("WARNING\tUnmapped settings line: " + line + ". If it's a file, does it exist?", "AlignReport.txt")
}
}
}
knownIndels = knownIndels.distinct
}
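// Illustrative settings file fragment for grepConfigs above (the keys are the ones
// parsed there; every path and region value below is a placeholder, not a real entry):
//
//   # whitespace-separated key/value pairs; lines starting with # are ignored
//   reference     /resources/human_reference.fasta
//   sampledesc    /resources/FastQdescriptions.txt
//   capturepath   /resources/CapturePlatforms
//   common        /resources/common_variants.vcf
//   bwacmd        /usr/local/bin/bwa
//   qualimapcmd   /usr/local/bin/qualimap
//   mills         /resources/mills_indels.vcf
//   hapmap        /resources/hapmap.vcf
//   dbsnp         /resources/dbsnp.vcf
//   known         /resources/known_indels.vcf
//   par1xregion   X:60001-2699520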
/**
* Parses the alignment request file for entries.
*
* Takes alignment request file.
* Returns sequence of alignment requests.
*/
def grepSamples(alignFile: File): Seq[String] = {
// Ensure file exists.
if (
!alignFile.exists ||
!alignFile.canRead
) {
PrintLog("\nFAILURE\tAlignment request file \"" + alignFile + "\" does not exist or cannot be read.\n", "AlignReport.txt")
return Nil
}
// Read lines from alignments request file
// Convert lines to a list
// Sort the list
val alignList = getFileLines(alignFile).sorted
//PrintLog("INFORMA\tAlignment file:", "AlignReport.txt")
//for(requests <- alignList) PrintLog("\t" + requests, "AlignReport.txt")
// add warning about duplication of alignment request.
if (alignList.size != alignList.distinct.size) {
// Print out any duplicates found.
var misAlignment: String = ""
for (alignment <- alignList) {
var alignMatch: Int = 0
for(alignAgain <- alignList)
if (alignment.startsWith(alignAgain))
alignMatch = alignMatch + 1
if (alignMatch > 1)
misAlignment = misAlignment +
(if (misAlignment.length > 0) ", " else "") +
alignment.replaceAll("\\s+"," ")
}
PrintLog("\nFAILURE\tSamples list in \"" + alignFile + "\" contains duplicated entries for " + misAlignment + "! Please review sample list and fix!\n", "AlignReport.txt")
return Nil
}
// Build list of samples.
var samples: Seq[String] = Nil
// Cycle through all entries in alignments list.
for (line <- alignList) {
// Split input entry by any continuous regions of whitespace.
// Add entries that were found in the FastQ description file.
if (
!line.isEmpty &&
!line.startsWith("#")
) {
val stitchLine = line.split("\\s+").toList
samples :+= stitchLine(0) + '\t' +
stitchLine(1) + '\t' +
stitchLine(2) + '\t' +
stitchLine(3)
}
}
return samples
}
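// Illustrative alignment request file for grepSamples above (whitespace-separated
// Individual ID, DNA Number, Library Number and Run Number; the IDs are made up):
//
//   # Individual  DNA  Library  Run
//   ID0001        1    1        1
//   ID0001        1    1        2
//   ID0002        3    1        1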
/**
* Parses FastQ Description File for matching alignment requests
*
* Takes sequence of alignment requests
* Returns Sequence of lines from FQDF that matched alignment requests.
*/
def grepFastQLines(newAlign:Seq[String]): Seq[String] = {
// Read source as fastq description file
var alignList: Seq[String] = newAlign
if (
!sampleDescriptionFile.exists ||
!sampleDescriptionFile.canRead
) {
PrintLog("\nFAILURE\tSample descriptions file \"" + sampleDescriptionFile + "\" does not exist or cannot be read! Got permissions?\n", "AlignReport.txt")
return Nil
}
val descriptions: List[String] = getFileLines(sampleDescriptionFile)
// Validate for duplicate 1st 4 blocks in the FQDF for error checking.
// Kill it if duplication is found for the current sample list.
// Warn if a duplicate is found that does not affect the requested runs.
var lineMatches: Seq[String] = Nil
// Cycle through lines in fastq description file
for (line <- descriptions) {
val lineBlocks: Seq[String] = line.split("\t").toList
if (lineBlocks.size > 4) {
// Get header names for smart matching later.
if (
line.contains("Individual") &&
line.contains("DNA") &&
line.contains("Library") &&
line.contains("Run")
) {
fqHead = line.split("\\t").toList
fqID = fqHead.indexOf("Individual ID")
fqDNA = fqHead.indexOf("DNA Number")
fqLib = fqHead.indexOf("Library Number")
fqRun = fqHead.indexOf("Run Number")
fqCap = fqHead.indexOf("Exome chip")
fqGen = fqHead.indexOf("Gender")
fqChr = fqHead.indexOf("Sex Chromosomes")
}
// Cycle through alignment requests.
for (alignRequest <- newAlign) {
val alignBlocks: Seq[String] = alignRequest.split("\\s").toList
// Scan 1st 4 positions from fqdf.
// Can't use .startsWith since "a b c d" matches "a b c de"
if (
alignBlocks(0) == lineBlocks(0) &&
alignBlocks(1) == lineBlocks(1) &&
alignBlocks(2) == lineBlocks(2) &&
alignBlocks(3) == lineBlocks(3)
) {
// this line matched a request. Add to the list.
lineMatches :+= line
PrintLog("INFORMA\tAlignement request " + alignRequest.replaceAll("\t",",") + " matched!", "AlignReport.txt")
// Check whether the number of matched lines equal to this line is more than 1.
if (lineMatches.filter(x => x == line).size > 1) {
// More than 1 matched line in the list. This means multiple entries in the FQDF!
PrintLog("\nFAILURE\tMultiple matches for " + alignRequest.replace("\t"," ") + " found in \"" + sampleDescriptionFile + "\"!\n\tPlease review before re-running pipeline!\n", "AlignReport.txt")
return Nil
}
alignList = alignList.filterNot(x => x == alignRequest)
}
}
} // else it's a blank line or a comment
}
if (alignList.nonEmpty) {
// We have more requests than matched lines!
// This means there are unmatched request lines.
PrintLog("\nFAILURE\t" + alignList.size + " unmatched samples:", "AlignReport.txt")
for (request <- alignList) PrintLog("\n\t" + request.replace("\t"," "), "AlignReport.txt")
PrintLog("", "AlignReport.txt")
return Nil
}
// Returns sorted by individual, DNA, Library, Run, Platform, etc...
return lineMatches
}
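// Illustrative (made-up) FQDF fragment for grepFastQLines above; the file is
// tab-separated and only the columns this script indexes are shown here:
//
//   Individual ID  DNA Number  Library Number  Run Number  Filename1       Filename2       Directory Location  Exome chip  Gender  Sex Chromosomes
//   ID0001         1           1               1           S1_R1.fastq.gz  S1_R2.fastq.gz  /data/Originals     PlatformA   2       XX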
/**
* Locate a trimmed version of the given fastq file.
*
* Takes a path and file name and searches for trimmed versions of the file.
* Returns the trimmed file if exactly one match is found, otherwise null.
*/
def grepTrimmedFastQFile(path: File, file: File): File = {
val rePath: File = path.split("/Originals")(0) + "/Trimmed"
val previousRun: File = swapSuffix(alignmentsFile, ".txt", ".trimmed.list")
if (previousRun.exists) {
trimmedFastQBlob = getFileLines(previousRun)
} else if (trimmedFastQBlob.size == 0) {
// trimmedFastQBlob not filled, so grab all files in the trimmed folder.
PrintLog("INFORMA\tNo trimmed fastq file blob. Walking " + rePath, "AlignReport.txt")
// This SUCKS, but there's really no other way.
// It's a relative drop in the bucket.
// Filter for *.fastq.gz
// Filter for *_trimmed*
// Filter out *unpaired*
trimmedFastQBlob = grepFileTree(new File(rePath)).filter(_.getName.endsWith("fastq.gz")).filter(_.getName.contains("_trimmed")).filterNot(_.getName.contains("unpaired")).toList
// Dump list of matched files to list.
printToFile(previousRun) { p =>
trimmedFastQBlob.foreach(p.println)
}
}
// Locate my trimmed file from trimmed directory blob.
// *.fastq.gz -> *_trimmed.fastq.gz -> *_trimmed_paired.fastq.gz
val myBlob: List[File] = trimmedFastQBlob.filter(_.getName.startsWith(file.split(".fastq")(0)))
if (myBlob.size == 1) {
PrintLog("INFORMA\tFound trimmed file " + myBlob(0).getName, "AlignReport.txt")
return myBlob(0)
} else if (myBlob.size > 1) {
PrintLog("\nFAILURE\tToo many matches for " + file.getName + " found in " + rePath + ":", "AlignReport.txt")
for (files <- myBlob) println("\t" + files.getName)
PrintLog("", "AlignReport.txt")
return null
}
PrintLog("\nFAILURE\tNo matches for " + file.getName + " found in " + rePath + ".\n", "AlignReport.txt")
return null
}
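// For example (assumed file names): a request for S1_R1.fastq.gz under .../Originals
// would match S1_R1_trimmed_paired.fastq.gz in .../Trimmed, while
// S1_R1_trimmed_unpaired.fastq.gz is filtered out by the 'unpaired' exclusion above.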
/**
* Gather all runs for an individual and pass them to primary alignment.
*/
def grepIndividuals(samples: Seq[String]) {
// Replicate sample list for per-individual pruning.
var sampleList: Seq[String] = samples
// Cycle through all samples.
for (sample <- sampleList) {
// Split sample up to blocks.
val sampleBlock: Seq[String] = sample.split("\t").toList
// List of matched individuals for this cycle.
var samplesGrepped: Seq[String] = Nil
// Compare current cycle to list of samples.
for (compare <- sampleList) {
val compareBlock: Seq[String] = compare.split("\t").toList
if (compareBlock(fqID) == sampleBlock(fqID)) {
// Individual match!
// Add compared sample to current individual.
samplesGrepped :+= compare
// Prune compare entry from list.
sampleList = sampleList.filterNot(x => x == compare)
}
}
// Check for more than 0 matches, as there can be 0 when iterating over entries already pruned from the list.
if (samplesGrepped.size > 0) {
// More than 0 matches found.
val greppedBlock: Seq[String] = samplesGrepped(0).split("\t")
val individual: String = greppedBlock(fqID)
val greppedChip: String = greppedBlock(fqCap)
val greppedGender: String = greppedBlock(fqGen)
val greppedChromes: String = greppedBlock(fqChr)
val greppedSample: String = greppedBlock(fqID) + "_" +
greppedBlock(fqDNA) + "_" +
greppedBlock(fqLib) + "_" +
greppedBlock(fqRun)
var passChecks: Boolean = true
PrintLog("----------\nMerge report for " + individual + "\n----------\n", basePath + "/" + greppedChip + "/" + individual + "/MergeReport.txt")
PrintLog(
"INFORMA\tID " + individual +
" has " + samplesGrepped.size +
" samples.", basePath + "/" + greppedChip + "/" + individual + "/MergeReport.txt"
) // ** Output to individual/MergeReport.txt **
// Verify consistency for gender and platform matching.
// If there is only 1 sample, most of this won't matter.
for (consistencyCheck <- samplesGrepped) {
val checkBlock: Seq[String] = consistencyCheck.split("\t").toList
val checkChip: String = checkBlock(fqCap)
val checkGender: String = checkBlock(fqGen)
val checkChromes: String = checkBlock(fqChr)
val checkSample: String = checkBlock(fqID) + "_" +
checkBlock(fqDNA) + "_" +
checkBlock(fqLib) + "_" +
checkBlock(fqRun)
// Gender consistency from fqdf.
if (
checkGender == "" ||
checkGender == "0"
) // Gender not specified.
PrintLog("WARNING\tGender is not specified for sample " + checkSample + "\n\tIf all samples for this individual are unknown then gender will be automatically mapped later.", basePath + "/" + checkChip + "/" + individual + "/MergeReport.txt")
if (checkGender != greppedGender) // Gender varies between two samples for an individual.
PrintLog("WARNING\tGender for sample " + checkSample + ": " + checkGender + " does not match sample " + greppedSample + ": " + greppedGender + "\n\tPlease review sample consistency check output " + "to make sure samples are from the same individual.", basePath + "/" + checkChip + "/" + individual + "/MergeReport.txt")
if (checkChromes != greppedChromes)
PrintLog("WARNING\tSex Chromosomes for sample " + checkSample + ": " + checkChromes + " does not match sample " + greppedSample + ": " + greppedChromes + "\n\tPlease review sample consistency check output " + "to make sure samples are from the same individual.", basePath + "/" + checkChip + "/" + individual + "/MergeReport.txt")
if (checkChip != greppedChip) {
PrintLog(
"WARNING\tSample " + checkSample +
" Exome chip does not match " + greppedSample
, basePath + "/" + checkChip + "/" + individual + "/MergeReport.txt")
if (crossPlatform) {
PrintLog("WARNING\tHowever, cross-platform merging is allowed.", basePath + "/" + checkChip + "/" + individual + "/MergeReport.txt")
} else {
PrintLog("\nFAILURE\tCross-platform merging is NOT allowed!\n", "AlignReport.txt")
passChecks = false
}
}
if (
checkChip == "" ||
checkChip == "unknown"
) {
PrintLog("WARNING\tExome chip is unknown.\n\tPlease review qualimap output and adjust.\n\tThis pipeline will stop after PrintReads", basePath + "/Unknown/" + individual + "/MergeReport.txt")
}
}
// Did all samples pass gender and platform matching?
if (passChecks) {
// Yes! Split samples by Library for given individual
PrimaryAlignSortCleanValidate(samplesGrepped)
grepLibraries(samplesGrepped)
MergeLibrariesToIndividual(samplesGrepped)
} else {
// No. Warn about this individual.
PrintLog("\nFAILURE\tExcluding Individual " + individual + " from pipeline!\n\tPlease verify samples and sample descriptions file for errors.\n", "AlignReport.txt")
println("\tPress Control + C to cancel run.\n\tContinuing with remaining individuals in 60 seconds.\n")
Thread.sleep(if (DEBUG) 10000 else 60000)
}
}
}
}
/**
* Splits samples up by library for a given individual.
*
* Takes a list of samples for an individual.
* Passes runs to grepRuns to process runs.
* Calls MergeRunsToLibrary on current list of libraries.
*
* This code should look kind of familiar :(
*/
def grepLibraries(samples:Seq[String]) {
// replicate sample list for pruning.
var sampleList: Seq[String] = samples
// Cycle through libraries for this individual
for (sample <- sampleList) {
// Break up sample line blocks
val sampleBlock = sample.split("\t").toList
// Matched library container
var samplesGrepped: Seq[String] = Nil
for (compare <- sampleList) {
// Break up compare line blocks
val compareBlock = compare.split("\t").toList
if (
compareBlock(fqID) == sampleBlock(fqID) && // Re-check individual match. You never know who'll edit what.
compareBlock(fqLib) == sampleBlock(fqLib)
) {
// Individual and Library match!
// Add found sample to list.
samplesGrepped :+= compare
// Prune compared sample from list.
sampleList = sampleList.filterNot(x => x == compare)
}
}
// Check for matched libraries still in list.
if (samplesGrepped.size > 0) {
// At least 1 library matching
val greppedBlock: Seq[String] = samplesGrepped(0).split("\t")
val individual: String = greppedBlock(fqID)
val library: String = greppedBlock(fqLib)
val samplesFound: Int = samplesGrepped.size
val greppedChip: String = greppedBlock(fqCap)
val greppedSample: String = greppedBlock(fqID) + "_" +
greppedBlock(fqDNA) + "_" +
greppedBlock(fqLib) + "_" +
greppedBlock(fqRun)
var passChecks: Boolean = true
PrintLog(
"INFORMA\tID " +
individual +
" Library " +
library +
" has " + samplesFound + " sample" + (if (samplesFound != 1) "s") + " to merge."
, basePath + "/" + greppedChip + "/" + individual + "/MergeReport.txt")
// Verify consistency for gender and platform matching.
// If there is only 1 sample, most of this won't matter.
for (consistencyCheck <- samplesGrepped) {
val checkBlock: Seq[String] = consistencyCheck.split("\t").toList
val checkChip: String = checkBlock(fqCap)
val checkSample: String = checkBlock(fqID) + "_" +
checkBlock(fqDNA) + "_" +
checkBlock(fqLib) + "_" +
checkBlock(fqRun)
if (checkChip != greppedChip) {
PrintLog(
"\nFAILURE\tSample " + checkSample +
" Exome chip: " + checkChip + " does not match " + greppedSample + ": " + greppedChip + " within a library!\n"
, "AlignReport.txt")
passChecks = false
} else if (
checkChip == "" ||
checkChip == "unknown"
) {
PrintLog("WARNING\tExome chip is unknown.\n\tPlease review qualimap output and adjust.\n\tThis pipeline will stop after PrintReads", basePath + "/" + checkChip + "/" + individual + "/MergeReport.txt")
}
}
// Did all samples pass gender and platform matching?
if (passChecks) {
// Yes! Split samples by Library for given individual
MergeRunsToLibrary(samplesGrepped)
} else {
// No. Warn about this individual.
PrintLog("\nFAILURE\tExcluding Individual " + individual + " from pipeline!" +
"\n\tPlease verify samples and sample descriptions file for errors.\n", "AlignReport.txt")
println("FAILURE\tPress Control + C to cancel run." +
"\n\tContinuing with remaining individuals in 60 seconds.\n")
Thread.sleep(if (DEBUG) 10000 else 60000)
}
}
}
}
/**
* Runs BWA Mem, SortSam and CleanSam (3n) on a given sample.
*
* Takes a line from the Sample Descriptions File.
*
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
* !!! Will there ever be multiple runs that are from different cap platforms? !!!
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
*/
def PrimaryAlignSortCleanValidate(samples:Seq[String]){
// Get the base sample to compare stuff to, if there's more than 1.
val zeroBlock: Seq[String] = samples(0).split("\t").toList
val individual: String = zeroBlock(fqID)
val dnaZero: String = zeroBlock(fqDNA)
val libraryZero: String = zeroBlock(fqLib)
val runZero: String = zeroBlock(fqRun)
val genderZero: String = zeroBlock(fqGen)
val chromeZero: String = zeroBlock(fqChr)
val capPlatZero: String = zeroBlock(fqCap)
// Cycle through all samples.
for (sample <- samples) {
val bwa = new BurrowsWheelerAligner
// val bwa = new BurrowsWheelerAlignerPicardSortSam
val sortSam = new SortSam with Picard_Arguments
val cleanSam = new CleanSam with Picard_Arguments
val haplotypeCaller = new HaplotypeCaller with GATK_Arguments with ISR_Intersect
val selectVariants = new SelectVariants with GATK_Arguments
val fingerPrinter = new FingerPrinter
val genotypeConcordance = new GenotypeConcordance with GATK_Arguments
val sampleMatcher = new SampleMatcher
val sampleBlock = sample.split("\t").toList
// Get base filename.
val fastqpath: File = sampleBlock(fqHead.indexOf("Directory Location"))
// Seek out trimmed versions of these files.
val filename1: File =
grepTrimmedFastQFile(
fastqpath,
sampleBlock(fqHead.indexOf("Filename1"))
)
val filename2: File =
grepTrimmedFastQFile(
fastqpath,
sampleBlock(fqHead.indexOf("Filename2"))
)
if (
filename1 != null &&
filename2 != null
) {
// Read a block from fastq file.
// Capture everything until 1st new-line char
// Capture everything after 1st @ char
// Capture the ':' delimited blocks.
val headerBlocks: Seq[String] = zcat(filename1).split("\n")(0).split("@")(1).split(":").toList
// May be required when machine names start with FC? Discuss!
val numblocks: Int = headerBlocks.size
//println("headerblocks " + numblocks)
//headerBlocks.foreach(println)
val dnaCur: String = sampleBlock(fqDNA)
val libraryCur: String = sampleBlock(fqLib)
val runCur: String = sampleBlock(fqRun)
val genderCur: String = sampleBlock(fqGen)
val chromeCur: String = sampleBlock(fqChr)
val capPlatCur: String = sampleBlock(fqCap)
val platform: String = sampleBlock(fqHead.indexOf("Sequence Platform"))
val instrument: String = headerBlocks(0).replaceAll(" ","_")
val instrumentrun: String = headerBlocks(1)
val flowcell: String = headerBlocks(2)
val lane: String = headerBlocks(3)
val index: String = headerBlocks(9)
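// For example, a (made-up) Illumina read header
//   @M00123:45:000000000-A1B2C:1:1101:15589:1337 1:N:0:ATCACG
// splits on ':' so that block 0 is the instrument, 1 the instrument run, 2 the flowcell,
// 3 the lane and 9 the index; the space between the two header halves stays inside block 6.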
// Construct output paths.
val runPath: String = basePath +
"/" + capPlatCur +
"/" + individual +
"/DNA_" + dnaCur +
fileBreak + "Library_" + libraryCur +
fileBreak + "Run_" + runCur +
fileBreak
val runPathZero: String = basePath +
"/" + capPlatZero +
"/" + individual +
"/DNA_" + dnaZero +
fileBreak + "Library_" + libraryZero +
fileBreak + "Run_" + runZero +
fileBreak
// Create output path if it doesn't already exist.
if (runPath.contains("/"))
new File(runPath.substring(0,runPath.lastIndexOf('/'))).mkdirs
///////////////////////////////////
// Begin Primary Alignment phase //
///////////////////////////////////
// Burrows Wheeler Aligner FastQ x2 to SAM
bwa.reference_sequence = referenceFile.replace(".fasta","")
bwa.operation = "mem -M"
bwa.input = Seq(filename1,filename2)
bwa.output = runPath + paBWAAligned + ".sam"
bwa.readGroups = "@RG" + "'\\t'" +
"ID:" + instrument + "_" + instrumentrun + "_" +