forked from knights-lab/BURST
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathburst.c
5164 lines (4859 loc) · 203 KB
/
burst.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* BURST aligner -- fast optimal aligner by Gabe.
Copyright (C) 2015-2018 Knights Lab, Regents of the University of Minnesota.
This software is released under the GNU Affero General Public License (AGPL) v3.0.
*/
#define VER "v1.0"
#define _LARGEFILE_SOURCE_
#define FILE_OFFSET_BITS 64
#include <stdio.h>
#include <inttypes.h>
#include <float.h>
#include <string.h>
#include <limits.h>
#include <stdlib.h>
#include <time.h>
// Ensure ability to read files > 4GB on Windows, or mmap on Unix
#ifdef _WIN32
#include <windows.h>
#define fseek _fseeki64
#define ftell _ftelli64
#else
#include <sys/stat.h>
#include <sys/mman.h>
#endif
// OpenMP is the multi-threading protocol used by BURST
#ifdef _OPENMP
#include <omp.h>
#define HAS_OMP 1
#define getNumberOfCores() omp_get_max_threads()
#else
#define HAS_OMP 0
#define omp_get_thread_num() (0)
#define getNumberOfCores() (1)
#define omp_get_wtime() ((double)clock()/CLOCKS_PER_SEC)
#endif
// This is the header that includes the assembly intrinsic functions
#include <immintrin.h>
#ifdef __AVX__
#define ASMVER "AVX-128"
#elif __SSE4_1__
#define ASMVER "SSE4.1"
#else
// These redefinitions of SSE4+ intrinsics are necessary on all other (non-SSE4)
// instruction sets. This allows functions that use SSE4 functions to still
// run, albeit with slight performance penalty.
#define _mm_popcnt_u64 __builtin_popcountll
#define _mm_blendv_epi8(f,t,m) \
_mm_xor_si128(_mm_and_si128(_mm_xor_si128(f,t),m),f)
#define _mm_blendv_ps(f,t,m) \
_mm_xor_ps(_mm_and_ps(_mm_xor_ps(f,t),m),f)
#define _mm_cvtepu8_epi32(x) \
_mm_unpacklo_epi16(_mm_unpacklo_epi8(x,_mm_setzero_si128()),_mm_setzero_si128())
#define _mm_extract_epi8(a,x) \
(0xff & (_mm_extract_epi16(a,((x)>>1))>>(8*((x) & 0x01))))
#define _mm_stream_load_si128 _mm_load_si128
#ifdef __SSSE3__
#define ASMVER "SSSE3"
#else
// With SSE2 (no longer officially supported), even more redefinitions
// are necessary to perform assembly functions including blends and lt/gt
#define ASMVER "SSE2"
#define _mm_cmple_epu16(x,y) \
_mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128())
#define _mm_blendv_si128 (x, y, mask) \
_mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y))
#define _mm_min_epu16(x,y) \
_mm_blendv_si128(y, x, _mm_cmple_epu16(x, y))
#define _mm_lddqu_si128 _mm_loadu_si128
#endif
#endif
#include "math.h"
// This enum type defines all possible running modes in -m. These
// are tested later in functions that change their behavior based on
// which alignment mode is used (see do_alignments()).
typedef enum {
FORAGE, BEST, ALLPATHS, CAPITALIST, ANY,
MATRIX, PAIRED, MAPPED, INLINE, COMMUNIST
} Mode;
// This enum type defines all of the database creation modes supported by BURST
typedef enum {DNA_16, DNA_8, DNA_4, PROT, DATA, QUICK} DBType;
Mode RUNMODE = CAPITALIST; // The default alignment run mode is now CAPITALIST; (-m)
uint32_t DO_PREPASS = 0; // Prepass heuristic is not enabled by default; (-p)
uint32_t LATENCY = 16; // The max length difference allowed between seqs within a DB cluster
int THREADS = 1; // By default, BURST uses one thread, but this is overriden by OpenMP env vars; (-t)
int cacheSz = 150; // How many rows of the alignment matrix to remember. More = more RAM bandwidth
int Xalpha = 0; // Whether to align with arbitrary symbol exact matching; (-x)
int REBASE = 0; // Size of shears to use when creating a database
int DO_FP = 0; // Whether to use fingerprints for DB creation (or aligning); (-f)
int DO_ACCEL = 0; // Whether to create or use an accelerator (DB creation/alignment); (-a)
int DO_HEUR = 0; // Prevents demotion of sequences to non-accelerator alignment by identity; (-hr)
uint32_t TAXACUT = 10; // Ratio (to 1) of taxonomic agreements required for CAPITALIST interpolation; (-bc)
float THRES = 0.97f; // Identity/similarity threshold to restrict valid alignments; (-i)
long REBASE_AMT = 500, DB_QLEN = 500; // Stores shear length and max query length (DB creation); (-s); (-d [] qLen)
#define VECSZ 16
#ifndef SCOUR_N
#define SCOUR_N 15
#endif
#define SCOUR_R (32 - 2*SCOUR_N)
#define SCOUR_L 4
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define PRINT_USAGE() { \
printf("\nBURST aligner (" VER "; DB%d version)\n", SCOUR_N); \
printf("Compiled with " ASMVER " and%s multithreading\n", !HAS_OMP ? " WITHOUT" : "");\
printf("\nBasic parameters:\n");\
printf("--references (-r) <name>: FASTA/edx DB of reference sequences [required]\n");\
printf("--accelerator (-a) <name>: Creates/uses a helper DB (acc/acx) [optional]\n"); \
puts("--queries (-q) <name>: FASTA file of queries to search [required if aligning]");\
puts("--output (-o) <name>: Blast6/edb file for output alignments/database [required]");\
puts("\nBehavior parameters:"); \
puts("--forwardreverse (-fr): also search the reverse complement of queries"); \
puts("--whitespace (-w): write full query names in output (include whitespace)"); \
/*puts("--npenalize (-n): Force A,C,G,T,U,N,X in query to mismatch X,N in reference");*/ \
puts("--xalphabet (-x): Allow any alphabet and disable ambiguity matching");\
puts("--nwildcard (-y): Allow N,X to match anything (in query and reference)"); \
puts("--taxonomy (-b) <name>: taxonomy map (to interpolate, use -m CAPITALIST)");\
puts("--mode (-m) <name>: Pick an alignment reporting mode by name. Available modes:");\
puts(" BEST (report first best match by hybrid BLAST id)"); \
puts(" ALLPATHS (report all ties with same error profile)"); \
puts(" CAPITALIST (minimize set of references AND interpolate taxonomy) [default]"); \
/* puts(" COMMUNIST (min. ref. set, taxa interpolation, and fair redistribution)"); */ \
puts(" FORAGE (report all matches above specified threshold)"); \
puts(" ANY (report any valid hit above specified threshold)"); \
/*puts(" [not enabled]: PAIRED, MAPPED, INLINE");*/ \
puts("--makedb (-d) [name qLen]: Create a database from input references");\
printf(" [name]: Optional. Can be DNA, RNA, or QUICK [QUICK]\n");\
printf(" [qLen]: Optional. Max query length to search in DB [%d]\n",DB_QLEN); \
printf("\nPerformance parameters:\n"); \
printf("--dbpartition (-dp) <int>: Split DB making into <int> chunks (lossy) [%u]\n",1); \
printf("--taxacut (-bc) <num>: allow 1/<int> rank discord OR %% conf; 1/[%u]\n",TAXACUT); \
printf("--taxa_ncbi (-bn): Assume NCBI header format '>xxx|accsn...' for taxonomy\n"); \
printf("--skipambig (-sa): Do not consider highly ambiguous queries (5+ ambigs)\n"); \
printf("--taxasuppress (-bs) [STRICT]: Suppress taxonomic specificity by %%ID\n"); \
printf("--id (-i) <decimal>: target minimum similarity (range 0-1) [%.2f]\n",THRES);\
printf("--threads (-t) <int>: How many logical processors to use [%u]\n",THREADS);\
printf("--shear (-s) [len]: Shear references longer than [len] bases [%ld]\n",REBASE_AMT);\
/*printf("--unique (-u): Dereplicate references (lossless preprocessing)\n");*/ \
printf("--fingerprint (-f): Use sketch fingerprints to precheck matches (or cluster db)\n"); \
printf("--prepass (-p) [speed]: use ultra-heuristic pre-matching\n"); \
printf(" [speed]: Optional. Integer, maximum search effort [16]\n"); \
printf("--heuristic (-hr): allow relaxed comparison of low-id matches\n"); \
printf("--noprogress: suppress progress indicator\n"); \
/*printf("--cache (-c) <int>: Performance tweaking parameter [%d]\n",cacheSz); */ \
/* printf("--latency (-l) <int>: Performance tweaking parameter [%d]\n",LATENCY); */ \
/*printf("--clustradius (-cr) <int>: Performance tweaking parameter [auto]\n");*/ \
puts("\n--help (-h): Shows this help screen with version info"); \
puts("\nExample: burst -r myRefs.fasta -q myQs.fasta -o outputs.txt -i 0.98"); \
puts("\nLicensed under the GNU Affero General Public License v3.0"); \
exit(1); \
}
// Gap penalty is worth 1 mismatch. This is not alterable currently, as any other
// value would violate later optimality edit distance assumptions.
#define GAP 1
// The width of a vector (in bytes) on the target machine. This is also immutable.
#define SCD 16
// Internal handler for database versions, to allow recognition. EDB is obsolete.
#define EDB_VERSION 2
#define EDX_VERSION 3
// There are two accelerator versions. One stores 16 million sheared references,
// and the other (big/LARGE-format) stores up to 256 million sheared references.
#define ACC_VERSION 0
#define ACC_VERSION_BIG 1
char Z = 1; // Whether to penalize N's (and also the penalty itself); (-n)
// BAK and RVC are obsolute debug-only reverse complement handlers
char *BAK = "\0ACGTNKMRYSWBVHD\0"; //
char *RVC = "\0TGCANMKYRSWVBDH\0";
char RVT[] = {0,4,3,2,1,5,7,6,9,8,10,11,13,12,15,14}; // for reverse complementing
// 0,T,G,C,A,N,M,K,Y,R,S, W, V, B, D, H
__m128i GAPV;
__m128i SCOREFAST[16] = {0};
char SCORENVedN[] = { -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, //.
//0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
//. A C G T N K M R Y S W B V H D
-1,0,1,1,1,0,1,0,0,1,1,0,1,0,0,0, //A
-1,1,0,1,1,0,1,0,1,0,0,1,0,0,0,1, //C
-1,1,1,0,1,0,0,1,0,1,0,1,0,0,1,0, //G
-1,1,1,1,0,0,0,1,1,0,1,0,0,1,0,0, //T/U
-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //N/X
-1,1,1,0,0,0,0,1,1,1,1,1,0,1,1,0, //K
-1,0,0,1,1,0,1,0,1,1,1,1,1,0,0,1, //M
-1,0,1,0,1,0,1,1,0,1,1,1,1,0,1,0, //R
-1,1,0,1,0,0,1,1,1,0,1,1,0,1,0,1, //Y
-1,1,0,0,1,0,1,1,1,1,0,1,0,0,1,1, //S
-1,0,1,1,0,0,1,1,1,1,1,0,1,1,0,0, //W
-1,1,0,0,0,0,0,1,1,0,0,1,0,1,1,1, //B
-1,0,0,0,1,0,1,0,0,1,0,1,1,0,1,1, //V
-1,0,0,1,0,0,1,0,1,0,1,0,1,1,0,1, //H
-1,0,1,0,0,0,0,1,0,1,1,0,1,1,1,0, //D
};
char CHAR2NUM[128], CHAR2IX[32] = {0,1,12,2,15,5,5,3,14,5,5,6,5,7,5,5,
5,5,8,10,4,4,13,11,5,9,5,0,0,0,0,0};
typedef union {
__m128i v;
__m128 vf;
char c[16];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
int8_t i8[16];
int16_t i16[8];
int32_t i32[4];
int64_t i64[2];
float f[4];
double d[2];
} DualCoil;
typedef struct {
DualCoil sc, sh;
} SSMat;
typedef struct {
float score[16];
} Scores16;
typedef struct {uint32_t v, i;} Split;
typedef struct {char *s; uint32_t ix;} StrIxPair32;
typedef struct {char *s; uint64_t ix;} StrIxPair;
typedef struct {
float score[16];
uint32_t finalPos[16];
uint8_t numGapR[16], numGapQ[16];
} MetaPack;
typedef union {
uint8_t a[32];
uint16_t h[16];
uint32_t s[8];
uint64_t w[4];
} Prince;
// Structure to store sequence fingerprints
typedef struct {
Prince *P; // Prince
uint8_t *N; // Pops
void *initP;
uint64_t nf;
uint32_t *Ptrs;
} PackaPrince;
// One hash contains many references (accelerator lookup)
typedef struct {
uint32_t len, cap;
uint32_t *Refs;
} Accelerant;
typedef struct AccelNode AccelNode;
struct AccelNode {
AccelNode *next;
uint32_t ref;
};
// For sorting references by property
typedef struct {
char *seq;
uint32_t len, ix;
} Tuxedo;
// For intra-taxonomic cutoffs, use these similarity values
typedef struct {char *Head, *Tax;} TaxPair_t;
//K P C O F G S SS+
float TAXLEVELS_STRICT[] = {.65f, .75f, .78f, .82f, .86f, .94f, .98f, .995f},
TAXLEVELS_LENIENT[] = {.55f, .70f, .75f, .80f, .84f, .93f, .97f, .985f},
*TAXLEVELS = TAXLEVELS_LENIENT;
// Primary storage unit for each unique query
typedef struct {
char *Seq; //8
uint16_t div; //2
uint64_t six; //4
uint8_t rc; //1
} UniBin; //15
// Primary storage unit for shared query (RC) attributes
typedef struct {
uint32_t len; //4
uint16_t ed; //2
} ShrBin; //6
// Contains all info about references; for passing between functions
typedef struct {
char **RefHead, **RefSeq;
uint32_t *RefLen, *ClumpLen, *RefStart, *RefIxSrt, *TmpRIX, *RefDedupIx;
uint32_t totR, origTotR, numRclumps, maxLenR;
DualCoil **RefClump, **ProfClump;
// Fingerprint tools
Prince *Centroids;
PackaPrince FingerprintsR;
TaxPair_t *Taxonomy; // To store taxon and index
uint32_t taxa_parsed;
uint32_t clustradius; // Cluster refinement passes
uint32_t numRefHeads, *RefMap; // For easy reference name lookup
uint32_t *BadList, badListSz; // Stores ambiguous references in acx
void **Accelerators; // Used for constructing acx
DBType dbType; // Corresponds to second arg of '-d'
int cparts, skipAmbig;
uint64_t ForestSize;
uint64_t RawEDB_sz, RawAcc_sz;
int RawEDB_fno, RawAcc_fno;
char *RawEDB, *RawAcc;
} Reference_Data;
// Contains all info about queries; for passing between functions
typedef struct {
char **QHead, *SeqDumpRC;
uint64_t totQ, numUniqQ, *Offset, *QBins;
uint32_t maxLenQ, maxED, maxDiv, minLenQ;
int incl_whitespace, taxasuppress, rc, skipAmbig, quiet;
PackaPrince FingerprintsQ;
UniBin *UniBins;
ShrBin *ShrBins;
} Query_Data;
// Aligned memory allocation (for vectorized operations)
void * malloc_a(size_t algn, size_t size, void **oldPtr) {
uintptr_t mask = ~(uintptr_t)(algn - 1);
*oldPtr = malloc(size+algn-1);
return (void *)(((uintptr_t)*oldPtr+algn-1) & mask);
}
// Aligned (zerod) memory allocation (for vectorized operations)
void * calloc_a(size_t algn, size_t size, void **oldPtr) {
uintptr_t mask = ~(uintptr_t)(algn - 1);
*oldPtr = calloc(size+algn-1,1);
return (void *)(((uintptr_t)*oldPtr+algn-1) & mask);
}
// For sorting (comparing) structures by strings ('s') they contain
int cmpStrIx(const void *a, const void *b) {
return strcmp(((StrIxPair*)a)->s,((StrIxPair*)b)->s); }
// A generic function that sorts by strings in parallel
// Linear-time aggregation of common prefices, then N*logN sort within each
// To use, pass in Pack, len, and define structure type, num letters per block, and nib function
#define PARALLEL_SORT_PROTOTYPE(STRUCT_TYPE, STR_MEMBER, NUMLET, NIBFUNC) { \
static const uint32_t NLB = 1 << (NUMLET*4); \
STRUCT_TYPE **Bins = calloc(NLB,sizeof(*Bins)); \
uint32_t *BcountO = calloc(1+NLB,sizeof(*BcountO)), \
*Bcount = BcountO + 1; \
_Pragma("omp parallel for") \
for (uint32_t i = 0; i < len; ++i) { \
char *s = Pack[i].STR_MEMBER; \
uint32_t nib = NIBFUNC; \
_Pragma("omp atomic update") \
++Bcount[nib]; \
} \
for (uint32_t i = 0; i < NLB; ++i) if (Bcount[i]) \
Bins[i] = malloc(Bcount[i]*sizeof(*Bins[i])), \
Bcount[i] = 0; \
_Pragma("omp parallel for") \
for (uint32_t i = 0; i < len; ++i) { \
char *s = Pack[i].STR_MEMBER; \
uint32_t nib = NIBFUNC; \
uint32_t ix; \
_Pragma("omp atomic capture") \
ix = Bcount[nib]++; \
Bins[nib][ix] = Pack[i]; \
} \
int cmpFunc(const void *a, const void *b) { \
return strcmp(((STRUCT_TYPE *)a)->STR_MEMBER + NUMLET, \
((STRUCT_TYPE *)b)->STR_MEMBER + NUMLET); \
} \
_Pragma("omp parallel for schedule(dynamic,1)") \
for (uint32_t i = 0; i < NLB; ++i) if (Bins[i]) \
qsort(Bins[i],Bcount[i],sizeof(*Bins[i]),cmpFunc); \
--Bcount; \
for (uint32_t i = 2; i <= NLB; ++i) Bcount[i] += Bcount[i-1]; \
_Pragma("omp parallel for schedule(dynamic,1)") \
for (uint32_t i = 0; i < NLB; ++i) { \
uint32_t init = Bcount[i], ed = Bcount[i+1]; \
for (uint32_t pt = init; pt < ed; ++pt) \
Pack[pt] = Bins[i][pt - init]; \
free(Bins[i]); \
} \
free(Bins), free(BcountO); \
}
// For initial sorting/splitting of strings into prefix buckets
#define NIB4 s[0] << 12 | s[1] << 8 | s[2] << 4 | s[3]
#define NIB5 s[0] << 16 | s[1] << 12 | s[2] << 8 | s[3] << 4 | s[4]
// Some instantiations of the generic sort for different structs
static inline void parallel_sort_strpack(StrIxPair *Pack, uint32_t len)
PARALLEL_SORT_PROTOTYPE(StrIxPair, s, 5, NIB5)
static inline void parallel_sort_unibin(UniBin *Pack, uint32_t len)
PARALLEL_SORT_PROTOTYPE(UniBin, Seq, 5, NIB5)
static inline void parallel_sort_tuxedo(Tuxedo *Pack, uint32_t len) {
int tuxCmp(const void *a, const void *b) {
Tuxedo *A = (Tuxedo *)a, *B = (Tuxedo *)b;
char *s1 = A->seq, *s2 = B->seq;
uint32_t len1 = A->len, len2 = B->len, ml = MIN(len1,len2);
uint32_t i = 5;
for (; i < ml; ++i) if (s1[i] != s2[i]) break;
if (i < ml) return s1[i] - s2[i];
return len1 < len2 ? -1 : 1;
}
void tuxSort(Tuxedo *a, size_t n, size_t s)
{qsort(a,n,s,tuxCmp);}
#define qsort(a,n,s,c) tuxSort(a,n,s)
PARALLEL_SORT_PROTOTYPE(Tuxedo, seq, 5, NIB5)
#undef qsort
}
// Taxonomy lookup by binary search
char NULLTAX[1] = {0};
static char * taxa_lookup_generic(char *key, uint32_t sz, TaxPair_t *Dict) {
TaxPair_t *p = Dict;
while (sz) {
uint32_t w = sz >> 1;
char *ref_s = p[w+1].Head, *key_s = key;
while (*ref_s == *key_s++) if (!*ref_s++) return p[w+1].Tax;
if (*ref_s < *(key_s-1)) { p+=w+1; sz-=w+1; }
else sz = w;
}
char *ref_s = p->Head, *key_s = key;
while (*ref_s == *key_s++) if (!*ref_s++) return p->Tax;
return NULLTAX; //return p->Tax;
}
// Specialized version of the above for NCBI assembly formats ('>xxx|accsn...')
static char * taxa_lookup_ncbi(char *key, uint32_t sz, TaxPair_t *Dict) {
TaxPair_t *p = Dict;
int in = 0;
while (sz) {
uint32_t w = sz >> 1;
char *ref_s = p[w+1].Head, *key_s = key + 4;
while (*ref_s == *key_s++) if (!*ref_s++) return p[w+1].Tax;
if (*(key_s-1) == '.' && !*ref_s) return p[w+1].Tax;
char keyLast = *(key_s-1) == '.' ? 0 : *(key_s-1);
if (*ref_s < keyLast) { p+=w+1; sz-=w+1; }
else sz = w;
}
char *ref_s = p->Head, *key_s = key + 4;
while (*ref_s == *key_s++) if (!*ref_s++) return p->Tax;
if (*(key_s-1) == '.' && !*ref_s) return p->Tax;
return NULLTAX; //return p->Tax;
}
// Dynamic function pointer for taxa lookup; based on whether '-bn' specified
char * (*taxa_lookup)(char *, uint32_t, TaxPair_t *) = taxa_lookup_generic;
// Parses tab-delimited taxonomy file line-by-line into Tax_Pair_t object 'Obj'
// (stores header and taxonomy string separately). Returns number parsed.
size_t parse_taxonomy(char *filename, TaxPair_t **Obj) {
static const size_t linelen = 10000000; //1k entries, 10 meg line lines
size_t cur_sz = 1000, ns = 0;
FILE *file = fopen(filename,"rb");
if (!file)
{fprintf(stderr,"Cannot open TAXONOMY file: %s.\n",filename); exit(2);}
TaxPair_t *T = malloc(cur_sz * sizeof(*T));
char *line = calloc(linelen,1), *lineO = line;
while (line = fgets(line, linelen, file)) {
if (ns == cur_sz) { // double all data structures
T = realloc(T, (cur_sz*=2)*sizeof(*T));
if (!T) {fputs("OOM:T:parse_taxonomy\n",stderr); exit(3);}
}
uint32_t i, j;
for (i = 0; line[i] != '\t'; ++i)
if (!line[i]) {fprintf(stderr,"ERROR: invalid taxonomy [%u]\n",ns); exit(2);}
char *temp = malloc(i+1);
if (!temp) {fputs("OOM:temp:parse_taxonomy\n",stderr); exit(3);}
memcpy(temp,line,i), temp[i] = 0;
T[ns].Head = temp;
for (j = ++i; line[j] && line[j] != '\n' && line[j] != '\r' && line[j] != '\t'; ++j);
temp = malloc(j - i + 1);
if (!temp) {fputs("OOM:temp:parse_taxonomy\n",stderr); exit(3);}
memcpy(temp,line + i, j-i), temp[j-i] = 0;
T[ns].Tax = temp;
//printf("[%u] Head = %s\nTax=%s\n",ns,T[ns].Head,T[ns].Tax);
++ns;
}
free(lineO);
*Obj = realloc(T, ns*sizeof(*T)); // shrink T
fclose(file);
return ns;
}
// Standard line-by-line fasta sequence parser. Grows a bunch of parallel
// arrays containing headers, sequences, and their lengths. Returns number parsed.
// Used in standard reference database creation. No longer used for query parsing
size_t parse_tl_fasta(char * filename, char ***HeadersP, char ***SeqsP, uint32_t **LengthsP) {
static const size_t linelen = INT32_MAX; //1k entries, 2-gig lines
FILE *file = fopen(filename,"rb");
if (!file) {
fprintf(stderr,"Cannot open FASTA file: %s.\n",filename); exit(2); }
size_t cur_sz = 1000, ns = -1, len16;
char **Headers = malloc(cur_sz * sizeof(*Headers)),
**Seqs = malloc(cur_sz * sizeof(*Seqs));
uint32_t *Lengths = malloc(cur_sz * sizeof(*Lengths));
char *line = malloc(linelen), *lineO = line;
if (!Headers || !Seqs || !Lengths || !line) {
fputs("OOM:parse_tl_fasta\n",stderr); exit(3); }
int lastHd = 0;
while (line = fgets(line, linelen, file)) {
size_t len = strlen(line); // Kill newlines
if (len && line[len-1] == '\n') --len;
if (len && line[len-1] == '\r') --len;
line[len] = 0;
switch (*line) {
case '>': // We could be in the (a) header.
if (lastHd) break;
if (ns++ == cur_sz) { // double all data structures
cur_sz += cur_sz;
Headers = realloc(Headers, cur_sz*sizeof(*Headers));
Seqs = realloc(Seqs, cur_sz*sizeof(*Seqs));
Lengths = realloc(Lengths, cur_sz*sizeof(*Lengths));
if (!Headers || !Seqs || !Lengths) {
fputs("OOM:parse_tl_fasta\n",stderr); exit(3); }
}
lastHd = 1;
Headers[ns] = memcpy(malloc(len), line+1, len);
Lengths[ns] = 0;
case '\0': case ' ': break;
default: // we're in sequence (hopefully!)
lastHd = 0;
uint32_t a = Lengths[ns] + len;
len16 = a + (15 & (16 - (a & 15)));
if (!Lengths[ns]) Seqs[ns] = malloc(len16+1);
else Seqs[ns] = realloc(Seqs[ns],len16+1);
memcpy(Seqs[ns] + Lengths[ns],line,len);
Lengths[ns] += len;
memset(Seqs[ns]+Lengths[ns],'\0',len16-Lengths[ns]+1); // trailing nil
}
}
if (lastHd) puts("WARNING: file ends on header. Skipping last sequence."), --ns;
free(lineO);
Headers = realloc(Headers,++ns*sizeof(*Headers)), Seqs = realloc(Seqs,ns*sizeof(*Seqs));
Lengths = realloc(Lengths,ns*sizeof(*Lengths));
*HeadersP = Headers; *SeqsP = Seqs; *LengthsP = Lengths;
if (ns >= UINT32_MAX) puts("WARNING: >4 billion sequences processed.");
return ns;
}
// Variation of the above that internally arranges sequences contiguously in
// memory. Instead of sequences themselves, an offset pointer is returned.
// This specialized function is intended for database creation which requires
// packed references for vectorization and deduplication purposes.
size_t parse_tl_fasta_db(char *ref_FN, char ***HeadersP, char **SeqsP,
char ***OffsP, uint64_t *numLet) {
size_t linelen = INT32_MAX;
size_t cur_len = linelen, cur_sz = 1000;
FILE *file = fopen(ref_FN,"rb");
if (!file)
{ fprintf(stderr,"Cannot open FASTA file: %s.\n",ref_FN); exit(2); }
char **Headers = malloc(cur_sz * sizeof(*Headers)),
*Seqs = malloc(cur_len * sizeof(*Seqs));
char **Offsets = malloc(cur_sz * sizeof(*Offsets));
uint64_t cur_off = -1, ns = -1;
char *line = malloc(linelen), *lineO = line;
if (!line) {fputs("OOM:ptfdLn\n",stderr); exit(3);}
int lastHd = 0;
while (line = fgets(line, linelen, file)) {
size_t len = strlen(line); // kill newlines
if (line[len-1] == '\n') --len;
if (line[len-1] == '\r') --len;
line[len] = 0;
switch (*line) {
case '>': // We could be in the (a) header.
if (lastHd) break;
if (++ns == cur_sz) { // double all data structures
cur_sz += cur_sz;
Headers = realloc(Headers, cur_sz*sizeof(*Headers));
Offsets = realloc(Offsets, cur_sz*sizeof(*Offsets));
if (!Headers || !Offsets) {
fputs("OOM:db_parse",stderr); exit(3); }
}
lastHd = 1;
Headers[ns] = memcpy(malloc(len), line+1, len);
Offsets[ns] = (char*)++cur_off; // insert null post-seq
case '\0': case ' ': break;
default: // we're in sequence (hopefully!)
lastHd = 0;
if (cur_off + len + 1 >= cur_len) { //
Seqs = realloc(Seqs, (cur_len*=2)*sizeof(*Seqs));
if (!Seqs) {fputs("OOM:db_parse",stderr); exit(3);}
} // guaranteed to be enough -- but what about a while loop
// now dump the current sequence bits into the trunk
//update cur_off
memcpy(Seqs + cur_off,line,len+1); //+1 for null
cur_off += len;
}
}
// add tail offset at ++cur_off
// reallocate data structures with one extra space (tail)
if (lastHd) puts("WARNING: file ends on header. Skipping last sequence."), --ns;
free(lineO);
Headers = realloc(Headers, (++ns+1)*sizeof(*Headers));
Offsets = realloc(Offsets, (ns+1)*sizeof(*Offsets));
Seqs = realloc(Seqs,++cur_off+16); // ensure alignment of tail
if (!Headers || !Offsets || !Seqs) {
fputs("Error OOM terminus p1",stderr); exit(3); }
memset(Seqs+cur_off,'\0',15);
for (uint64_t i = 0; i < ns; ++i)
Offsets[i] = (uint64_t)Offsets[i] + Seqs;
Offsets[ns] = Seqs + cur_off;
Headers[ns] = 0; // just a tailcap
// return the proper data now
*HeadersP = Headers; *SeqsP = Seqs; *OffsP = Offsets;
*numLet = cur_off;
return ns;
}
// Finds the next newline character. Only for when the sequence
// is guaranteed not to terminate before next newline.
// Accessory function for the fast fasta parser below.
static inline char * findNL_abs(char *a) {
__m128i nl = _mm_set1_epi8('\n');
while (1) {
__m128i v = _mm_lddqu_si128((void*)a);
__m128i e = _mm_cmpeq_epi8(v,nl);
uint16_t x = _mm_movemask_epi8(e);
if (x) return a + __builtin_ctz(x);
a += 16;
}
}
// General version of the above that stops also at nulls.
static inline char * findNL_or_eol(char *a) {
__m128i nil = _mm_set1_epi8(0), nl = _mm_set1_epi8('\n');
while (1) {
__m128i v = _mm_lddqu_si128((void*)a);
__m128i e = _mm_cmpeq_epi8(v,nl);
uint16_t x = _mm_movemask_epi8(e);
if (x) return a + __builtin_ctz(x);
__m128i n = _mm_cmpeq_epi8(v,nil);
uint16_t z = _mm_movemask_epi8(n);
if (z) return a + __builtin_ctz(z);
a += 16;
}
}
// Faster fasta parser intended for large query files. Reads in entire
// queries file, then indexes into it directly to access queries.
size_t parse_tl_faster(char * filename, char ***HeadersP, char ***SeqsP, uint32_t **LengthsP) {
FILE *file = fopen(filename,"rb");
if (!file) {
fprintf(stderr,"Cannot open FASTA file: %s.\n",filename); exit(2); }
uint64_t sz = 0;
fseeko(file,0,SEEK_END); sz = ftello(file); rewind(file);
double wt = omp_get_wtime();
char *dump = malloc(16+sz);
uint64_t bytesRead = fread(dump, 1, sz, file);
memset(dump+sz,0,16);
if (*dump != '>') {fputs("ERROR: Malformatted FASTA file.\n",stderr); exit(1);}
wt = omp_get_wtime();
uint64_t numNL = 0, numLT = 0;
#pragma omp parallel for simd reduction(+:numNL,numLT)
for (uint64_t i = 0; i < sz; ++i)
numNL += dump[i]=='\n',
numLT += dump[i]=='>';
numNL += numNL & 1;
if (numLT != numNL/2) {fputs("ERROR: line count != '>' * 2\n",stderr); exit(1);}
char **Headers = malloc(numLT * sizeof(*Headers)),
**Seqs = malloc(numLT * sizeof(*Seqs));
uint32_t *Lengths = malloc(numLT * sizeof(*Lengths));
if (!Headers || !Seqs || !Lengths) {
fputs("OOM:parse_tlX_fasta\n",stderr); exit(3); }\
wt = omp_get_wtime();
Headers[0] = dump+1;
char *nl = findNL_abs(Headers[0]);
*nl = 0;
Seqs[0] = nl + 1;
nl = findNL_or_eol(Seqs[0]);
*nl = 0;
Lengths[0] = nl - Seqs[0];
uint64_t ix = 1;
#pragma omp parallel for
for (uint64_t i = (nl-dump); i < sz; ++i) if (dump[i]=='>') {
uint64_t tix;
#pragma omp atomic capture
tix = ix++;
char *s = dump + i + 1;
Headers[tix] = s;
char *nl = findNL_abs(s);
*nl = 0;
if (*(nl-1) == '\r') *(nl-1) = 0;
s = nl + 1;
Seqs[tix] = s;
nl = findNL_or_eol(s);
*nl = 0;
if (*(nl - 1) == '\r') *--nl = 0;
Lengths[tix] = nl - s;
}
if (ix > UINT32_MAX) puts("Note: more than 4 billion queries.");
*HeadersP = Headers; *SeqsP = Seqs; *LengthsP = Lengths;
return ix;
}
/// Prototypes for nucleotide-nucleotide scoring functions.
// An alphabet-agnostic scorer (for "xalpha" mode) penalizing inequality
// Because of SSE vectorization constraints, must flip from "true" (-1)
// to zero-penalty (0) with overflow, and false (0) to penalty (1)
#define DIAGSC_XALPHA _mm_add_epi8(_mm_cmpeq_epi8(_mm_set1_epi8(qLet), \
rChunk),_mm_set1_epi8(1))
// SSSE3 shuffle is used as a single-instruction score lookup per letter
#ifdef __SSSE3__
#define DIAGSC_MAT16 _mm_shuffle_epi8(SCOREFAST[qLet], rChunk)
#else
// If SSSE3 isn't available, operate on pre-computed score profiles
#define DIAGSC_MAT16 _mm_load_si128((void*)(profile + (x-1)*SCD + qLet))
#endif
// Scoring function for hybrid edit-distance BLAST-id sequence comparison.
// Intended to "re-score" within an already-determined set of initial alignments,
// it is intended to break ties by BLAST-id as well as populate the alignment
// result with more statistics about the alignment (number of gaps, etc).
// The setup of the alignment also differs from primary scorers (further below)
// in that it is not designed to allow 'seeking' into the alignment matrix, so
// it optimizes calculation from the first row.
#define RESCOREM_PROTYPE(DIAG_FUNC) {\
uint32_t y, x; \
--query; --ref; ++qlen; /* ++rwidth; */ \
/* __m128i maxEDv = _mm_set1_epi8(maxED+1 < 255 ? maxED+1 : 255); for checking if BAD via score >= maxED+1 */ \
__m128i maxEDv = _mm_set1_epi8(maxED+1); \
DualCoil *restrict prevSc = Matrix + width, *restrict prevSh = Shifts + width, \
*restrict prevShR = ShiftR + width, *restrict curSc = Matrix, *restrict curSh = Shifts, \
*restrict curShR = ShiftR; \
uint32_t LB = 1, HB = rwidth, LBN = 1, HBN = rwidth; \
{ /* Iteration 1 only */ \
_mm_store_si128((void*)(curSh),_mm_setzero_si128()); \
_mm_store_si128((void*)(curShR),_mm_set1_epi8(1)); \
_mm_store_si128((void*)(curSc),_mm_set1_epi8(1)); /* column 0 */ \
char qLet = query[1]; \
for (x = 1; x < rwidth; ++x) {\
__m128i rChunk = _mm_load_si128((void*)(ref+x)); /* refRow += 16; */ \
__m128i curRow_x_1 = _mm_load_si128((void*)(curSc+(x-1))); \
__m128i score = DIAG_FUNC; \
/* test: if I'm a 1 and the left is a zero, give me a shift of 1 else 0 */ \
__m128i getshiftL = _mm_and_si128(_mm_cmpeq_epi8(score,_mm_set1_epi8(1)), \
_mm_cmpeq_epi8(curRow_x_1,_mm_setzero_si128())); \
__m128i shift = _mm_and_si128(getshiftL,_mm_set1_epi8(1)); /* its left shift will be 1 */ \
_mm_store_si128((void*)(curSc+x),score); \
_mm_store_si128((void*)(curSh+x),shift); \
_mm_store_si128((void*)(curShR+x),_mm_setzero_si128()); \
} \
} \
for (y=2; y < qlen; ++y) { \
LB = LBN, HB = HBN; \
LBN = 0; \
char qLet = query[y]; \
DualCoil *temp = curSc; curSc = prevSc; prevSc = temp; \
temp = curSh; curSh = prevSh; prevSh = temp; \
temp = curShR; curShR = prevShR; prevShR = temp; \
_mm_store_si128((void*)(curSh),_mm_setzero_si128()); \
__m128i newMin = _mm_set1_epi8(MIN(y,255)); \
_mm_store_si128((void*)(curSc),newMin); /* col 0 */ \
_mm_store_si128((void*)(curShR),newMin); \
for (x = LB; x < HB; ++x) { \
__m128i rChunk = _mm_load_si128((void*)(ref+x)); /* refRow += 16; */ \
__m128i prevRow_x = _mm_load_si128((void*)(prevSc+x)); \
__m128i prevShf_x = _mm_load_si128((void*)(prevSh+x)); \
__m128i prevShfR_x = _mm_load_si128((void*)(prevShR+x)); \
__m128i prevRow_x_1 = _mm_load_si128((void*)(prevSc+(x-1))); \
__m128i prevShf_x_1 = _mm_load_si128((void*)(prevSh+(x-1))); \
__m128i prevShfR_x_1 = _mm_load_si128((void*)(prevShR+(x-1))); \
__m128i curRow_x_1 = _mm_load_si128((void*)(curSc+(x-1))); \
__m128i curShf_x_1 = _mm_load_si128((void*)(curSh+(x-1))); \
__m128i curShfR_x_1 = _mm_load_si128((void*)(curShR+(x-1))); \
\
__m128i diagSc = DIAG_FUNC; \
__m128i scoreOld = prevRow_x_1; \
__m128i shift = prevShf_x_1; \
__m128i shiftR = prevShfR_x_1; \
__m128i score = _mm_adds_epu8(scoreOld, diagSc); \
__m128i scoreU = _mm_adds_epu8(prevRow_x, _mm_set1_epi8(GAP)); \
__m128i shiftU = prevShf_x; \
__m128i shiftRU = _mm_adds_epu8(prevShfR_x, _mm_set1_epi8(1)); \
__m128i scoreM = _mm_min_epu8(scoreU,score); \
__m128i shiftM = _mm_min_epu8(shiftU,shift); \
__m128i shiftU_le_shift = _mm_cmpeq_epi8(shiftM,shiftU); \
__m128i scoreU_eq_score = _mm_cmpeq_epi8(scoreU,score); \
__m128i scoreM_eq_score = _mm_cmpeq_epi8(score,scoreM); /* O <= U */ \
__m128i tiebreak = _mm_andnot_si128(shiftU_le_shift,scoreU_eq_score); \
__m128i condition = _mm_andnot_si128(tiebreak,scoreM_eq_score); \
shift = _mm_blendv_epi8(shiftU,shift,condition); \
shiftR = _mm_blendv_epi8(shiftRU,shiftR,condition); \
score = scoreM; \
\
/* consider L */ \
__m128i scoreLold = curRow_x_1; \
__m128i shiftLold = curShf_x_1; \
__m128i shiftRLold = curShfR_x_1; \
__m128i scoreL = _mm_adds_epu8(scoreLold,_mm_set1_epi8(GAP)); \
__m128i shiftL = _mm_adds_epu8(shiftLold,_mm_set1_epi8(1)); \
__m128i shiftRL = shiftRLold; \
scoreM = _mm_min_epu8(scoreL,score); \
shiftM = _mm_min_epu8(shiftL,shift); \
__m128i shiftL_le_shift = _mm_cmpeq_epi8(shiftM,shiftL); \
__m128i scoreL_eq_score = _mm_cmpeq_epi8(scoreL, score); \
scoreM_eq_score = _mm_cmpeq_epi8(score,scoreM); /* O <= U */ \
tiebreak = _mm_andnot_si128(shiftL_le_shift,scoreL_eq_score); \
condition = _mm_andnot_si128(tiebreak,scoreM_eq_score); \
\
shift = _mm_blendv_epi8(shiftL,shift,condition); \
shiftR = _mm_blendv_epi8(shiftRL,shiftR,condition); \
score = scoreM; \
\
/* Do bounds handling */ \
__m128i anyBad = _mm_cmpeq_epi8(maxEDv,_mm_min_epu8(maxEDv,score)); \
score = _mm_or_si128(anyBad,score); \
if (_mm_movemask_epi8(anyBad) != 0xFFFF) { \
if (!LBN) LBN = x; \
HBN = x; \
} \
_mm_store_si128((void*)(curSc+x),score); \
_mm_store_si128((void*)(curSh+x),shift); \
_mm_store_si128((void*)(curShR+x),shiftR); \
} \
if (!LBN) { \
printf("\nCRITICAL ERROR: Truncation within known good path.\n"); \
printf("--> maxED = %u, y = %u, qlen=%u\n", maxED, y, qlen); \
exit(1); \
} \
LBN+=y>maxED, ++HBN; \
_mm_store_si128((void*)(curSc+HBN),_mm_set1_epi8(-1)); /* max the right block */ \
_mm_store_si128((void*)(prevSc+LBN - 1), _mm_set1_epi8(-1)); /* max left of new */ \
HBN += HBN < rwidth; \
} \
\
/* Do scores (hybrid) */ \
__m128i curShfV = _mm_setzero_si128(), curShfRV = _mm_setzero_si128(), \
minIntV = _mm_set1_epi8(-1); \
for (uint32_t i = LB; i < HB; ++i) { \
__m128i score = _mm_load_si128((void*)(curSc + i)), \
shift = _mm_load_si128((void*)(curSh + i)), \
shiftR = _mm_load_si128((void*)(curShR + i)); \
__m128i scoreM = _mm_min_epu8(score, minIntV); \
__m128i shiftM = _mm_min_epu8(shift, curShfV); \
\
__m128i shift_le_curShfV = _mm_cmpeq_epi8(shiftM,shift); \
__m128i score_eq_minIntV = _mm_cmpeq_epi8(score, minIntV); \
__m128i scoreM_eq_minIntV = _mm_cmpeq_epi8(scoreM,minIntV); \
__m128i tiebreak = _mm_andnot_si128(shift_le_curShfV,score_eq_minIntV); \
__m128i condition = _mm_andnot_si128(tiebreak,scoreM_eq_minIntV); \
\
curShfV = _mm_blendv_epi8(shift,curShfV,condition); \
curShfRV = _mm_blendv_epi8(shiftR,curShfRV,condition); \
minIntV = scoreM; \
} \
\
__m128 QLm1 = _mm_set1_ps(qlen - 1); \
__m128 sc = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(minIntV)); \
__m128 sh = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(curShfV)); \
sc = _mm_sub_ps(_mm_set1_ps(1), _mm_div_ps(sc,_mm_add_ps(QLm1,sh))); \
_mm_stream_ps(M16->score,sc); \
__m128 sc2 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(minIntV,4))); \
__m128 sh2 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(curShfV,4))); \
sc2 = _mm_sub_ps(_mm_set1_ps(1), _mm_div_ps(sc2,_mm_add_ps(QLm1,sh2))); \
_mm_stream_ps(M16->score+4,sc2); \
__m128 sc3 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(minIntV,8))); \
__m128 sh3 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(curShfV,8))); \
sc3 = _mm_sub_ps(_mm_set1_ps(1), _mm_div_ps(sc3,_mm_add_ps(QLm1,sh3))); \
_mm_stream_ps(M16->score+8,sc3); \
__m128 sc4 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(minIntV,12))); \
__m128 sh4 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(curShfV,12))); \
sc4 = _mm_sub_ps(_mm_set1_ps(1), _mm_div_ps(sc4,_mm_add_ps(QLm1,sh4))); \
_mm_stream_ps(M16->score+12,sc4); \
\
/* calculate alignment index */ \
__m128i t = _mm_set1_epi32(255); \
__m128i I1 = _mm_set1_epi32(-1), I2 = _mm_set1_epi32(-1), \
I3 = _mm_set1_epi32(-1), I4 = _mm_set1_epi32(-1); \
for (uint32_t i = LB; i < HB; ++i) { \
__m128i score = _mm_load_si128((void*)(curSc + i)), \
shift = _mm_load_si128((void*)(curSh + i)); \
__m128i isGood = _mm_and_si128(_mm_cmpeq_epi8(score,minIntV), \
_mm_cmpeq_epi8(shift,curShfV)); \
__m128i set1 = _mm_cmpeq_epi32(_mm_cvtepu8_epi32(isGood),t); \
__m128i set2 = _mm_cmpeq_epi32(_mm_cvtepu8_epi32(_mm_srli_si128(isGood,4)),t); \
__m128i set3 = _mm_cmpeq_epi32(_mm_cvtepu8_epi32(_mm_srli_si128(isGood,8)),t); \
__m128i set4 = _mm_cmpeq_epi32(_mm_cvtepu8_epi32(_mm_srli_si128(isGood,12)),t); \
I1 = _mm_blendv_epi8(I1,_mm_set1_epi32(i),set1); \
I2 = _mm_blendv_epi8(I2,_mm_set1_epi32(i),set2); \
I3 = _mm_blendv_epi8(I3,_mm_set1_epi32(i),set3); \
I4 = _mm_blendv_epi8(I4,_mm_set1_epi32(i),set4); \
} \
_mm_stream_si128((void*)(M16->finalPos),I1); \
_mm_stream_si128((void*)(M16->finalPos+4),I2); \
_mm_stream_si128((void*)(M16->finalPos+8),I3); \
_mm_stream_si128((void*)(M16->finalPos+12),I4); \
_mm_stream_si128((void*)M16->numGapR,curShfRV); \
_mm_stream_si128((void*)M16->numGapQ,curShfV); \
}
// Two implementations of primary re-scoring aligner for the alphabet-sensitive
// and alphabet agnostic cases, respectively
static inline void reScoreM_mat16(DualCoil *ref, char *query, uint32_t rwidth, uint32_t qlen, uint32_t width,
DualCoil *Matrix, DualCoil *Shifts, DualCoil *ShiftR, uint32_t maxED, DualCoil *profile, MetaPack *M16)
RESCOREM_PROTYPE(DIAGSC_MAT16)
static inline void reScoreM_xalpha(DualCoil *ref, char *query, uint32_t rwidth, uint32_t qlen, uint32_t width,
DualCoil *Matrix, DualCoil *Shifts, DualCoil *ShiftR, uint32_t maxED, DualCoil *profile, MetaPack *M16)
RESCOREM_PROTYPE(DIAGSC_XALPHA)
// Aligner specialized for the heuristic mode ('-p'). Differs from primary
// aligners (futher below) in that it doesn't enable seeking into the alignment
// matrix. It is hence a hybrid of 'rescoreM' (above) and 'aded' (below) aligners
inline uint32_t prune_ed_mat16(DualCoil *ref, char *query, uint32_t rwidth, uint32_t qlen,
uint32_t width, DualCoil *Matrix, DualCoil *profile, uint32_t maxED, uint8_t *MinA) {
uint32_t y, x;
__m128i maxEDv = _mm_set1_epi8(maxED+1); // ensure: maxED <= 254! /* BAD if score >= maxED+1 */
--query, --ref, ++qlen, ++rwidth;
//maxED = maxED < qlen ? maxED : qlen;
uint32_t LB = 1, HB = rwidth + maxED - qlen + 2, LBN, HBN;
HB = MIN(rwidth, HB);
DualCoil *restrict prev = Matrix + width, *restrict cur = Matrix;
_mm_store_si128((void*)(cur),_mm_set1_epi8(1));
char qLet = query[1];
for (x = 1; x < HB; ++x) {
__m128i rChunk = _mm_load_si128((void*)(ref+x));
__m128i score = DIAGSC_MAT16;
_mm_store_si128((void*)(cur+x),score);
}
_mm_store_si128((void*)(cur+HB),_mm_set1_epi8(-1));
HB += HB < rwidth;
for (y = 2; y <= maxED; ++y) {
char qLet = query[y];
DualCoil *temp = cur; cur = prev, prev = temp;
_mm_store_si128((void*)(cur),_mm_set1_epi8(MIN(y,255))); /* column 0 */
for (x = 1; x < HB; ++x) {
__m128i rChunk = _mm_load_si128((void*)(ref+x));
__m128i prevRow_x = _mm_load_si128((void*)(prev+x));
__m128i prevRow_x_1 = _mm_load_si128((void*)(prev+(x-1)));
__m128i curRow_x_1 = _mm_load_si128((void*)(cur+(x-1)));
__m128i diagSc = DIAGSC_MAT16;
__m128i score = _mm_adds_epu8(prevRow_x_1, diagSc);
__m128i scoreU = _mm_adds_epu8(prevRow_x, _mm_set1_epi8(GAP));
score = _mm_min_epu8(scoreU,score);
__m128i scoreL = _mm_adds_epu8(curRow_x_1,_mm_set1_epi8(GAP));
score = _mm_min_epu8(scoreL,score);
_mm_store_si128((void*)(cur+x),score);
}
_mm_store_si128((void*)(cur+HB),_mm_set1_epi8(-1)); /* kill the right block */
HB += HB < rwidth;
}
HBN = HB; LBN = 1;
for (; y < qlen; ++y) {
LB = LBN, HB = HBN;
LBN = 0;
char qLet = query[y];
DualCoil *temp = cur; cur = prev, prev = temp;
_mm_store_si128((void*)(cur),_mm_set1_epi8(MIN(y,255)));
for (x = LB; x < HB; ++x) {
__m128i rChunk = _mm_load_si128((void*)(ref+x));
__m128i prevRow_x = _mm_load_si128((void*)(prev+x));
__m128i prevRow_x_1 = _mm_load_si128((void*)(prev+(x-1)));
__m128i curRow_x_1 = _mm_load_si128((void*)(cur+(x-1)));
__m128i diagSc = DIAGSC_MAT16;
__m128i score = _mm_adds_epu8(prevRow_x_1, diagSc);
__m128i scoreU = _mm_adds_epu8(prevRow_x, _mm_set1_epi8(GAP));
score = _mm_min_epu8(scoreU,score);
__m128i scoreL = _mm_adds_epu8(curRow_x_1,_mm_set1_epi8(GAP));
score = _mm_min_epu8(scoreL,score);
/* Do bounds handling */
__m128i anyBad = _mm_cmpeq_epi8(maxEDv,_mm_min_epu8(maxEDv,score));
score = _mm_or_si128(anyBad,score);
if (_mm_movemask_epi8(anyBad) != 0xFFFF) {
if (!LBN) LBN = x;
HBN = x;
}
_mm_store_si128((void*)(cur+x),score);
}
if (!LBN) return -1;
++LBN; /* we guarantee the bounds will close in */
++HBN; /* since it'll bound the 'for (...x < rightBound; ++x)' */
_mm_store_si128((void*)(cur+HBN),_mm_set1_epi8(-1)); /* kill the right block */
/*if (LBN > 1)*/
_mm_store_si128((void*)(prev+LBN - 1), _mm_set1_epi8(-1)); /* kill left of new */
HBN += HBN < rwidth;
}
/* Vectorized min reduction */
__m128i minIntV = _mm_set1_epi8(-1);
for (uint32_t i = LB; i < HB; ++i)
minIntV = _mm_min_epu8(minIntV,_mm_load_si128((void*)(cur+i)));
//__m128i mc = _mm_min_epu8(minIntV,_mm_set1_epi8(err_ceil));
//*minA = _mm_movemask_epi8(_mm_cmpeq_epi8(minIntV,mc));
_mm_store_si128((void*)MinA,minIntV);
__m128i c = _mm_srli_si128(minIntV,8);
minIntV = _mm_min_epu8(minIntV,c);
c = _mm_srli_si128(minIntV,4);
minIntV = _mm_min_epu8(minIntV,c);
c = _mm_srli_si128(minIntV,2);
minIntV = _mm_min_epu8(minIntV,c);
c = _mm_srli_si128(minIntV,1);
minIntV = _mm_min_epu8(minIntV,c);
return _mm_extract_epi8(minIntV,0); /* save min of mins as new bound */
}
// The primary high-speed alignment core in BURST. This is the stripped, speed-
// oriented aligner using the synergistic pruning and caching construct in
// the manuscript. It is designed to allow arbitrary seeking into the alignment
// matrix to bypass search space, in addition to narrowing the search space as