-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathrapidnj.h
898 lines (847 loc) · 38.2 KB
/
rapidnj.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
//
// rapidnj.h - RapidNJ and RapidBIONJ distance matrix tree construction.
//
// BoundingNJ implementation loosely based on ideas from
// https://birc.au.dk/software/rapidnj/.
// Paper: "Inference of Large Phylogenies using Neighbour-Joining."
// Martin Simonsen, Thomas Mailund, Christian N. S. Pedersen.
// Communications in Computer and Information Science
// (Biomedical Engineering Systems and Technologies:
// 3rd International Joint Conference, BIOSTEC 2010,
// Revised Selected Papers), volume 127, pages 334-344,
// Springer Verlag, 2011.
// Tag: [SMP2011].
// (but, optionally, using a variance matrix, as in BIONJ, and
// keeping the distance and variance matrices square -
// they're not triangular because
// (i) *read* memory access patterns are more favourable
// (ii) *writes* don't require conditional transposition
// of the row and column coordinates (but their
// access patterns aren't as favourable, but
// (iii) reads vastly outnumber writes)
// (there's no code yet for removing duplicated rows either;
// those that have distance matrix rows identical to earlier rows;
// Rapid NJ "hates" them) (this is also covered in section 2.5)
//
// The BoundingMatrix class adds branch-and-bound optimization
// to *other* distance matrix implementations. In this file, only
// to NJ and BIONJ, via the RapidNJ and RapidBIONJ classes.
//
// It sets up auxiliary S and I matrices. In each row:
// S = unadjusted distances to clusters that were in play
// when this row was set up (ascending order)
// I = the cluster indices that corresponded to each of
// the cells in S.
//
// Rows of S and I are sorted via a mergesort.
// (During matrix set up, a sequential mergesort, because
// row construction is parallelized; later, during
// clustering, a parallel one).
// (S is sorted, I is permuted to match).
// (S and I are the names of these matrices in [SMP2011]).
//
//Notes: 1.An SI matrix, of pair<T,int> would probably be better,
// as that could be sorted faster.
// 2.An adaptive row-sorting routine could be used,
// particularly if new SI rows (after cluster joins) were
// constructed (not merely in cluster index order) in an
// order "suggested" by the content of one (or both?) of the
// *existing* SI rows.
// (but... row sorting is only ~10% of running time)
//
// This file, created by James Barbetti on 31-Oct-2020.
// (But the bulk of the code in it was from bionj2.cpp,
// which dates back to 18-Jun-2020).
//
// LICENSE:
//* This program is free software; you can redistribute it and/or modify
//* it under the terms of the GNU General Public License as published by
//* the Free Software Foundation; either version 2 of the License, or
//* (at your option) any later version.
//*
//* This program is distributed in the hope that it will be useful,
//* but WITHOUT ANY WARRANTY; without even the implied warranty of
//* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//* GNU General Public License for more details.
//*
//* You should have received a copy of the GNU General Public License
//* along with this program; if not, write to the
//* Free Software Foundation, Inc.,
//* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
#ifndef rapidnj_h
#define rapidnj_h
#include "nj.h"
#include <utils/parallel_mergesort.h>
#include <utils/timeutil.h> //for getRealTime
namespace StartTree
{
/**
* @brief Applies branch-and-bound trickery (as per the
* RapidNJ paper - see details at the top of this file)
* @tparam T the distance type
* @tparam SUPER the superclass, that is to be "munged" into a branch-and-bound
* version that will (on average), run a lot faster. It is assumed that
* SUPER is NJMatrix, inherits from NJMatrix, or is
* "duck-type-equivalent" to NJMatrix.
* @note The mutable members are calculated repeatedly, from others, in
* member functions marked as const. They're declared at the class
* level so that they don't need to be reallocated over and over again.
* It's all about amortizing allocation and deallocation cost (the
* price paid for that is a slight reduction in readability. -James B)
* @note Mapping members to the RapidNJ papers:
* rows is the D matrix
* entriesSorted is the S matrix
* entryToCluster is the I matrix
* @note scaledMaxEarlierClusterTotal[c] is the largest row total for a
* cluster with a lower number, than cluster c (where c is the index
* and the cluster number) (if c indicates a cluster for which there
* are still rows in the distance matrix: call this an in-play cluster).
* This is a tighter bound, when searching for the minimum Qij...
* and processing distances from cluster c to earlier clusters,
* than the largest row total for ALL the live clusters.
* See section 2.5 of Simonsen, Mailund, Pedersen [2011].
* @note rowOrderChosen is a vector of int rather than bool because
* simultaneous non-overlapping random-access writes to elements of
* std::vector<bool> *can* interfere with each other (because
* std::vector<bool> maps multiple nearby elements onto bitfields,
* so the writes... *do* overlap) (ouch!).
* @note If _OPENMP is defined, it is assumed that the threadcount,
* as returned by omp_get_max_threads() won't change during
* execution! Or at least, it won't change between the construction
* of a BoundingMatrix<T,S> instance, and its destruction.
* @note several "hide the OMP" functions, getThreadCount() and
* getThreadNumber() should perhaps move to NJMatrix<T>.
*/
template <class T=NJFloat, class SUPER=BIONJMatrix<T>>
class BoundingMatrix: public SUPER
{
public:
typedef SUPER super;
typedef MirrorMergeSorter<T, int> Sorter;
protected:
using super::rows;
using super::row_count;
using super::rowMinima;
using super::rowTotals;
using super::rowToCluster;
using super::clusters;
using super::silent;
using super::isRooted;
using super::finishClustering;
using super::clusterDuplicates;
using super::prepareToConstructTree;
std::vector<intptr_t> clusterToRow; //Maps clusters to their rows (-1 means not mapped)
//for each cluster
std::vector<T> clusterTotals; //"Row" totals indexed by cluster
mutable std::vector<intptr_t> clusterPartners;//Counts the number of possible partners,
mutable std::vector<T> scaledClusterTotals; //The same, multiplied by
//(1.0 / (n-2)).
mutable std::vector<T> scaledMaxEarlierClusterTotal;
mutable std::vector<int> rowOrderChosen; //Indicates if a row's scanning
//order chosen has been chosen.
//Only used in... getRowScanningOrder().
mutable std::vector<size_t> rowScanOrder; //Order in which rows are to be scanned
//Only used in... getRowMinima().
SquareMatrix<T> entriesSorted; //The S matrix: Entries in distance matrix
//(each row sorted by ascending value)
SquareMatrix<int> entryToCluster;//The I matrix: for each entry in S, which
//cluster the row (that the entry came from)
//was mapped to (at the time).
int threadCount;
std::vector<Sorter> sorters;
public:
typedef T distance_type;
BoundingMatrix() : super() {
#ifdef _OPENMP
threadCount = omp_get_max_threads();
#else
threadCount = 1;
#endif
sorters.resize(threadCount);
}
/**
* @brief Get the (current) thread count (returns 1 if _OPENMP is
* not defined)
* @return intptr_t - the current rhead count
*/
intptr_t getThreadCount() const {
intptr_t t=1;
#ifdef _OPENMP
#pragma omp parallel
{
if (omp_get_thread_num()==0) {
t = omp_get_num_threads();
}
}
#endif
return t;
}
/**
* @brief Get the (current) threads thread number, between 0
* inclusive and getThreadCount() exclusive (well, so we
* hope! If the number of threads hasn't changed since
* we last called getThreadCount()! -James B)
* (returns 0 if _OPENMP is not defined)
* @return intptr_t - the current rhead count
*/
intptr_t getThreadNumber() const {
#ifdef _OPENMP
return omp_get_thread_num();
#else
return 0;
#endif
}
virtual std::string getAlgorithmName() const {
return "Rapid" + super::getAlgorithmName();
}
/**
* @brief Sets up auxiliary arrays, and infers a phylogenetic tree.
* @return true
* @return false
*/
virtual bool constructTree() {
//0. Ensure variance matrix is initialized
prepareToConstructTree(); //Need this if SUPER is BIONJMatrix
//1. Set up vectors indexed by cluster number,
clusterToRow.resize(row_count);
clusterTotals.resize(row_count);
clusterPartners.resize(row_count*2-2);
for (intptr_t r=0; r<row_count; ++r) {
clusterToRow[r] = static_cast<int>(r);
clusterTotals[r] = rowTotals[r];
}
//2. Set up "scratch" vectors used in getRowMinima
// so that it won't be necessary to reallocate them
// for each call.
scaledClusterTotals.resize(row_count);
scaledMaxEarlierClusterTotal.resize(row_count);
rowOrderChosen.resize(row_count);
rowScanOrder.resize(row_count);
{
#if USE_PROGRESS_DISPLAY
const char* taskName = silent
? "" : "Setting up auxiliary I and S matrices";
progress_display setupProgress(row_count, taskName, "sorting", "row");
#endif
//2. Set up the matrix with row sorted by distance
// And the matrix that tracks which distance is
// to which cluster (the S and I matrices, in the
// RapidNJ papers).
entriesSorted.setSize(row_count);
entryToCluster.setSize(row_count);
#ifdef _OPENMP
#pragma omp parallel num_threads(threadCount)
#endif
{
intptr_t threadNum = getThreadNumber();
for (intptr_t r=threadNum; r<row_count; r+=threadCount) {
sortRow(r,r,false,sorters[threadNum]);
//copies the "left of the diagonal" portion of
//row r from the D matrix and sorts it
//into ascending order.
#if USE_PROGRESS_DISPLAY
++setupProgress;
#endif
}
}
}
clusterDuplicates();
{
intptr_t nextPurge = (row_count+row_count)/3;
#if USE_PROGRESS_DISPLAY
std::string taskName;
if (!silent) {
taskName = "Constructing " + getAlgorithmName() + " tree";
}
double triangle = row_count * (row_count + 1.0) * 0.5;
progress_display show_progress(triangle, taskName.c_str(), "", "");
#endif
intptr_t degree_of_root = isRooted ? 2 : 3;
while (degree_of_root<row_count) {
Position<T> best;
super::getMinimumEntry(best);
cluster(best.column, best.row);
if ( row_count == nextPurge ) {
#ifdef _OPENMP
#pragma omp parallel num_threads(threadCount)
#endif
{
for (intptr_t r=getThreadNumber();
r<row_count; r+=threadCount) {
purgeRow(r);
}
}
nextPurge = (row_count + row_count)/3;
}
#if USE_PROGRESS_DISPLAY
show_progress+=row_count;
#endif
}
#if USE_PROGRESS_DISPLAY
show_progress.done();
#endif
finishClustering();
}
return true;
}
/**
* @brief Calculate row r of the S and I matrices, from row r
* of the D matrix.
* @param r - the row number
* @param c - upper bound on cluster index (but NOT a cluster index!)
* @param parallel - true if a the sort should be parallel (when running
* in code that was multi-threaded higher up, this should
* be false. When running in code that wasn't this could
* be true; it is up to the caller).
* @param sorter - a sorter (a sorting algorithm implementation, perhaps
* with memory allocated for auxiliary arrays that it uses
* to sort), that is "owned" by the current thread.
* @note
*/
void sortRow(size_t r /*row index*/, size_t c /*upper bound on cluster index*/
, bool parallel, Sorter& sorter) {
//1. copy data from a row of the D matrix into the S matrix
// (and write the cluster identifiers that correspond to
// the values in the D row into the same-numbered
// row in the I matrix), for distances between the cluster
// in that row, and other live clusters (up to, but not including c).
T* sourceRow = rows[r];
T* values = entriesSorted.rows[r];
int* clusterIndices = entryToCluster.rows[r];
intptr_t w = 0;
for (intptr_t i=0; i<row_count; ++i) {
values[w] = sourceRow[i];
clusterIndices[w] = static_cast<int>(rowToCluster[i]);
if ( i != r && clusterIndices[w] < c ) {
++w;
}
}
values[w] = infiniteDistance; //sentinel value, to stop row search
clusterIndices[w] = static_cast<int>(rowToCluster[r]);
//Always room for this, because distance to self
//was excluded via the i!=r check above.
//2. Sort the row in the S matrix and mirror the sort
// on the same row of the I matrix.
if (parallel) {
sorter.parallel_mirror_sort(values, w, clusterIndices);
} else {
sorter.single_thread_mirror_sort(values, w, clusterIndices);
}
clusterPartners[c] = w;
}
/**
* @brief Scan a row of the I matrix, so as to remove entries that refer
* to clusters that are no longer in-play. Remove the corresponding
* values in the same row of the S matrix, at the same time.
* @param r the row number
*/
void purgeRow(intptr_t r /*row index*/) const {
T* values = entriesSorted.rows[r];
int* clusterIndices = entryToCluster.rows[r];
size_t c = rowToCluster[r];
intptr_t w = 0;
intptr_t i = 0;
for (; i<row_count ; ++i ) {
values[w] = values[i];
clusterIndices[w] = clusterIndices[i];
if ( infiniteDistance <= values[i] ) {
break;
}
bool liveRow = ( clusterToRow[clusterIndices[i]] !=
notMappedToRow);
w += (liveRow ? 1 : 0);
}
if (w<row_count) {
values[w] = infiniteDistance;
}
clusterPartners[c] = w;
}
/**
* @brief Join the clusters (clusterA and clusterB, below) that correspond
* to rows a and b of the working distance matrix (0<=a<b<n, where
* n is the current rank of the working distance matrix).
* @param a the lower-numbered row
* @param b the higher-numbered row
* @note It is assumed a<b, but this is NOT checked.
* @note It is assumed that distances, for the merged cluster,
* will be written over the top of row a of the D
* matrix and the D matrix's rank (row count and column count),
* reduced by one, by super::cluster().
* Yes, this IS tight long-range coupling. -James B.
*/
virtual void cluster(intptr_t a, intptr_t b) {
size_t clusterA = rowToCluster[a];
size_t clusterB = rowToCluster[b];
size_t clusterMoved = rowToCluster[row_count-1];
clusterToRow[clusterA] = notMappedToRow;
clusterTotals[clusterA] = -infiniteDistance;
clusterToRow[clusterB] = notMappedToRow;
clusterTotals[clusterB] = -infiniteDistance;
size_t clusterC = clusters.size(); //cluster # of new cluster
super::cluster(a,b);
if (b<row_count) {
clusterToRow[clusterMoved] = static_cast<int>(b);
}
clusterToRow.emplace_back(a);
clusterTotals.emplace_back(rowTotals[a]);
scaledClusterTotals.emplace_back(rowTotals[a] / (T)( row_count - 1.0 ) );
scaledMaxEarlierClusterTotal.emplace_back((T)0.0);
//Mirror row rearrangement done on the D (distance) matrix
//(and possibly also on the V (variance estimate) matrix),
//onto the S and I matrices.
entriesSorted.removeRowOnly(b);
entryToCluster.removeRowOnly(b);
//Recalculate cluster totals.
for (size_t wipe = 0; wipe<clusterC; ++wipe) {
clusterTotals[wipe] = -infiniteDistance;
//A trick. This way we don't need to check if clusters
//are still "live" in the inner loop of getRowMinimum().
//When we are "subtracting" cluster totals to calculate
//entries in Q, they will come out so big they won't be
//considered as candidates for neighbour join.
//If we didn't do this we'd have to check, all the time,
//when calculating entries in Q, if clusters are still
//"live" (have corresponding rows in the D matrix).
}
for (intptr_t r = 0; r<row_count; ++r) {
size_t cluster_num = rowToCluster[r];
clusterTotals[cluster_num] = rowTotals[r];
}
sortRow(a, clusterC, true, sorters[0]);
}
/**
* @brief Derive a bound (on the best adjusted distance), by looking
* at the first entry (for a live cluster), in each row of
* the S matrix.
* @param qBest
* @note Since we always have to check these entries when we process
* the row, why not process them up front, hoping to get a
* better bound on min(V) (and perhaps even "rule" entire rows
* "out of consideration", using that bound)? -James B).
*/
void deriveBoundFromFirstColumn(T& qBest) const {
intptr_t rSize = rowMinima.size();
std::vector<T> qLocalBestVector;
qLocalBestVector.resize( threadCount, qBest);
T* qLocalBest = qLocalBestVector.data();
#ifdef _OPEN_MP
#pragma omp parallel for num_threads(threadCount)
#endif
for (int b=0; b<threadCount; ++b) {
T qBestForThread = qBest;
int b_plus_1 = (b + 1);
size_t rStart = (b*rSize) / threadCount;
size_t rStop = (b_plus_1*rSize) / threadCount;
for (size_t r=rStart; r < rStop
&& rowMinima[r].value < infiniteDistance; ++r) {
intptr_t rowA = rowMinima[r].row;
intptr_t rowB = rowMinima[r].column;
if (rowA < row_count && rowB < row_count ) {
size_t clusterA = rowToCluster[rowA];
size_t clusterB = rowToCluster[rowB];
T qHere = this->rows[rowA][rowB]
- scaledClusterTotals[clusterA]
- scaledClusterTotals[clusterB];
if (qHere < qBestForThread) {
qBestForThread = qHere;
}
}
}
qLocalBest[b] = qBestForThread;
}
for (int b=0; b<threadCount; ++b) {
if ( qLocalBest[b] < qBest ) {
qBest = qLocalBest[b];
}
}
}
/**
* @brief Partially sort the row minima (in parallel)
* @note The algorithm used here is roughly to compare and perhaps
* reorder rows in the first half of the input (rounded up)
* with rows in the second (rounded down) (in parallel!),
* then repeat...) to find the minimum.
* @note the point is that each compare-perhaps-reorder operation
* in a given "reduce the problem by half" pass is entirely
* independent of all the others, and they can be done in
* parallel.
* @note A "conditional-move" version might be faster than
* swapping conditionally (if (rowMinima[j] < rowMinima[i])).
* (I don't remember whether I tried that. -James B)
*/
void partiallySortRowMinima() const {
intptr_t rSize = rowMinima.size();
#ifdef _OPENMP
int threshold = threadCount << 7; /* multiplied by 128*/
#endif
//Note, rowMinima might have size 0 (the first time this member
//function is called during processing of a distance matrix)
//Or it might have a size of n+1 (later times), but it won't be n.
for ( intptr_t len = rSize; 1<len; len=(len+1)/2 ) {
intptr_t halfLen = len/2; //rounded down
intptr_t gap = len-halfLen;
#ifdef _OPENMP
#pragma omp parallel for if(threshold<halfLen) num_threads(threadCount)
#endif
for ( intptr_t i = 0; i<halfLen; ++i) {
intptr_t j = i + gap;
if ( rowMinima[j] < rowMinima[i] ) {
std::swap(rowMinima[i], rowMinima[j]);
}
}
}
}
/**
* @brief Rig the order in which rows are scanned based on which rows
* (might) have the lowest row minima based on what we saw last
* time.
* @param qBest
* @note The original RapidNJ puts the second-best row from last time
* first. And, apart from that, goes in (cyclical) row order.
* But rows in the D, S, and I matrices are (all) shuffled
* in memory, so why not do all the rows in ascending order
* of their best Q-values from the last iteration?
* Or, better yet... From this iteration?!
* The idea is to (hopefully) find better bounds sooner.
*/
void decideOnRowScanningOrder(T& qBest) const {
deriveBoundFromFirstColumn(qBest);
partiallySortRowMinima();
#ifdef _OPENMP
#pragma omp parallel for if((threadCount<<7)<row_count) num_threads(threadCount)
#endif
for (intptr_t i = 0; i < row_count; ++i) {
rowOrderChosen[i]=0; //Not chosen yet
}
intptr_t rSize = rowMinima.size();
intptr_t w = 0;
for (intptr_t r=0; r < rSize
&& rowMinima[r].row < row_count
&& rowMinima[r].value < infiniteDistance; ++r) {
intptr_t rowA = rowMinima[r].row;
intptr_t rowB = rowMinima[r].column;
size_t clusterA = (rowA < row_count) ? rowToCluster[rowA] : 0;
size_t clusterB = (rowB < row_count) ? rowToCluster[rowB] : 0;
size_t row = (clusterA < clusterB) ? rowA : rowB;
if (row < (size_t)row_count) {
rowScanOrder[w] = row;
w += rowOrderChosen[row] ? 0 : 1;
rowOrderChosen[row] = 1; //Chosen
}
}
//The weird-looking middle term in the for loop is as
//intended: when w reaches n all of the rows (0..n-1)
//must be in rowScanOrder, so there's no need to continue
//until row==n.
for (intptr_t row=0; w < row_count ; ++row) {
rowScanOrder[w] = row;
w += ( rowOrderChosen[row] ? 0 : 1 );
}
}
/**
* @brief Get the Row Minima (that is, the minimum adjusted
* inter-cluster distance, for each row, r), by looking up
* (for each row, r), the associated cluster, y, and
* in the S row for y, finding the adjusted distance
* for each other in-play cluster, x, that might be less
* than our current upper bound on the global minimum
* adjusted distance.
* @note Rather than multiplying distances by (n-2) repeatedly,
* it is cheaper to work with scaled cluster totals,
* multiplied by (1.0/(T)(n-2)).
* Better n multiplications than 0.5*n*(n-1).
*/
virtual void getRowMinima() const {
size_t c = clusters.size();
T nless2 = (T)( row_count - 2 );
T tMultiplier = ( row_count <= 2 ) ? (T)0.0 : ((T)1.0 / nless2);
T maxTot = -infiniteDistance; //maximum row total divided by (n-2)
for (size_t i=0; i<c; ++i) {
scaledClusterTotals[i] = clusterTotals[i] * tMultiplier;
scaledMaxEarlierClusterTotal[i] = maxTot;
if ( clusterToRow[i] != notMappedToRow ) {
if (maxTot < scaledClusterTotals[i] ) {
maxTot=scaledClusterTotals[i];
}
}
}
T qBest = infiniteDistance;
//upper bound on minimum Q[row,col]
// = D[row,col] - R[row]*tMultipler - R[col]*tMultiplier
//
decideOnRowScanningOrder(qBest);
rowMinima.resize(row_count);
#ifdef _OPENMP
#pragma omp parallel num_threads(threadCount)
#endif
{
for (intptr_t r=getThreadNumber(); r<row_count; r+=threadCount) {
size_t row = rowScanOrder[r];
size_t cluster_num = rowToCluster[row];
T maxEarlierTotal = scaledMaxEarlierClusterTotal[cluster_num];
rowMinima[r] = getRowMinimum(row, maxEarlierTotal, qBest);
if (rowMinima[r].value < qBest) {
qBest = rowMinima[r].value;
}
}
}
}
/**
* @brief Get the cluster merge description (Position<T>), with
* the minimum adjusted distance between the two merged clusters,
* that will merge a cluster, x, with a cluster number less
* than that of the cluster, y, the cluster correpsonding to
* row (row) in the current distance matrix, with y.
* @param row - row number in the current distance matrix
* @param maxTot - the maximum (scaled) cluster distance total,
* for any cluster, with a cluster number less
* than y (used to get a slightly tighter bound,
* on how bad a raw distance, read from S can be,
* if it is still to be possible for the adjusted
* distance, to the same cluster the raw distance
* is to, to be equal to or better than qBest).
* @param qBest - a copy of an upper bound on the global minimum
* (possibly found earlier).
* @return Position<T>
*/
virtual Position<T> getRowMinimum(intptr_t row, T maxTot, T qBest) const {
T nless2 = (T)( row_count - 2 );
T tMultiplier = ( row_count <= 2 ) ? (T)0.0 : ( (T)1.0 / nless2 );
auto tot = scaledClusterTotals.data();
T rowTotal = rowTotals[row] * tMultiplier; //scaled by (1/(n-2)).
T rowBound = qBest + maxTot + rowTotal;
//Upper bound for distance, in this row, that
//could (after row totals subtracted) provide a
//better min(Q).
Position<T> pos(row, 0, infiniteDistance, 0);
const T* rowData = entriesSorted.rows[row];
const int* toCluster = entryToCluster.rows[row];
for (size_t i=0; ; ++i) {
T Drc = rowData[i];
if (rowBound<Drc && 0<i) {
break;
}
size_t cluster_num = toCluster[i];
//The cluster associated with this distance
//The c in Qrc and Drc.
T Qrc = Drc - tot[cluster_num] - rowTotal;
if (Qrc < pos.value) {
intptr_t otherRow = clusterToRow[cluster_num];
if (otherRow != notMappedToRow) {
pos.column = (otherRow < row ) ? otherRow : row;
pos.row = (otherRow < row ) ? row : otherRow;
pos.value = Qrc;
if (Qrc < qBest ) {
qBest = Qrc;
rowBound = qBest + maxTot + rowTotal;
}
}
}
}
return pos;
}
};
typedef BoundingMatrix<NJFloat, NJMatrix<NJFloat>> RapidNJ;
typedef BoundingMatrix<NJFloat, BIONJMatrix<NJFloat>> RapidBIONJ;
#if USE_VECTORCLASS_LIBRARY
/**
* @brief Munging class, that adds vectorization to a BoundingMatrix<T> class
* (or to another class that is duck-type equivalent to
* BoundingMatrix<T>)
* @tparam T - the distance type
* @tparam V - the vector type (for a continguous blocks of T, that can
* be Single Instruction Multiple Data (SMD) instructions)
* @tparam VB - the corresponding boolean vector type
* @tparam M - the matrix type
* @tparam SUPER - the immediate super class
* @note binary search - see indexOfFirstGreater() - is used, to figure
* out how many distances will need to be read from the row of S
* (the hope is that log(w) unpredictable distance comparisons will
* be cheaper than the w predictable comparisons, that would
* otherwise be needed)
* (in practice this hope seems to be realized. -James B)
*/
template <class T=NJFloat, class V=FloatVector,
class VB=FloatBoolVector,
class M=BIONJMatrix<T>, class SUPER=BoundingMatrix<T,M> >
class VectorizedBoundingMatrix: public SUPER
{
protected:
const intptr_t block_size;
std::vector<T> vector_storage;
std::vector<T*> per_thread_vector_storage;
public:
typedef SUPER super;
using super::threadCount;
using super::getThreadNumber;
using super::getThreadCount;
using super::row_count;
using super::rowTotals;
using super::rowToCluster;
using super::scaledClusterTotals;
using super::clusterToRow;
using super::clusterPartners;
using super::entriesSorted;
using super::entryToCluster;
VectorizedBoundingMatrix(): super(), block_size(VB().size()),
vector_storage (block_size*4) {
#ifdef _OPENMP
vector_storage.resize(threadCount*block_size*4);
#endif
for (int i=0; i<threadCount; ++i) {
T* ptr = vector_storage.data() + block_size * i * 4;
per_thread_vector_storage.push_back( ptr ) ;
}
}
/**
* @brief Binary search
* @param data pointer to (sorted) block of (distance) T
* @param hi number of distances in the block
* @param bound the bound to search for
* @return size_t - the index (in data) of the first element in the
* block that is greater than bound. Or, if there isn't one, hi.
* @note this is a binary search after the style of Bottenbruch
* (see his 1962 paper).
*/
size_t indexOfFirstGreater(const T* data, size_t hi, T bound) const {
size_t lo = 0;
while (lo<hi) {
size_t guess = lo + (hi-lo) / 2;
if (data[guess]<=bound) {
lo = guess + 1;
} else {
hi = guess;
}
}
return lo;
}
/**
* @brief Get the Postion<T> description of the best cluster merge,
* for which the higher-numbered cluster, y, is the one that
* corresponds to row (row) of the current distance matrix,
* by reading row (row) of both the S and I matrices.
* @param row - the row number
* @param maxTot - the (deep breath) maximum, of the scaled cluster
* distance totals, for all of the clusters, that have
* a cluster number, less than y.
* @param qBest - a *copy* of the current upper bound on the global
* minimum adjusted distance, between clusters.
* @return Position<T> - describes the best cluster merge
* (if there is one!)
*/
virtual Position<T> getRowMinimum(intptr_t row, T maxTot, T qBest) const override {
T nless2 = (T)( row_count - 2 );
T tMultiplier = ( row_count <= 2 ) ? (T)0.0 : ( (T)1.0 / nless2 );
auto tot = scaledClusterTotals.data();
T rowTotal = rowTotals[row] * tMultiplier; //scaled by (1/(n-2)).
T rowBound = qBest + maxTot + rowTotal;
//Upper bound for distance, in this row, that
//could (after row totals subtracted) provide a
//better min(Q).
Position<T> pos(row, 0, infiniteDistance, 0);
const T* rowData = entriesSorted.rows[row];
const int* toCluster = entryToCluster.rows[row];
const size_t cluster = rowToCluster[row];
size_t partners = indexOfFirstGreater
(rowData, clusterPartners[cluster], rowBound);
size_t v_partners = (partners/block_size)*block_size;
auto thread_num = getThreadNumber();
T* blockRawDist = per_thread_vector_storage[thread_num];
T* blockCluster = blockRawDist + block_size;
T* blockIndex = blockCluster + block_size;
T* blockHCDist = blockIndex + block_size;
V best_hc_vector ((T)(infiniteDistance));
V best_ix_vector ((T)(-1));
V raw(0);
V c_tot(0);
V ix(0);
for (size_t i=0; i<v_partners; i+=block_size) {
for (int j=0; j<block_size; ++j) {
int k = toCluster[i+j];
blockCluster[j] = tot[k];
blockIndex[j] = (T)k;
}
raw.load(rowData+i);
c_tot.load(blockCluster);
ix.load(blockIndex);
V hc = raw - c_tot; //subtract cluster totals to get half-cooked distances?
VB less = hc < best_hc_vector; //which are improvements?
best_hc_vector = select(less, hc, best_hc_vector);
best_ix_vector = select(less, ix, best_ix_vector);
}
if (0<v_partners) {
best_hc_vector.store(blockHCDist);
best_ix_vector.store(blockIndex);
for (int j=0; j<block_size; ++j) {
T Qrc = blockHCDist[j] - rowTotal;
if (Qrc < pos.value) {
size_t cluster_num = (size_t)blockIndex[j];
intptr_t otherRow = clusterToRow[cluster_num];
if (otherRow != notMappedToRow) {
pos.column = (otherRow < row ) ? otherRow : row;
pos.row = (otherRow < row ) ? row : otherRow;
pos.value = Qrc;
if (Qrc < qBest ) {
qBest = Qrc;
rowBound = qBest + maxTot + rowTotal;
}
}
}
}
}
adjustRowMinimumFromLeftovers(row, rowData, rowBound, rowTotal, maxTot, qBest,
toCluster, tot, v_partners, partners, pos);
return pos;
}
/**
* @brief Consider the last partners-v_partners items in the S and I
* rows, "in" getRowMinimum() (where v_partners is the number of
* items that were already handled by the vectorized code),
* using NON-vectorized code.
* @note this function only exists, seperately from getRowMinimum(),
* to trick Lizard into *not* complaining about the Cyclomatic
* Complexity of getRowMinimum(). This function only exists,
* seperately from getRowMinimum(), to trick Lizard into *not*
* complaining about the Cyclomatic Complexity of getRowMinimum().
* @note All the parameters are merely copies, of parameters to,
* or local variables declared in, getRowMinimum().
*/
//note:
inline void adjustRowMinimumFromLeftovers
( intptr_t row, const T* rowData, T rowBound, T rowTotal, T maxTot, T qBest,
const int* toCluster, T* tot, size_t v_partners, size_t partners,
Position<T>& pos ) const {
for (size_t i=v_partners; i<partners; ++i) {
T Drc = rowData[i];
if (rowBound<Drc && 0<i) {
break;
}
size_t cluster_num = toCluster[i];
//The cluster associated with this distance
//The c in Qrc and Drc.
T Qrc = Drc - tot[cluster_num] - rowTotal;
if (Qrc < pos.value) {
intptr_t otherRow = clusterToRow[cluster_num];
if (otherRow != notMappedToRow) {
pos.column = (otherRow < row ) ? otherRow : row;
pos.row = (otherRow < row ) ? row : otherRow;
pos.value = Qrc;
if (Qrc < qBest ) {
qBest = Qrc;
rowBound = qBest + maxTot + rowTotal;
}
}
}
}
}
};
typedef VectorizedBoundingMatrix
<NJFloat, FloatVector, FloatBoolVector, NJMatrix<NJFloat> >
Vectorized_RapidNJ;
typedef VectorizedBoundingMatrix
<NJFloat, FloatVector, FloatBoolVector, BIONJMatrix<NJFloat> >
Vectorized_RapidBIONJ;
#endif //USE_VECTORCLASS_LIBRARY
}
#endif /* rapidnj_h */