Skip to content

Commit

Permalink
Fannkuch redux d (#341)
Browse files Browse the repository at this point in the history
* Init version

Used intrinsics based on CPP version

* Two intermediate results for approach:2

Based on the Zig (2.zig) and Rust (2-i.rs) implementations. I was trying to use only the standard libraries (core.simd and ldc.simd), but the template-based shuffle implementation is not very convenient to use. Need to use some compile-time (CT) magic.

* Added 1.d based on Crystal

Simple static array currently faster than intel-intrinsics implementations

* The init version of 2-im for D

It compiles, but the results are not correct

* Tests now passed

Fixed the bug in count

* Added static immutable

For CTFE functions and better performance instead of enum

* Added parallel version

Based on 1-im.cpp - distinguish for two versions: with and without parallel usage
Fix type in 2-im.d

* Update bench_d.yaml

Remove 2-i.d usage. Added 1-im.d to bench_d.yaml

* Fixed shared variables

* Update bench_d.yaml

fixed conflict

* Update bench_d.yaml

switch on 2-i version
  • Loading branch information
cyrusmsk authored Jan 16, 2023
1 parent aa03cff commit d454527
Show file tree
Hide file tree
Showing 7 changed files with 718 additions and 2 deletions.
174 changes: 174 additions & 0 deletions bench/algorithm/fannkuch-redux/1-i.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
// Based on 1-im.cpp implementation
//@safe:

import std.stdio : writeln;
import std.stdint : int_fast8_t, int_fast64_t;
import std.algorithm.mutation : bringToFront;
import std.conv : to;
import inteli.emmintrin, inteli.tmmintrin, inteli.smmintrin;

alias smallInt = int_fast8_t;
alias bigInt = int_fast64_t;

immutable static smallInt maxN = 16;
immutable static int maxBlocks = 24;

/// Lookup tables of byte-shuffle control vectors, indexed by a lane
/// position `k` in 1 .. 15. They are consumed by `_mm_shuffle_epi8`
/// elsewhere in this file.
class Masks
{
    /// masksReverse[k] selects lanes k, k-1, ..., 0 followed by the untouched
    /// lanes k+1 .. 15, i.e. it reverses the first k+1 bytes.
    __m128i[16] masksReverse;
    /// masksShift[k] selects lanes 1, 2, ..., k, 0 followed by the untouched
    /// lanes k+1 .. 15, i.e. it rotates the first k+1 bytes left by one.
    __m128i[16] masksShift;

    /// Fills both tables. Entry 0 of each table is the default vector value.
    this()
    {
        masksReverse[0] = __m128i.init;
        masksShift[0] = __m128i.init;

        foreach (k; 1 .. 16)
        {
            byte[16] reversed;
            byte[16] rotated;

            foreach (lane; 0 .. 16)
            {
                // Lanes past k keep their own index in both tables.
                reversed[lane] = cast(byte)(lane <= k ? k - lane : lane);

                if (lane < k)
                    rotated[lane] = cast(byte)(lane + 1);
                else if (lane == k)
                    rotated[lane] = 0;
                else
                    rotated[lane] = cast(byte) lane;
            }

            // Unaligned load: the scratch arrays carry no alignment guarantee.
            masksReverse[k] = _mm_loadu_si128(cast(const(__m128i)*) reversed.ptr);
            masksShift[k] = _mm_loadu_si128(cast(const(__m128i)*) rotated.ptr);
        }
    }
}



// TODO: look for a better implementation -- maybe compute this at compile time, if that is what the Zig versions do
/// Builds a table where entry k holds k! for every k in 0 .. n.
/// Entries above n keep their default value of 0.
bigInt[maxN] computeFactorials(smallInt n)
{
    bigInt[maxN] table;
    table[0] = 1;
    // Each entry is the previous one multiplied by its own index.
    foreach (k; 1 .. n + 1)
        table[k] = table[k - 1] * k;
    return table;
}

/**
 * Splits the n! permutations into equally sized blocks.
 *
 * Params:
 *   n          = permutation size; used as an index into `factorials`
 *   factorials = table where entry k holds k!
 *
 * Returns: `[blocks, blockSize]`. `blocks` is `maxBlocks`, or 1 when n! is
 * smaller than `maxBlocks`; `blockSize` is n! / blocks.
 *
 * Note: the result is `int`-typed, so `blockSize` is only representable
 * while n! / blocks fits in 32 bits (14! / 24 already does not).
 */
int[2] getBlocksAndSize(smallInt n, ref bigInt[maxN] factorials)
{
    int blocks = maxBlocks;
    if (blocks > factorials[n])
        blocks = 1;
    // Divide in 64-bit first and narrow afterwards. `cast(int) a / b` binds
    // the cast to `a` alone, which truncated n! before the division and
    // produced a wrong block size as soon as n! exceeded int.max (n = 13).
    int blockSize = cast(int)(factorials[n] / blocks);
    return [blocks, blockSize];
}

/// Decomposes `start` into mixed-radix digits: walking positions n-1 down
/// to 0, each digit is the quotient by factorials[pos] and the remainder is
/// carried to the next lower position.
bigInt[maxN] createCount(smallInt n, bigInt start, ref bigInt[maxN] factorials)
{
    bigInt[maxN] digits;
    bigInt remaining = start;
    foreach_reverse (pos; 0 .. n)
    {
        digits[pos] = remaining / factorials[pos];
        remaining %= factorials[pos];
    }
    return digits;
}

/// Builds the permutation vector that corresponds to the digits in `count`:
/// starting from the identity 0 .. 15, for each position from n-1 down to 0
/// the prefix [0 .. pos] is rotated left by count[pos] elements.
__m128i createCurrent(smallInt n, ref bigInt[maxN] count)
{
    // 16-byte alignment is required by the aligned load below.
    align(16) smallInt[maxN] perm = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    ];

    foreach_reverse (pos; 0 .. n)
    {
        immutable shift = count[pos];
        // bringToFront swaps the two adjacent slices, i.e. rotates the prefix.
        bringToFront(perm[0 .. shift], perm[shift .. pos + 1]);
    }

    return _mm_load_si128(cast(__m128i*)(perm.ptr));
}

/**
 * Advances to the next permutation.
 *
 * For i = 1, 2, ... the first i+1 bytes of `current` are rotated left by one
 * (via masks.masksShift[i]) and count[i] is incremented. The walk stops at
 * the first position whose counter stays within 0 .. i; counters that
 * overflow are reset to 0 and the carry moves on to the next position.
 *
 * Params:
 *   count   = per-position counters, updated in place
 *   current = permutation vector to advance
 *   masks   = shuffle control tables
 *
 * Returns: the advanced permutation vector.
 */
// The pragma is attached to this declaration only. The `pragma(...):` colon
// form applies to every declaration that follows it, including `main`.
pragma(inline, true)
__m128i incrementPermutation(ref bigInt[maxN] count, __m128i current, ref Masks masks)
{
    // No explicit upper bound: the loop ends when a counter does not overflow.
    for (smallInt i = 1;; ++i)
    {
        current = _mm_shuffle_epi8(current, masks.masksShift[i]);
        if (++count[i] <= i)
            break;
        count[i] = 0;
    }
    return current;
}

/**
 * Reverses the first idx+1 bytes of `x`, leaving the remaining bytes in place.
 *
 * Params:
 *   x     = vector to shuffle
 *   idx   = index of the last byte taking part in the reversal
 *   masks = shuffle control tables
 *
 * Returns: the shuffled vector.
 */
pragma(inline, true)
__m128i reverse(__m128i x, smallInt idx, ref Masks masks)
{
    return _mm_shuffle_epi8(x, masks.masksReverse[idx]);
}

/**
 * Sequential fannkuch-redux driver.
 *
 * Reads the permutation size n from the first command-line argument, walks
 * all n! permutations block by block, and prints the checksum followed by
 * the maximum number of flips.
 *
 * Throws: an Exception when the argument is missing or n lies outside
 * 0 .. maxN - 1; whatever `to!smallInt` throws for non-numeric input.
 */
void main(string[] args)
{
    import std.exception : enforce;

    enforce(args.length > 1, "missing argument: expected the permutation size n");
    immutable smallInt n = args[1].to!smallInt;
    // All tables are fixed-size arrays indexed by values up to n.
    enforce(n >= 0 && n < maxN,
        "n must be in the range 0 .. " ~ (maxN - 1).to!string);

    auto factorials = computeFactorials(n);
    // Derive the block size in 64-bit arithmetic from the block count: the
    // helper's int-typed block size cannot represent n! / 24 once n >= 14.
    immutable bigInt blocks = getBlocksAndSize(n, factorials)[0];
    immutable bigInt blockSize = factorials[n] / blocks;

    // The mask tables never change, so build them once, not once per block.
    Masks masks = new Masks();

    smallInt maxFlips = 0;
    bigInt checksum = 0;

    for (bigInt blockStart = 0; blockStart < factorials[n]; blockStart += blockSize)
    {
        bigInt[maxN] count = createCount(n, blockStart, factorials);

        __m128i current = createCurrent(n, count);
        // `current` is destroyed while counting flips; keep the untouched
        // permutation so the next one can be derived from it.
        __m128i currentStart = current;

        smallInt first = cast(smallInt) _mm_extract_epi8(current, 0);

        bigInt crtIdx = blockStart;
        bigInt blockEnd = blockStart + blockSize;

        while (crtIdx < blockEnd)
        {
            if (first > 0)
            {
                smallInt flips = 0;
                while (first != 0)
                {
                    // Byte that moves to the front once the prefix is reversed.
                    auto next = (*cast(char[16]*)(current.ptr))[first];
                    current = reverse(current, first, masks);
                    // Value cast: the former `cast(smallInt*)` produced a
                    // pointer, which cannot be assigned to a smallInt.
                    first = cast(smallInt) next;
                    ++flips;
                }

                // Flip counts are added for even indices, subtracted for odd.
                checksum += (crtIdx % 2) == 0 ? flips : -flips;

                if (flips > maxFlips)
                    maxFlips = flips;
            }

            current = incrementPermutation(count, currentStart, masks);
            currentStart = current;

            first = cast(smallInt) _mm_extract_epi8(current, 0);
            ++crtIdx;
        }
    }
    writeln(cast(int) checksum, "\nPfannkuchen(", n, ") = ", cast(int) maxFlips);
}
177 changes: 177 additions & 0 deletions bench/algorithm/fannkuch-redux/1-im.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
// Based on 1-im.cpp implementation
//@safe:

import std;
import inteli.emmintrin, inteli.tmmintrin, inteli.smmintrin;

alias smallInt = int_fast8_t;
alias bigInt = int_fast64_t;

immutable static smallInt maxN = 16;
immutable static int maxBlocks = 24;

/// Lookup tables of byte-shuffle control vectors, indexed by a lane
/// position `k` in 1 .. 15. They are consumed by `_mm_shuffle_epi8`
/// elsewhere in this file.
class Masks
{
    /// masksReverse[k] selects lanes k, k-1, ..., 0 followed by the untouched
    /// lanes k+1 .. 15, i.e. it reverses the first k+1 bytes.
    __m128i[16] masksReverse;
    /// masksShift[k] selects lanes 1, 2, ..., k, 0 followed by the untouched
    /// lanes k+1 .. 15, i.e. it rotates the first k+1 bytes left by one.
    __m128i[16] masksShift;

    /// Fills both tables. Entry 0 of each table is the default vector value.
    this()
    {
        masksReverse[0] = __m128i.init;
        masksShift[0] = __m128i.init;

        foreach (k; 1 .. 16)
        {
            byte[16] reversed;
            byte[16] rotated;

            foreach (lane; 0 .. 16)
            {
                // Lanes past k keep their own index in both tables.
                reversed[lane] = cast(byte)(lane <= k ? k - lane : lane);

                if (lane < k)
                    rotated[lane] = cast(byte)(lane + 1);
                else if (lane == k)
                    rotated[lane] = 0;
                else
                    rotated[lane] = cast(byte) lane;
            }

            // Unaligned load: the scratch arrays carry no alignment guarantee.
            masksReverse[k] = _mm_loadu_si128(cast(const(__m128i)*) reversed.ptr);
            masksShift[k] = _mm_loadu_si128(cast(const(__m128i)*) rotated.ptr);
        }
    }
}



// TODO: look for a better implementation -- maybe compute this at compile time, if that is what the Zig versions do
/// Builds a table where entry k holds k! for every k in 0 .. n.
/// Entries above n keep their default value of 0.
bigInt[maxN] computeFactorials(smallInt n)
{
    bigInt[maxN] table;
    table[0] = 1;
    // Each entry is the previous one multiplied by its own index.
    foreach (k; 1 .. n + 1)
        table[k] = table[k - 1] * k;
    return table;
}

/**
 * Splits the n! permutations into equally sized blocks.
 *
 * Params:
 *   n          = permutation size; used as an index into `factorials`
 *   factorials = table where entry k holds k!
 *
 * Returns: `[blocks, blockSize]`. `blocks` is `maxBlocks`, or 1 when n! is
 * smaller than `maxBlocks`; `blockSize` is n! / blocks.
 *
 * Note: the result is `int`-typed, so `blockSize` is only representable
 * while n! / blocks fits in 32 bits (14! / 24 already does not).
 */
int[2] getBlocksAndSize(smallInt n, ref bigInt[maxN] factorials)
{
    int blocks = maxBlocks;
    if (blocks > factorials[n])
        blocks = 1;
    // Divide in 64-bit first and narrow afterwards. `cast(int) a / b` binds
    // the cast to `a` alone, which truncated n! before the division and
    // produced a wrong block size as soon as n! exceeded int.max (n = 13).
    int blockSize = cast(int)(factorials[n] / blocks);
    return [blocks, blockSize];
}

/// Decomposes `start` into mixed-radix digits: walking positions n-1 down
/// to 0, each digit is the quotient by factorials[pos] and the remainder is
/// carried to the next lower position.
bigInt[maxN] createCount(smallInt n, bigInt start, ref bigInt[maxN] factorials)
{
    bigInt[maxN] digits;
    bigInt remaining = start;
    foreach_reverse (pos; 0 .. n)
    {
        digits[pos] = remaining / factorials[pos];
        remaining %= factorials[pos];
    }
    return digits;
}

/// Builds the permutation vector that corresponds to the digits in `count`:
/// starting from the identity 0 .. 15, for each position from n-1 down to 0
/// the prefix [0 .. pos] is rotated left by count[pos] elements.
__m128i createCurrent(smallInt n, ref bigInt[maxN] count)
{
    // 16-byte alignment is required by the aligned load below.
    align(16) smallInt[maxN] perm = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    ];

    foreach_reverse (pos; 0 .. n)
    {
        immutable shift = count[pos];
        // bringToFront swaps the two adjacent slices, i.e. rotates the prefix.
        bringToFront(perm[0 .. shift], perm[shift .. pos + 1]);
    }

    return _mm_load_si128(cast(__m128i*)(perm.ptr));
}

/**
 * Advances to the next permutation.
 *
 * For i = 1, 2, ... the first i+1 bytes of `current` are rotated left by one
 * (via masks.masksShift[i]) and count[i] is incremented. The walk stops at
 * the first position whose counter stays within 0 .. i; counters that
 * overflow are reset to 0 and the carry moves on to the next position.
 *
 * Params:
 *   count   = per-position counters, updated in place
 *   current = permutation vector to advance
 *   masks   = shuffle control tables
 *
 * Returns: the advanced permutation vector.
 */
// The pragma is attached to this declaration only. The `pragma(...):` colon
// form applies to every declaration that follows it, including `main`.
pragma(inline, true)
__m128i incrementPermutation(ref bigInt[maxN] count, __m128i current, ref Masks masks)
{
    // No explicit upper bound: the loop ends when a counter does not overflow.
    for (smallInt i = 1;; ++i)
    {
        current = _mm_shuffle_epi8(current, masks.masksShift[i]);
        if (++count[i] <= i)
            break;
        count[i] = 0;
    }
    return current;
}

/**
 * Reverses the first idx+1 bytes of `x`, leaving the remaining bytes in place.
 *
 * Params:
 *   x     = vector to shuffle
 *   idx   = index of the last byte taking part in the reversal
 *   masks = shuffle control tables
 *
 * Returns: the shuffled vector.
 */
pragma(inline, true)
__m128i reverse(__m128i x, smallInt idx, ref Masks masks)
{
    return _mm_shuffle_epi8(x, masks.masksReverse[idx]);
}

/**
 * Parallel fannkuch-redux driver.
 *
 * Reads the permutation size n from the first command-line argument, splits
 * the n! permutations into blocks that are processed by a parallel foreach,
 * merges the per-block results, and prints the checksum followed by the
 * maximum number of flips.
 *
 * Throws: an Exception when the argument is missing or n lies outside
 * 0 .. maxN - 1; whatever `to!smallInt` throws for non-numeric input.
 */
void main(string[] args)
{
    import std.exception : enforce;

    enforce(args.length > 1, "missing argument: expected the permutation size n");
    immutable smallInt n = args[1].to!smallInt;
    // All tables are fixed-size arrays indexed by values up to n.
    enforce(n >= 0 && n < maxN,
        "n must be in the range 0 .. " ~ (maxN - 1).to!string);

    auto factorials = computeFactorials(n);
    // Derive the block size in 64-bit arithmetic from the block count: the
    // helper's int-typed block size cannot represent n! / 24 once n >= 14.
    immutable bigInt blocks = getBlocksAndSize(n, factorials)[0];
    immutable bigInt blockSize = factorials[n] / blocks;

    // The mask tables are only read after construction, so a single
    // instance is built once and used by every block.
    Masks masks = new Masks();

    shared smallInt maxres = 0;
    shared bigInt checkres = 0;

    foreach(bigInt blockStart; parallel(iota(cast(bigInt) 0, factorials[n], blockSize)))
    {
        // Per-block accumulators; merged into the shared totals at the end.
        smallInt maxFlips = 0;
        bigInt checksum = 0;
        bigInt[maxN] count = createCount(n, blockStart, factorials);

        __m128i current = createCurrent(n, count);
        // `current` is destroyed while counting flips; keep the untouched
        // permutation so the next one can be derived from it.
        __m128i currentStart = current;

        smallInt first = cast(smallInt) _mm_extract_epi8(current, 0);

        bigInt crtIdx = blockStart;
        bigInt blockEnd = blockStart + blockSize;

        while (crtIdx < blockEnd)
        {
            if (first > 0)
            {
                smallInt flips = 0;
                while (first != 0)
                {
                    // Byte that moves to the front once the prefix is reversed.
                    auto next = (*cast(char[16]*)(current.ptr))[first];
                    current = reverse(current, first, masks);
                    // Value cast: the former `cast(smallInt*)` produced a
                    // pointer, which cannot be assigned to a smallInt.
                    first = cast(smallInt) next;
                    ++flips;
                }

                // Flip counts are added for even indices, subtracted for odd.
                checksum += (crtIdx % 2) == 0 ? flips : -flips;

                if (flips > maxFlips)
                    maxFlips = flips;
            }

            current = incrementPermutation(count, currentStart, masks);
            currentStart = current;

            first = cast(smallInt) _mm_extract_epi8(current, 0);
            ++crtIdx;
        }
        // Merge this block's results into the shared totals under a lock.
        synchronized {
            maxres = max(maxFlips,maxres);
            checkres = checkres + checksum;
        }
    }
    writeln(cast(int) checkres, "\nPfannkuchen(", n, ") = ", cast(int) maxres);
}
Loading

0 comments on commit d454527

Please sign in to comment.