diff --git a/chacha8.h b/chacha8.h index f07bf3b..b666666 100644 --- a/chacha8.h +++ b/chacha8.h @@ -2,6 +2,7 @@ #define CHACHA8_H #include #include +#include // ChaCha8 using SIMD. // u8.h: using 8 blocks at a time (AVX2). @@ -9,11 +10,11 @@ // DO NOT USE THIS CODE FOR CRYPTOGRAPHIC PURPOSES. typedef struct prng_state { - __uint32_t state[16]; + uint32_t state[16]; } prng_state; #define ROUNDS 8 -#define U8TO32_LITTLE(p) (((__uint32_t*)(p))[0]) +#define U8TO32_LITTLE(p) (((uint32_t*)(p))[0]) // SIMD primitives @@ -90,9 +91,9 @@ typedef struct prng_state { // buf's size must be a multiple of 512 bytes. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { char *out = (char *)buf; - __uint64_t bytes = size * 2; + uint64_t bytes = size * 2; int i; if (!bytes || bytes < 512) { return; } @@ -100,7 +101,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ __m256i rot16 = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); __m256i rot8 = _mm256_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3,14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3); - __uint32_t in12, in13; + uint32_t in12, in13; __m256i x_0 = _mm256_set1_epi32(s->state[0]); __m256i x_1 = _mm256_set1_epi32(s->state[1]); @@ -179,7 +180,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { __m256i t12, t13; in12 = s->state[12]; in13 = s->state[13]; - __uint64_t in1213 = ((__uint64_t)in12) | (((__uint64_t)in13) << 32); + uint64_t in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32); x_12 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213)); x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213)); diff --git a/intertwine.c b/intertwine.c index 0819f5d..f1a69b5 100644 --- a/intertwine.c +++ b/intertwine.c @@ -17,7 +17,7 @@ int main(int argc, char **argv) { char reachedEOF = 0; while (!reachedEOF) { int readBytes = outputBufsize; - for (char i = 0; i < a.fnum; i++) { + for (int i = 0; i < a.fnum; i++) { ssize_t r = read(a.files[i], ibuf, BUFSIZE); for (int bi = 0; bi < r; bi++) { obuf[a.fnum * bi + i] = ibuf[bi]; @@ -35,7 +35,7 @@ int main(int argc, char **argv) { } // Discharging allocations. - for (char i = 0; i < a.fnum; i++) { close(a.files[i]); } + for (int i = 0; i < a.fnum; i++) { close(a.files[i]); } free(a.files); free(obuf); return 0; } diff --git a/lehmer128.h b/lehmer128.h index 24d718b..e6a5f34 100644 --- a/lehmer128.h +++ b/lehmer128.h @@ -1,8 +1,11 @@ #ifndef LEHMER_H #define LEHMER_H +#include +#include // LEHMER128: https://lemire.me/blog/2019/03/19/the-fastest-conventional-random-number-generator-that-can-pass-big-crush/ +#ifdef __SIZEOF_INT128__ typedef struct prng_state { __uint128_t state; } prng_state; @@ -10,8 +13,8 @@ typedef struct prng_state { #define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n)))) // buf's size must be a multiple of 8 bytes. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { - for (__uint64_t i = 0; i < size; i++) { +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { + for (size_t i = 0; i < size; i++) { buf[i] = (__uint128_t)(s->state *= 0xda942042e4dd58b5) >> 64; } } @@ -24,4 +27,51 @@ prng_state prng_init(SEEDTYPE seed[4]) { if (s.state == 0) { s.state = 1; } return s; } +#else + +typedef struct prng_state { + uint64_t state[2]; +} prng_state; + +#define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n)))) +#if defined(_M_X64) && defined(_MSC_VER) +#include +#pragma intrinsic(_umul128) +static inline void prng_mult128by64(uint64_t lhs[2], uint64_t rhs) { + uint64_t hi; + uint64_t lo = _umul128(lhs[0], rhs, &hi); + lhs[0] = lo; + lhs[1] = hi + (lhs[1] * rhs); +} +#else +static inline void prng_mult128by64(uint64_t lhs[2], uint64_t rhs) { +#define PRNG_MULL(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y)) + uint64_t lo_lo = PRNG_MULL(lhs[0] & 0xffffffff, rhs & 0xffffffff); + uint64_t hi_lo = PRNG_MULL(lhs[0] >> 32, rhs & 0xffffffff); + uint64_t lo_hi = PRNG_MULL(lhs[0] & 0xffffffff, rhs >> 32); + uint64_t hi_hi = PRNG_MULL(lhs[0] >> 32, rhs >> 32); +#undef PRNG_MULL + uint64_t cross = hi_lo + (lo_lo >> 32) + (lo_hi & 0xffffffff); + uint64_t lower = (cross << 32) | (lo_lo & 0xffffffff); + uint64_t upper = hi_hi + (cross >> 32) + (lo_hi >> 32); + lhs[0] = lower; + lhs[1] = upper + (lhs[1] * rhs); +} +#endif +// buf's size must be a multiple of 8 bytes. +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { + for (size_t i = 0; i < size; i++) { + prng_mult128by64(s->state, 0xda942042e4dd58b5); + buf[i] = s->state[1]; + } +} + +prng_state prng_init(SEEDTYPE seed[4]) { + prng_state s; + s.state[1] = seed[0] ^ seed[2]; + s.state[0] = seed[1] ^ seed[3]; + if (s.state[0] == 0 && s.state[1] == 0) { s.state[0] = 1; } + return s; +} +#endif // __SIZEOF_INT128__ #endif diff --git a/prng.c b/prng.c index befa003..02362f0 100644 --- a/prng.c +++ b/prng.c @@ -1,22 +1,22 @@ #include #include +#include #include #include #include #define BUFSIZE (1<<14) -#define SEEDTYPE __uint64_t +#define SEEDTYPE uint64_t #include "./prng.h" -typedef struct args { __int64_t bytes; SEEDTYPE seed[4]; int rval; } args_t; +typedef struct args { uint64_t bytes; SEEDTYPE seed[4]; int rval; } args_t; args_t parseArgs(int argc, char **argv); -void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size); int main(int argc, char **argv) { args_t a = parseArgs(argc, argv); if (a.rval < 0) { return a.rval; } prng_state s = prng_init(a.seed); - __uint64_t buf[BUFSIZE] __attribute__ ((aligned (64))); - __int64_t cycles = 0, start; - for (__int64_t bytes = a.bytes; bytes >= 0; bytes -= sizeof(buf)) { + uint64_t buf[BUFSIZE] __attribute__ ((aligned (64))); + int64_t cycles = 0, start; + for (uint64_t bytes = a.bytes; bytes >= 0; bytes -= sizeof(buf)) { int wbytes = bytes < sizeof(buf)? bytes: sizeof(buf); start = _rdtsc(); prng_gen(&s, buf, BUFSIZE); @@ -28,7 +28,7 @@ int main(int argc, char **argv) { } args_t parseArgs(int argc, char **argv) { - args_t a; + args_t a = {0}; a.rval = 0; a.bytes = 0x7fffffffffffffff; for (int i = 0; i < argc; i++) { @@ -40,9 +40,9 @@ args_t parseArgs(int argc, char **argv) { fprintf(stderr, " --seed: as hexadecimal.\n"); a.rval = -1; } else if (strcmp(argv[i], "-b") == 0 || strcmp(argv[i], "--bytes") == 0) { - a.bytes = atoll(argv[++i]); + a.bytes = strtoull(argv[++i], NULL, 0); } else if (strcmp(argv[i], "-s") == 0 || strcmp(argv[i], "--seed") == 0) { - a.seed[0] = strtol(argv[++i], NULL, 16); + a.seed[0] = strtoull(argv[++i], NULL, 0); } } return a; diff --git a/rc4.h b/rc4.h index f033432..ef467c3 100644 --- a/rc4.h +++ b/rc4.h @@ -1,25 +1,27 @@ #ifndef RC4_H #define RC4_H +#include +#include // RC4 (aka arc4random). // DO NOT USE THIS CODE FOR CRYPTOGRAPHIC PURPOSES. typedef struct prng_state { - __uint8_t shuffle[256]; - __uint8_t i, j; + uint8_t shuffle[256]; + uint8_t i, j; } prng_state; #define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n)))) #define SWAP(i, j) { \ - char si = shuffle[(i)]; \ + uint8_t si = shuffle[(i)]; \ shuffle[(i)] = shuffle[(j)]; \ shuffle[(j)] = si; } // buf's size must be a multiple of 8 bytes; it could get bytes one at a time. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { - __uint8_t *shuffle = s->shuffle, *b = buf; - for (__uint64_t i = 0; i < size; i++) { - for (char j = 0; j < 8; j++) { +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { + uint8_t *shuffle = s->shuffle, *b = buf; + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { s->i++; s->j += shuffle[s->i]; SWAP(s->i, s->j); b[8*i + j] = shuffle[shuffle[s->i] + shuffle[s->j]]; @@ -29,9 +31,9 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { prng_state prng_init(SEEDTYPE seed[4]) { prng_state s; - __uint8_t *shuffle = s.shuffle, *key = (char *)seed; - for (int i = 0; i < 256; i++) { shuffle[i] = i; } - for (int i = 0, j = 0; i < 256; i++, j = (j + shuffle[i] + key[i % 32]) % 256) { + uint8_t *shuffle = s.shuffle, *key = (uint8_t *)seed; + for (size_t i = 0; i < 256; i++) { shuffle[i] = i; } + for (size_t i = 0, j = 0; i < 256; i++, j = (j + shuffle[i] + key[i % 32]) % 256) { SWAP(i, j); } s.i = s.j = 0; diff --git a/romu.h b/romu.h index 7ca6a99..d5ed16c 100644 --- a/romu.h +++ b/romu.h @@ -1,17 +1,18 @@ #ifndef ROMU_H #define ROMU_H - +#include +#include typedef struct prng_state { - __uint64_t state[3]; + uint64_t state[3]; } prng_state; #define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n)))) // buf's size must be a multiple of 8 bytes. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { - for (__uint64_t i = 0; i < size; i++) { +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { + for (size_t i = 0; i < size; i++) { // ROMU: http://www.romu-random.org/romupaper.pdf - __uint64_t xp = s->state[0], yp = s->state[1], zp = s->state[2]; + uint64_t xp = s->state[0], yp = s->state[1], zp = s->state[2]; s->state[0] = 15241094284759029579u * zp; s->state[1] = ROTL(yp - xp, 12); s->state[2] = ROTL(zp - yp, 44); @@ -21,7 +22,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { prng_state prng_init(SEEDTYPE seed[4]) { prng_state s; - for (char i = 0; i < 3; i++) { s.state[i] = seed[i]; } + for (size_t i = 0; i < 3; i++) { s.state[i] = seed[i]; } if (s.state[2] == 0) { s.state[2] = 1; } return s; } diff --git a/shishua-half.h b/shishua-half.h index 8e2d8d9..a9cdb9f 100644 --- a/shishua-half.h +++ b/shishua-half.h @@ -1,5 +1,8 @@ #ifndef SHISHUA_H #define SHISHUA_H +#include +#include +#include #include typedef struct prng_state { __m256i state[2]; @@ -8,7 +11,7 @@ typedef struct prng_state { } prng_state; // buf's size must be a multiple of 32 bytes. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { __m256i s0 = s->state[0], counter = s->counter, s1 = s->state[1], o = s->output, t0, t1, t2, t3, u0, u1, u2, u3; @@ -30,7 +33,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { // for a tiny amount of variation stirring. // I used the smallest odd numbers to avoid having a magic number. __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); - for (__uint64_t i = 0; i < size; i += 4) { + for (size_t i = 0; i < size; i += 4) { _mm256_storeu_si256((__m256i*)&buf[i], o); // I apply the counter to s1, @@ -61,22 +64,22 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { // Nothing up my sleeve: those are the hex digits of Φ, // the least approximable irrational number. // $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static __uint64_t phi[8] = { +static uint64_t phi[8] = { 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, }; prng_state prng_init(SEEDTYPE seed[4]) { prng_state s; - s.counter = _mm256_set_epi64x(0, 0, 0, 0); + memset(&s, 0, sizeof(prng_state)); # define STEPS 5 # define ROUNDS 4 - __uint64_t buf[4 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. + uint64_t buf[4 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. // Diffuse first two seed elements in s0, then the last two. Same for s1. // We must keep half of the state unchanged so users cannot set a bad state. s.state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); s.state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); - for (char i = 0; i < ROUNDS; i++) { + for (size_t i = 0; i < ROUNDS; i++) { prng_gen(&s, buf, 4 * STEPS); s.state[0] = s.state[1]; s.state[1] = s.output; diff --git a/shishua.h b/shishua.h index de9aa5e..77a98a8 100644 --- a/shishua.h +++ b/shishua.h @@ -1,5 +1,8 @@ #ifndef SHISHUA_H #define SHISHUA_H +#include +#include +#include #include typedef struct prng_state { __m256i state[4]; @@ -8,7 +11,7 @@ typedef struct prng_state { } prng_state; // buf's size must be a multiple of 128 bytes. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; @@ -30,7 +33,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { // for a tiny amount of variation stirring. // I used the smallest odd numbers to avoid having a magic number. __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); - for (__uint64_t i = 0; i < size; i += 16) { + for (size_t i = 0; i < size; i += 16) { _mm256_storeu_si256((__m256i*)&buf[i+ 0], o0); _mm256_storeu_si256((__m256i*)&buf[i+ 4], o1); _mm256_storeu_si256((__m256i*)&buf[i+ 8], o2); @@ -72,7 +75,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { // Nothing up my sleeve: those are the hex digits of Φ, // the least approximable irrational number. // $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static __uint64_t phi[16] = { +static uint64_t phi[16] = { 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, @@ -81,18 +84,18 @@ static __uint64_t phi[16] = { prng_state prng_init(SEEDTYPE seed[4]) { prng_state s; - s.counter = _mm256_set_epi64x(0, 0, 0, 0); -# define STEPS 13 -# define ROUNDS 1 - __uint64_t buf[16 * ROUNDS]; // 16 64-bit numbers per 128-byte output. + memset(&s, 0, sizeof(prng_state)); +# define STEPS 1 +# define ROUNDS 13 + uint64_t buf[16 * STEPS]; // 16 64-bit numbers per 128-byte output. // Diffuse first two seed elements in s0, then the last two. Same for s1. // We must keep half of the state unchanged so users cannot set a bad state. s.state[0] = _mm256_set_epi64x(phi[ 3], phi[ 2] ^ seed[1], phi[ 1], phi[ 0] ^ seed[0]); s.state[1] = _mm256_set_epi64x(phi[ 7], phi[ 6] ^ seed[3], phi[ 5], phi[ 4] ^ seed[2]); s.state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[ 9], phi[ 8] ^ seed[2]); s.state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); - for (char i = 0; i < STEPS; i++) { - prng_gen(&s, buf, 16 * ROUNDS); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(&s, buf, 16 * STEPS); s.state[0] = s.output[3]; s.state[1] = s.output[2]; s.state[2] = s.output[1]; s.state[3] = s.output[0]; } diff --git a/wyrand.h b/wyrand.h index 977b2a8..c13d6a3 100644 --- a/wyrand.h +++ b/wyrand.h @@ -1,21 +1,49 @@ #ifndef WYRAND_H #define WYRAND_H +#include +#include + // wyrand: https://github.com/wangyi-fudan/wyhash/blob/master/wyhash.h // A (hash-table) hash derivative. typedef struct prng_state { - __uint64_t counter; + uint64_t counter; } prng_state; #define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n)))) - +#ifdef __SIZEOF_INT128__ +static inline uint64_t prng_mult128_xorfold(uint64_t lhs, uint64_t rhs) { + __uint128_t product = (__uint128_t)lhs * rhs; + return (uint64_t)(product ^ (product >> 64)); +} +#elif defined(_M_X64) && defined(_MSC_VER) +#include +#pragma intrinsic(_umul128) +static inline uint64_t prng_mult128_xorfold(uint64_t lhs, uint64_t rhs) { + uint64_t hi; + uint64_t lo = _umul128(lhs, rhs, &hi); + return hi ^ lo; +} +#else +static inline uint64_t prng_mult128_xorfold(uint64_t lhs, uint64_t rhs) { +#define PRNG_MULL(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y)) + uint64_t lo_lo = PRNG_MULL(lhs & 0xffffffff, rhs & 0xffffffff); + uint64_t hi_lo = PRNG_MULL(lhs >> 32, rhs & 0xffffffff); + uint64_t lo_hi = PRNG_MULL(lhs & 0xffffffff, rhs >> 32); + uint64_t hi_hi = PRNG_MULL(lhs >> 32, rhs >> 32); +#undef PRNG_MULL + uint64_t cross = hi_lo + (lo_lo >> 32) + (lo_hi & 0xffffffff); + uint64_t lower = (cross << 32) | (lo_lo & 0xffffffff); + uint64_t upper = hi_hi + (cross >> 32) + (lo_hi >> 32); + return lower ^ upper; +} +#endif // buf's size must be a multiple of 8 bytes. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { - for (__uint64_t i = 0; i < size; i++) { +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { + for (size_t i = 0; i < size; i++) { s->counter += 0xa0761d6478bd642full; - __uint128_t r = (__uint128_t)(s->counter ^ 0xe7037ed1a0b428dbull) * s->counter; - buf[i] = r ^ r >> 64; + buf[i] = prng_mult128_xorfold(s->counter ^ 0xe7037ed1a0b428dbull, s->counter); } } diff --git a/xoshiro256+.h b/xoshiro256+.h index 5bf1dbe..268d1b5 100644 --- a/xoshiro256+.h +++ b/xoshiro256+.h @@ -1,6 +1,9 @@ #ifndef XOSHIRO_H #define XOSHIRO_H +#include +#include + // Eight alternating Xoshiro256+ states benefitting from SIMD. // Code from: http://prng.di.unimi.it/xoshiro256plus.c // Speed comparison: http://prng.di.unimi.it/#speed @@ -13,13 +16,13 @@ #define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n)))) typedef struct prng_state { - __uint64_t state[4]; + uint64_t state[4]; } prng_state; // buf's size must be a multiple of 8 bytes. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { - __uint64_t t; - for (__uint64_t i = 0; i < size; i++) { +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { + uint64_t t; + for (size_t i = 0; i < size; i++) { buf[i] = s->state[0] + s->state[3]; t = s->state[1] << 17; @@ -47,7 +50,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { // fair. Ignoring bad splitmix64 gammas would hide severe seeding faults. prng_state prng_init(SEEDTYPE seed[4]) { prng_state s; - for (char j = 0; j < 4; j++) { s.state[j] = seed[j]; } + for (size_t j = 0; j < 4; j++) { s.state[j] = seed[j]; } if (s.state[0] == 0) { s.state[0] = 1; } return s; } diff --git a/xoshiro256+x8.h b/xoshiro256+x8.h index f512042..45e674b 100644 --- a/xoshiro256+x8.h +++ b/xoshiro256+x8.h @@ -1,6 +1,7 @@ #ifndef XOSHIRO_H #define XOSHIRO_H - +#include +#include // Eight alternating Xoshiro256+ states benefitting from SIMD. // Code from: http://prng.di.unimi.it/xoshiro256+-vect-speed.c // Speed comparison: http://prng.di.unimi.it/#speed @@ -13,32 +14,32 @@ #define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n)))) typedef struct prng_state { - __uint64_t state[4][XOSHIRO256_UNROLL]; + uint64_t state[4][XOSHIRO256_UNROLL]; } prng_state; // buf's size must be a multiple of 8 bytes. -inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) { - __uint64_t t[XOSHIRO256_UNROLL]; - for (__uint64_t i = 0; i < size; i += XOSHIRO256_UNROLL) { - for (char j = 0; j < XOSHIRO256_UNROLL; j++) { buf[i + j] = s->state[0][j] + s->state[3][j]; } +static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) { + uint64_t t[XOSHIRO256_UNROLL]; + for (size_t i = 0; i < size; i += XOSHIRO256_UNROLL) { + for (size_t j = 0; j < XOSHIRO256_UNROLL; j++) { buf[i + j] = s->state[0][j] + s->state[3][j]; } - for (char j = 0; j < XOSHIRO256_UNROLL; j++) { t[j] = s->state[1][j] << 17; } + for (size_t j = 0; j < XOSHIRO256_UNROLL; j++) { t[j] = s->state[1][j] << 17; } - for (char j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[2][j] ^= s->state[0][j]; } - for (char j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[3][j] ^= s->state[1][j]; } - for (char j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[1][j] ^= s->state[2][j]; } - for (char j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[0][j] ^= s->state[3][j]; } + for (size_t j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[2][j] ^= s->state[0][j]; } + for (size_t j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[3][j] ^= s->state[1][j]; } + for (size_t j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[1][j] ^= s->state[2][j]; } + for (size_t j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[0][j] ^= s->state[3][j]; } - for (char j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[2][j] ^= t[j]; } + for (size_t j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[2][j] ^= t[j]; } - for (char j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[3][j] = ROTL(s->state[3][j], 45); } + for (size_t j = 0; j < XOSHIRO256_UNROLL; j++) { s->state[3][j] = ROTL(s->state[3][j], 45); } } } prng_state prng_init(SEEDTYPE seed[4]) { prng_state s; - for (char i = 0; i < XOSHIRO256_UNROLL; i++) { - for (char j = 0; j < 4; j++) { s.state[j][i] = seed[j] ^ (1 << i); } + for (size_t i = 0; i < XOSHIRO256_UNROLL; i++) { + for (size_t j = 0; j < 4; j++) { s.state[j][i] = seed[j] ^ (1 << i); } } return s; }