Skip to content

Commit

Permalink
Various quick fixes
Browse files Browse the repository at this point in the history
It is better if these minor issues are fixed before adding different
shishua targets, as fixing them afterwards will be more difficult.

Portability:
 - Use `stdint.h` and the real type names
   - `__uint64_t` is NOT predefined by the compiler, and just happened
     to be defined in the included headers
 - Add fallback routines that emulate `__uint128_t` for lehmer128 and wyrand
   - Supports 32-bit and non-GCC targets
 - `char` is no longer used as an integer type.
 - Use `size_t` for indexing. Using `uint64_t` is pointless on 32-bit since
   it is impossible for an array to be larger than 4 GB. It just means
   extra `adc` instructions.
   - Similarly, don't use `char` for indexing - it doesn't save any space
     and it slows things down on non-x86 targets. Yes, compilers will
     still emit `and` instructions in a loop that will never overflow.
Bugfixes:
 - Initialized some arrays that were left uninitialized, causing
   unpredictable seeds in prng.c
 - Every `inline` function is now declared `static inline`.
Minor improvements:
 - Swap `STEPS` and `ROUNDS` in shishua.h to match shishua-half.h
  • Loading branch information
easyaspi314 committed Apr 26, 2020
1 parent aa01940 commit 5bd346c
Show file tree
Hide file tree
Showing 11 changed files with 168 additions and 76 deletions.
13 changes: 7 additions & 6 deletions chacha8.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@
#define CHACHA8_H
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

// ChaCha8 using SIMD.
// u8.h: using 8 blocks at a time (AVX2).
// Inspired by https://github.com/floodyberry/supercop/tree/master/crypto_stream/chacha20/dolbeau/amd64-avx2
// DO NOT USE THIS CODE FOR CRYPTOGRAPHIC PURPOSES.

typedef struct prng_state {
__uint32_t state[16];
uint32_t state[16];
} prng_state;

#define ROUNDS 8
#define U8TO32_LITTLE(p) (((__uint32_t*)(p))[0])
#define U8TO32_LITTLE(p) (((uint32_t*)(p))[0])


// SIMD primitives
Expand Down Expand Up @@ -90,17 +91,17 @@ typedef struct prng_state {


// buf's size must be a multiple of 512 bytes.
inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) {
char *out = (char *)buf;
__uint64_t bytes = size * 2;
uint64_t bytes = size * 2;
int i;

if (!bytes || bytes < 512) { return; }

/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m256i rot16 = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
__m256i rot8 = _mm256_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3,14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3);
__uint32_t in12, in13;
uint32_t in12, in13;

__m256i x_0 = _mm256_set1_epi32(s->state[0]);
__m256i x_1 = _mm256_set1_epi32(s->state[1]);
Expand Down Expand Up @@ -179,7 +180,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
__m256i t12, t13;
in12 = s->state[12];
in13 = s->state[13];
__uint64_t in1213 = ((__uint64_t)in12) | (((__uint64_t)in13) << 32);
uint64_t in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
x_12 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));

Expand Down
4 changes: 2 additions & 2 deletions intertwine.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ int main(int argc, char **argv) {
char reachedEOF = 0;
while (!reachedEOF) {
int readBytes = outputBufsize;
for (char i = 0; i < a.fnum; i++) {
for (int i = 0; i < a.fnum; i++) {
ssize_t r = read(a.files[i], ibuf, BUFSIZE);
for (int bi = 0; bi < r; bi++) {
obuf[a.fnum * bi + i] = ibuf[bi];
Expand All @@ -35,7 +35,7 @@ int main(int argc, char **argv) {
}

// Discharging allocations.
for (char i = 0; i < a.fnum; i++) { close(a.files[i]); }
for (int i = 0; i < a.fnum; i++) { close(a.files[i]); }
free(a.files); free(obuf);
return 0;
}
Expand Down
54 changes: 52 additions & 2 deletions lehmer128.h
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
#ifndef LEHMER_H
#define LEHMER_H

#include <stdint.h>
#include <stddef.h>
// LEHMER128: https://lemire.me/blog/2019/03/19/the-fastest-conventional-random-number-generator-that-can-pass-big-crush/

#ifdef __SIZEOF_INT128__
// Generator state for compilers that provide a native 128-bit integer type.
typedef struct prng_state {
__uint128_t state; // the whole 128-bit state in a single integer
} prng_state;

#define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n))))

// buf's size must be a multiple of 8 bytes.
inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
for (__uint64_t i = 0; i < size; i++) {
static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) {
for (size_t i = 0; i < size; i++) {
buf[i] = (__uint128_t)(s->state *= 0xda942042e4dd58b5) >> 64;
}
}
Expand All @@ -24,4 +27,51 @@ prng_state prng_init(SEEDTYPE seed[4]) {
if (s.state == 0) { s.state = 1; }
return s;
}
#else

// Generator state for targets without a native 128-bit integer: the 128-bit
// value is stored as two 64-bit words.
typedef struct prng_state {
uint64_t state[2]; // state[0] = low 64 bits, state[1] = high 64 bits
} prng_state;

#define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n))))
#if defined(_M_X64) && defined(_MSC_VER)
#include <intrin.h>
#pragma intrinsic(_umul128)
// Multiplies the 128-bit value in lhs (lhs[0] = low word, lhs[1] = high word)
// by the 64-bit rhs, in place, keeping only the low 128 bits of the product.
// MSVC x64 variant built on the _umul128 intrinsic.
static inline void prng_mult128by64(uint64_t lhs[2], uint64_t rhs) {
  uint64_t carry_hi;
  // Full 64x64 -> 128 product of the low word; the upper half lands in carry_hi.
  const uint64_t product_lo = _umul128(lhs[0], rhs, &carry_hi);
  // The high word only contributes its low 64 bits; anything above is discarded.
  lhs[1] = lhs[1] * rhs + carry_hi;
  lhs[0] = product_lo;
}
#else
// Multiplies the 128-bit value in lhs (lhs[0] = low word, lhs[1] = high word)
// by the 64-bit rhs, in place, keeping only the low 128 bits of the product.
// Portable variant: the 64x64 -> 128 product of the low word is assembled from
// four 32x32 -> 64 partial products (schoolbook multiplication).
static inline void prng_mult128by64(uint64_t lhs[2], uint64_t rhs) {
  // Split both operands into 32-bit halves.
  const uint32_t a_lo = (uint32_t)lhs[0];
  const uint32_t a_hi = (uint32_t)(lhs[0] >> 32);
  const uint32_t b_lo = (uint32_t)rhs;
  const uint32_t b_hi = (uint32_t)(rhs >> 32);

  // Widening the first operand makes each product a 32x32 -> 64 multiply.
  const uint64_t p_ll = (uint64_t)a_lo * b_lo;
  const uint64_t p_hl = (uint64_t)a_hi * b_lo;
  const uint64_t p_lh = (uint64_t)a_lo * b_hi;
  const uint64_t p_hh = (uint64_t)a_hi * b_hi;

  // Middle column: cannot overflow, since (2^32-1)^2 + 2*(2^32-1) < 2^64.
  const uint64_t middle = p_hl + (p_ll >> 32) + (p_lh & 0xffffffff);
  const uint64_t top = p_hh + (middle >> 32) + (p_lh >> 32);

  // The old high word only contributes its low 64 bits to the result.
  lhs[1] = top + (lhs[1] * rhs);
  lhs[0] = (middle << 32) | (p_ll & 0xffffffff);
}
#endif
// Writes `size` 64-bit outputs into buf. Each output is produced by
// multiplying the 128-bit state by the generator's constant and emitting the
// high 64 bits of the new state.
static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) {
  uint64_t *out = buf;
  size_t remaining = size;
  while (remaining-- > 0) {
    prng_mult128by64(s->state, 0xda942042e4dd58b5);
    *out++ = s->state[1];
  }
}

// Builds the initial state from four seed words: the high word is
// seed[0] ^ seed[2] and the low word is seed[1] ^ seed[3]. An all-zero state
// is replaced by 1, because zero would stay zero under multiplication.
prng_state prng_init(SEEDTYPE seed[4]) {
  prng_state s;
  const uint64_t high = seed[0] ^ seed[2];
  const uint64_t low = seed[1] ^ seed[3];
  s.state[1] = high;
  s.state[0] = low;
  if ((high | low) == 0) {
    s.state[0] = 1;
  }
  return s;
}
#endif // __SIZEOF_INT128__
#endif
18 changes: 9 additions & 9 deletions prng.c
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#define BUFSIZE (1<<14)
#define SEEDTYPE __uint64_t
#define SEEDTYPE uint64_t
#include "./prng.h"
typedef struct args { __int64_t bytes; SEEDTYPE seed[4]; int rval; } args_t;
typedef struct args { uint64_t bytes; SEEDTYPE seed[4]; int rval; } args_t;
args_t parseArgs(int argc, char **argv);
void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size);

int main(int argc, char **argv) {
args_t a = parseArgs(argc, argv);
if (a.rval < 0) { return a.rval; }
prng_state s = prng_init(a.seed);
__uint64_t buf[BUFSIZE] __attribute__ ((aligned (64)));
__int64_t cycles = 0, start;
for (__int64_t bytes = a.bytes; bytes >= 0; bytes -= sizeof(buf)) {
uint64_t buf[BUFSIZE] __attribute__ ((aligned (64)));
int64_t cycles = 0, start;
for (uint64_t bytes = a.bytes; bytes >= 0; bytes -= sizeof(buf)) {
int wbytes = bytes < sizeof(buf)? bytes: sizeof(buf);
start = _rdtsc();
prng_gen(&s, buf, BUFSIZE);
Expand All @@ -28,7 +28,7 @@ int main(int argc, char **argv) {
}

args_t parseArgs(int argc, char **argv) {
args_t a;
args_t a = {0};
a.rval = 0;
a.bytes = 0x7fffffffffffffff;
for (int i = 0; i < argc; i++) {
Expand All @@ -40,9 +40,9 @@ args_t parseArgs(int argc, char **argv) {
fprintf(stderr, " --seed: as hexadecimal.\n");
a.rval = -1;
} else if (strcmp(argv[i], "-b") == 0 || strcmp(argv[i], "--bytes") == 0) {
a.bytes = atoll(argv[++i]);
a.bytes = strtoull(argv[++i], NULL, 0);
} else if (strcmp(argv[i], "-s") == 0 || strcmp(argv[i], "--seed") == 0) {
a.seed[0] = strtol(argv[++i], NULL, 16);
a.seed[0] = strtoull(argv[++i], NULL, 0);
}
}
return a;
Expand Down
22 changes: 12 additions & 10 deletions rc4.h
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
#ifndef RC4_H
#define RC4_H

#include <stdint.h>
#include <stddef.h>
// RC4 (aka arc4random).
// DO NOT USE THIS CODE FOR CRYPTOGRAPHIC PURPOSES.

typedef struct prng_state {
__uint8_t shuffle[256];
__uint8_t i, j;
uint8_t shuffle[256];
uint8_t i, j;
} prng_state;

#define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n))))
#define SWAP(i, j) { \
char si = shuffle[(i)]; \
uint8_t si = shuffle[(i)]; \
shuffle[(i)] = shuffle[(j)]; \
shuffle[(j)] = si; }

// buf's size must be a multiple of 8 bytes; it could get bytes one at a time.
inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
__uint8_t *shuffle = s->shuffle, *b = buf;
for (__uint64_t i = 0; i < size; i++) {
for (char j = 0; j < 8; j++) {
static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) {
uint8_t *shuffle = s->shuffle, *b = buf;
for (size_t i = 0; i < size; i++) {
for (size_t j = 0; j < 8; j++) {
s->i++; s->j += shuffle[s->i];
SWAP(s->i, s->j);
b[8*i + j] = shuffle[shuffle[s->i] + shuffle[s->j]];
Expand All @@ -29,9 +31,9 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {

prng_state prng_init(SEEDTYPE seed[4]) {
prng_state s;
__uint8_t *shuffle = s.shuffle, *key = (char *)seed;
for (int i = 0; i < 256; i++) { shuffle[i] = i; }
for (int i = 0, j = 0; i < 256; i++, j = (j + shuffle[i] + key[i % 32]) % 256) {
uint8_t *shuffle = s.shuffle, *key = (uint8_t *)seed;
for (size_t i = 0; i < 256; i++) { shuffle[i] = i; }
for (size_t i = 0, j = 0; i < 256; i++, j = (j + shuffle[i] + key[i % 32]) % 256) {
SWAP(i, j);
}
s.i = s.j = 0;
Expand Down
13 changes: 7 additions & 6 deletions romu.h
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
#ifndef ROMU_H
#define ROMU_H

#include <stdint.h>
#include <stddef.h>
typedef struct prng_state {
__uint64_t state[3];
uint64_t state[3];
} prng_state;

#define ROTL(a,n) (((a) << (n)) | ((a) >> (64 - (n))))

// buf's size must be a multiple of 8 bytes.
inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
for (__uint64_t i = 0; i < size; i++) {
static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) {
for (size_t i = 0; i < size; i++) {
// ROMU: http://www.romu-random.org/romupaper.pdf
__uint64_t xp = s->state[0], yp = s->state[1], zp = s->state[2];
uint64_t xp = s->state[0], yp = s->state[1], zp = s->state[2];
s->state[0] = 15241094284759029579u * zp;
s->state[1] = ROTL(yp - xp, 12);
s->state[2] = ROTL(zp - yp, 44);
Expand All @@ -21,7 +22,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {

prng_state prng_init(SEEDTYPE seed[4]) {
prng_state s;
for (char i = 0; i < 3; i++) { s.state[i] = seed[i]; }
for (size_t i = 0; i < 3; i++) { s.state[i] = seed[i]; }
if (s.state[2] == 0) { s.state[2] = 1; }
return s;
}
Expand Down
15 changes: 9 additions & 6 deletions shishua-half.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#ifndef SHISHUA_H
#define SHISHUA_H
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <immintrin.h>
typedef struct prng_state {
__m256i state[2];
Expand All @@ -8,7 +11,7 @@ typedef struct prng_state {
} prng_state;

// buf's size must be a multiple of 32 bytes.
inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) {
__m256i s0 = s->state[0], counter = s->counter,
s1 = s->state[1], o = s->output,
t0, t1, t2, t3, u0, u1, u2, u3;
Expand All @@ -30,7 +33,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
// for a tiny amount of variation stirring.
// I used the smallest odd numbers to avoid having a magic number.
__m256i increment = _mm256_set_epi64x(1, 3, 5, 7);
for (__uint64_t i = 0; i < size; i += 4) {
for (size_t i = 0; i < size; i += 4) {
_mm256_storeu_si256((__m256i*)&buf[i], o);

// I apply the counter to s1,
Expand Down Expand Up @@ -61,22 +64,22 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static __uint64_t phi[8] = {
static uint64_t phi[8] = {
0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95,
0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36,
};

prng_state prng_init(SEEDTYPE seed[4]) {
prng_state s;
s.counter = _mm256_set_epi64x(0, 0, 0, 0);
memset(&s, 0, sizeof(prng_state));
# define STEPS 5
# define ROUNDS 4
__uint64_t buf[4 * STEPS]; // 4 64-bit numbers per 256-bit SIMD.
uint64_t buf[4 * STEPS]; // 4 64-bit numbers per 256-bit SIMD.
// Diffuse first two seed elements in s0, then the last two. Same for s1.
// We must keep half of the state unchanged so users cannot set a bad state.
s.state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]);
s.state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]);
for (char i = 0; i < ROUNDS; i++) {
for (size_t i = 0; i < ROUNDS; i++) {
prng_gen(&s, buf, 4 * STEPS);
s.state[0] = s.state[1];
s.state[1] = s.output;
Expand Down
21 changes: 12 additions & 9 deletions shishua.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#ifndef SHISHUA_H
#define SHISHUA_H
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <immintrin.h>
typedef struct prng_state {
__m256i state[4];
Expand All @@ -8,7 +11,7 @@ typedef struct prng_state {
} prng_state;

// buf's size must be a multiple of 128 bytes.
inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
static inline void prng_gen(prng_state *s, uint64_t buf[], size_t size) {
__m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3],
s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3],
t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter;
Expand All @@ -30,7 +33,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
// for a tiny amount of variation stirring.
// I used the smallest odd numbers to avoid having a magic number.
__m256i increment = _mm256_set_epi64x(1, 3, 5, 7);
for (__uint64_t i = 0; i < size; i += 16) {
for (size_t i = 0; i < size; i += 16) {
_mm256_storeu_si256((__m256i*)&buf[i+ 0], o0);
_mm256_storeu_si256((__m256i*)&buf[i+ 4], o1);
_mm256_storeu_si256((__m256i*)&buf[i+ 8], o2);
Expand Down Expand Up @@ -72,7 +75,7 @@ inline void prng_gen(prng_state *s, __uint64_t buf[], __uint64_t size) {
// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static __uint64_t phi[16] = {
static uint64_t phi[16] = {
0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95,
0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36,
0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1,
Expand All @@ -81,18 +84,18 @@ static __uint64_t phi[16] = {

prng_state prng_init(SEEDTYPE seed[4]) {
prng_state s;
s.counter = _mm256_set_epi64x(0, 0, 0, 0);
# define STEPS 13
# define ROUNDS 1
__uint64_t buf[16 * ROUNDS]; // 16 64-bit numbers per 128-byte output.
memset(&s, 0, sizeof(prng_state));
# define STEPS 1
# define ROUNDS 13
uint64_t buf[16 * STEPS]; // 16 64-bit numbers per 128-byte output.
// Diffuse first two seed elements in s0, then the last two. Same for s1.
// We must keep half of the state unchanged so users cannot set a bad state.
s.state[0] = _mm256_set_epi64x(phi[ 3], phi[ 2] ^ seed[1], phi[ 1], phi[ 0] ^ seed[0]);
s.state[1] = _mm256_set_epi64x(phi[ 7], phi[ 6] ^ seed[3], phi[ 5], phi[ 4] ^ seed[2]);
s.state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[ 9], phi[ 8] ^ seed[2]);
s.state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]);
for (char i = 0; i < STEPS; i++) {
prng_gen(&s, buf, 16 * ROUNDS);
for (size_t i = 0; i < ROUNDS; i++) {
prng_gen(&s, buf, 16 * STEPS);
s.state[0] = s.output[3]; s.state[1] = s.output[2];
s.state[2] = s.output[1]; s.state[3] = s.output[0];
}
Expand Down
Loading

0 comments on commit 5bd346c

Please sign in to comment.