diff --git a/.appveyor.yml b/.appveyor.yml index 5cc9fd2..ab43d1d 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: 1.1.3.{build} +version: 1.2.0.{build} environment: matrix: @@ -21,6 +21,14 @@ platform: - x86 - x64 - ARM +- ARM64 + +matrix: + exclude: + - platform: ARM64 + TOOLSET: v140 + - platform: ARM64 + TOOLSET: v120 build_script: - ps: > diff --git a/.gitignore b/.gitignore index 2b2cc3a..aded5fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,18 @@ +.vs/ +ARM/ +ARM64/ +Win32/ libt1ha.a libt1ha.so +t1ha-dll.VC.db +t1ha-static.VC.db +t1ha-test.VC.db +t1ha.VC.VC.opendb +t1ha.VC.db +t1ha.config +t1ha.creator.user +t1ha.includes +t1ha0.o t1ha0_aes_avx.o t1ha0_aes_noavx.o t1ha0_ia32aes_avx.bc @@ -8,15 +21,6 @@ t1ha0_ia32aes_avx.s t1ha0_ia32aes_noavx.bc t1ha0_ia32aes_noavx.i t1ha0_ia32aes_noavx.s -t1ha0.o t1ha1.o -t1ha-dll.VC.db -t1ha-static.VC.db -t1ha-test.VC.db -t1ha.VC.db -t1ha.VC.VC.opendb test -.vs/ -ARM/ -Win32/ x64/ diff --git a/.travis.yml b/.travis.yml index f5f8cdc..a3a9f3a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,28 +10,20 @@ os: - linux - osx -script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi - env: global: - secure: "fQvkBkC9lpnhl6L9DsfXQWBaHR4UMR1wtLjPrSn7Sl8cwKfJ5MAR+K1Khz7BDkKV7MCN6no7QJ1MYbwSSp67XnvwWL10yt/KbaAZCMNRWgwenfiCvdN67sw4rliX3q3X6PllI2jUSJy8gFmHDZzqEH4ark6uq3Gb30gt2lSlxXd/vF4xDOjy/IyLhzbFQIhJScHLApxeOYuoHLEmta5gsXjwhK6dsftDAHGc6alyPu19h5p4wvv+bP2WcREvMQHK6u3vfeUrUGdNYboEHyJmM9qHnR7JPmGMEhuF/lWjc7DtWrvb5qAVSayeKbTOQWnk5sINN46mTEDfGXkHsjlFIau6mdIGDNIAhdge6ODuVyCTS4pTd0LQNTJG5pL0dFDTtiqaSkDU4M0/ofsO2O535dwFhdJz7qqqWacTYDJCPGvku1TmhfxNWyrFlp/I8shtYdMKmNtHQrtEBlLYCdnypow2VB+3tyvwve5LpwSY2BD7gY2NrP6fp7vHqfoan5PXfsxXuBt7LJDmHDBOvTils2RPbqiF0jG1Xk1YYWTUopLqEl2iYUqnOeg4XcS7wEwOpgbEqfrvHJ4BPUI+Rz1TMN19P9sgWS+hWuUMg+hka3ZVvzAI619eqzMnYVNTJjbDHZya3kSxfBuyO7RLjj+3UN88QClLoIKJ7Aa/50xZGY8=" -before_install: > - if [ "${COVERITY_SCAN_BRANCH}" = 1 ]; then +before_script: > + if [ "${TRAVIS_BRANCH}" = "coverity_scan" ]; then + # implement Coverity Scan with before_script instead of addons.coverity_scan if grep -q '[0-9]\+\.1$' <<< "${TRAVIS_JOB_NUMBER}"; then + export COVERITY_SCAN_BRANCH=1 echo -n | openssl s_client -connect scan.coverity.com:443 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | sudo tee -a /etc/ssl/certs/ca- + curl -s 'https://scan.coverity.com/scripts/travisci_build_coverity_scan.sh' | COVERITY_SCAN_PROJECT_NAME="$TRAVIS_REPO_SLUG" COVERITY_SCAN_NOTIFICATION_EMAIL="leo@yuriev.ru" COVERITY_SCAN_BUILD_COMMAND="make" COVERITY_SCAN_BUILD_COMMAND_PREPEND="" COVERITY_SCAN_BRANCH_PATTERN="$TRAVIS_BRANCH" bash else echo "Skip CoverityScan for unrelated os/compiler" - exit 0 fi fi -addons: - coverity_scan: - project: - name: "leo-yuriev/t1ha" - description: "Build submitted via Travis CI" - version: 1.0 - notification_email: leo@yuriev.ru - build_command: make - branch_pattern: coverity_scan +script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi diff --git a/LICENSE b/LICENSE index 7b9792c..d02db65 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ - Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com, + Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, Fast Positive Hash. Portions Copyright (c) 2010-2013 Leonid Yuriev , diff --git a/Makefile b/Makefile index 6498c17..872022c 100644 --- a/Makefile +++ b/Makefile @@ -5,60 +5,119 @@ # So, define it to 0 for calmness if doubt. 
T1HA_USE_FAST_ONESHOT_READ ?=1 -CFLAGS ?= -std=c99 +CFLAGS ?= -std=c99 -O3 -DNDEBUG -D_DEFAULT_SOURCE CC ?= gcc -TARGET_ARCHx86 = $(shell (export LC_ALL=C; ($(CC) --version 2>&1; $(CC) -v 2>&1) | grep -q -i -e '^Target: \(x86_64\)\|\([iI][3-6]86\)-.*' && echo yes || echo no)) - -OBJ_LIST := t1ha0.o t1ha1.o -BENCH_EXTRA := -ifeq ($(TARGET_ARCHx86),yes) +TARGET_ARCH_e2k = $(shell (export LC_ALL=C; ($(CC) --version 2>&1; $(CC) -v 2>&1) | grep -q -i 'e2k' && echo yes || echo no)) +TARGET_ARCH_ia32 = $(shell (export LC_ALL=C; ($(CC) --version 2>&1; $(CC) -v 2>&1) | grep -q -i -e '^Target: \(x86_64\)\|\([iI][3-6]86\)-.*' && echo yes || echo no)) + +OBJ_LIST := t1ha0.o t1ha1.o t1ha2.o +BENCH_EXTRA := bench.o mera.o test.o 4bench_xxhash.o +ifeq ($(TARGET_ARCH_e2k),yes) +TARGET_ARCH := e2k +CFLAGS += -mtune=native +OBJ_LIST += t1ha0_aes_noavx.o t1ha0_aes_avx.o +BENCH_EXTRA += 4bench_t1ha0_aes_noavx.o 4bench_t1ha0_aes_avx.o +else ifeq ($(TARGET_ARCH_ia32),yes) +TARGET_ARCH := ia32 +CFLAGS += -mtune=native OBJ_LIST += t1ha0_aes_noavx.o t1ha0_aes_avx.o t1ha0_aes_avx2.o BENCH_EXTRA += 4bench_t1ha0_aes_noavx.o 4bench_t1ha0_aes_avx.o 4bench_t1ha0_aes_avx2.o +else +TARGET_ARCH := portable endif -CFLAGS_TEST ?= -Wextra -Werror -O -g $(CFLAGS) -CFLAGS_LIB ?= -Wall -ffunction-sections -O3 -fPIC -g $(CFLAGS) -fvisibility=hidden -Dt1ha_EXPORTS +CFLAGS_TEST ?= -Wextra -Werror $(CFLAGS) +CFLAGS_LIB ?= -Wall -ffunction-sections -fPIC $(CFLAGS) -fvisibility=hidden -Dt1ha_EXPORTS all: test libt1ha.a libt1ha.so +clean: + rm -f test test32 test64 *.i *.bc *.s *.o *.a *.so t1ha0.o: t1ha.h src/t1ha_bits.h src/t1ha0.c Makefile $(CC) $(CFLAGS_LIB) -c -o $@ src/t1ha0.c +t1ha0_aes_noavx.o_ARCH_ia32_CFLAGS = -mno-avx2 -mno-avx -maes t1ha0_aes_noavx.o: t1ha.h src/t1ha_bits.h src/t1ha0_ia32aes_a.h src/t1ha0_ia32aes_b.h src/t1ha0_ia32aes_noavx.c Makefile - $(CC) $(CFLAGS_LIB) -save-temps -mno-avx2 -mno-avx -maes -c -o $@ src/t1ha0_ia32aes_noavx.c + $(CC) $(CFLAGS_LIB) -save-temps $($(@)_ARCH_$(TARGET_ARCH)_CFLAGS) -c -o $@ src/t1ha0_ia32aes_noavx.c +t1ha0_aes_avx.o_ARCH_ia32_CFLAGS = -mno-avx2 -mavx -maes t1ha0_aes_avx.o: t1ha.h src/t1ha_bits.h src/t1ha0_ia32aes_a.h src/t1ha0_ia32aes_b.h src/t1ha0_ia32aes_avx.c Makefile - $(CC) $(CFLAGS_LIB) -save-temps -mno-avx2 -mavx -maes -c -o $@ src/t1ha0_ia32aes_avx.c + $(CC) $(CFLAGS_LIB) -save-temps $($(@)_ARCH_$(TARGET_ARCH)_CFLAGS) -c -o $@ src/t1ha0_ia32aes_avx.c +t1ha0_aes_avx2.o_ARCH_ia32_CFLAGS = -mavx2 -mavx -maes t1ha0_aes_avx2.o: t1ha.h src/t1ha_bits.h src/t1ha0_ia32aes_a.h src/t1ha0_ia32aes_b.h src/t1ha0_ia32aes_avx2.c Makefile - $(CC) $(CFLAGS_LIB) -save-temps -mavx2 -mavx -maes -c -o $@ src/t1ha0_ia32aes_avx2.c + $(CC) $(CFLAGS_LIB) -save-temps $($(@)_ARCH_$(TARGET_ARCH)_CFLAGS) -c -o $@ src/t1ha0_ia32aes_avx2.c +4bench_t1ha0_aes_noavx.o_ARCH_ia32_CFLAGS = -mno-avx2 -mno-avx -maes 4bench_t1ha0_aes_noavx.o: t1ha.h src/t1ha_bits.h src/t1ha0_ia32aes_a.h src/t1ha0_ia32aes_b.h tests/4bench_t1ha0_ia32aes_noavx.c Makefile - $(CC) $(CFLAGS_LIB) -mno-avx2 -mno-avx -maes -c -o $@ tests/4bench_t1ha0_ia32aes_noavx.c + $(CC) $(CFLAGS_LIB) $($(@)_ARCH_$(TARGET_ARCH)_CFLAGS) -c -o $@ tests/4bench_t1ha0_ia32aes_noavx.c +4bench_t1ha0_aes_avx.o_ARCH_ia32_CFLAGS = -mno-avx2 -mavx -maes 4bench_t1ha0_aes_avx.o: t1ha.h src/t1ha_bits.h src/t1ha0_ia32aes_a.h src/t1ha0_ia32aes_b.h tests/4bench_t1ha0_ia32aes_avx.c Makefile - $(CC) $(CFLAGS_LIB) -mno-avx2 -mavx -maes -c -o $@ tests/4bench_t1ha0_ia32aes_avx.c + $(CC) $(CFLAGS_LIB) $($(@)_ARCH_$(TARGET_ARCH)_CFLAGS) -c -o $@ 
tests/4bench_t1ha0_ia32aes_avx.c +4bench_t1ha0_aes_avx2.o_ARCH_ia32_CFLAGS = -mavx2 -mavx -maes 4bench_t1ha0_aes_avx2.o: t1ha.h src/t1ha_bits.h src/t1ha0_ia32aes_a.h src/t1ha0_ia32aes_b.h tests/4bench_t1ha0_ia32aes_avx2.c Makefile - $(CC) $(CFLAGS_LIB) -mavx2 -mavx -maes -c -o $@ tests/4bench_t1ha0_ia32aes_avx2.c + $(CC) $(CFLAGS_LIB) $($(@)_ARCH_$(TARGET_ARCH)_CFLAGS) -c -o $@ tests/4bench_t1ha0_ia32aes_avx2.c t1ha1.o: t1ha.h src/t1ha_bits.h src/t1ha1.c Makefile $(CC) $(CFLAGS_LIB) -c -o $@ src/t1ha1.c -libt1ha.a: $(OBJ_LIST) test Makefile +t1ha2.o: t1ha.h src/t1ha_bits.h src/t1ha2.c Makefile + $(CC) $(CFLAGS_LIB) -c -o $@ src/t1ha2.c + +libt1ha.a: $(OBJ_LIST) Makefile $(AR) rs $@ $(OBJ_LIST) -libt1ha.so: $(OBJ_LIST) test Makefile +libt1ha.so: $(OBJ_LIST) Makefile $(CC) $(CFLAGS) -shared -s -o $@ $(OBJ_LIST) -test: $(OBJ_LIST) $(BENCH_EXTRA) tests/main.c Makefile - @echo "Target-ARCHx86: $(TARGET_ARCHx86)" || true +############################################################################### + +mera.o: t1ha.h tests/mera.h tests/mera.c \ + Makefile + $(CC) $(CFLAGS_TEST) -save-temps -c -o $@ tests/mera.c + +bench.o: t1ha.h tests/common.h tests/mera.h tests/bench.c \ + Makefile + $(CC) $(CFLAGS_TEST) -c -o $@ tests/bench.c + +test.o: t1ha.h tests/common.h tests/mera.h tests/test.c \ + Makefile + $(CC) $(CFLAGS_TEST) -c -o $@ tests/test.c + +4bench_xxhash.o: tests/xxhash/xxhash.h tests/xxhash/xxhash.c \ + Makefile + $(CC) $(CFLAGS_TEST) -Wno-error -c -o $@ tests/xxhash/xxhash.c + +test: $(OBJ_LIST) $(BENCH_EXTRA) tests/main.c Makefile \ + t1ha.h tests/common.h tests/mera.h \ + mera.o bench.o test.o + @echo "Target-ARCH: $(TARGET_ARCH)" || true $(CC) $(CFLAGS_TEST) -o $@ tests/main.c $(OBJ_LIST) $(BENCH_EXTRA) check: test ./test || rm -rf libt1ha.a libt1ha.so -clean: - rm -f test test32 test64 *.i *.bc *.s *.o *.a *.so +bench-verbose: test + ./test --bench-verbose || rm -rf libt1ha.a libt1ha.so + +############################################################################### + +# sparc64-linux-gnu-gcc - qemu troubles (sigaction, etc...) +# hppa-linux-gnu-gcc - don't supported by qemu +# hppa64-linux-gnu-gcc - gcc unable to cross-compiler +# s390x-linux-gnu-gcc - qemu troubles (hang) + +CROSS_LIST = sh4-linux-gnu-gcc alpha-linux-gnu-gcc \ + powerpc64-linux-gnu-gcc powerpc-linux-gnu-gcc \ + mips64-linux-gnuabi64-gcc mips-linux-gnu-gcc \ + arm-linux-gnueabihf-gcc aarch64-linux-gnu-gcc + +cross-gcc: + for CC in $(CROSS_LIST); do make clean && CC=$$CC make all || exit $$?; done + +cross-qemu: + for CC in $(CROSS_LIST); do make clean && CC=$$CC CFLAGS_TEST="-std=c99 -static" make bench-verbose || exit $$?; done diff --git a/README.md b/README.md index 8d96967..efc8c57 100644 --- a/README.md +++ b/README.md @@ -4,17 +4,19 @@ Fast Positive Hash, aka "Позитивный Хэш" by [Positive Technologies](https://www.ptsecurity.com). *The Future will Positive. 
Всё будет хорошо.* -[![Build Status](https://travis-ci.org/leo-yuriev/t1ha.svg?branch=master)](https://travis-ci.org/leo-yuriev/t1ha) -[![Build status](https://ci.appveyor.com/api/projects/status/ptug5fl2ouxdo68h/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/t1ha/branch/master) -[![CircleCI](https://circleci.com/gh/leo-yuriev/t1ha/tree/master.svg?style=svg)](https://circleci.com/gh/leo-yuriev/t1ha/tree/master) +[![Build Status](https://travis-ci.org/leo-yuriev/t1ha.svg?branch=devel)](https://travis-ci.org/leo-yuriev/t1ha) +[![Build status](https://ci.appveyor.com/api/projects/status/ptug5fl2ouxdo68h/branch/devel?svg=true)](https://ci.appveyor.com/project/leo-yuriev/t1ha/branch/devel) +[![CircleCI](https://circleci.com/gh/leo-yuriev/t1ha/tree/devel.svg?style=svg)](https://circleci.com/gh/leo-yuriev/t1ha/tree/devel) [![Coverity Scan Status](https://scan.coverity.com/projects/12918/badge.svg)](https://scan.coverity.com/projects/leo-yuriev-t1ha) ## Briefly, it is a portable 64-bit hash function: - 1. Intended for 64-bit little-endian platforms, predominantly for x86_64, + 1. Intended for 64-bit little-endian platforms, predominantly for Elbrus and x86_64, but portable and without penalties it can run on any 64-bit CPU. - 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash + 2. In most cases up to 15% faster than City, xxHash, mum-hash, metro-hash, etc. and all others portable hash-functions (which do not use specific hardware tricks). - 3. Currently not suitable for cryptography. + 3. Provides a set of _terraced_ hash functions. + 4. Currently not suitable for cryptography. + 5. Licensed under [Zlib License](https://en.wikipedia.org/wiki/Zlib_License). Also pay attention to [Erlang](https://github.com/lemenkov/erlang-t1ha) and [Golang](https://github.com/dgryski/go-t1ha) implementations. @@ -24,9 +26,11 @@ and [Golang](https://github.com/dgryski/go-t1ha) implementations. # Usage The `t1ha` library provides several terraced hash functions with the dissimilar properties and for a different cases. - These functions briefly described below, see [t1ha.h](t1ha.h) for more API details. +To use in your own project you may link with the t1ha-library, +or just add to your project corresponding source files from `/src` directory. + Please, feel free to fill an issue or make pull request. @@ -48,8 +52,8 @@ Please, feel free to fill an issue or make pull request. Also should be noted, the quality of t1ha0() hashing is a subject for tradeoffs with performance. Therefore the quality and strength - of t1ha0() may be lower than t1ha1(), especially on 32-bit targets, - but then much faster. + of `t1ha0()` may be lower than `t1ha1()` and `t1ha2()`, + especially on 32-bit targets, but then much faster. However, guaranteed that it passes all SMHasher tests. Internally t1ha0() selects most faster implementation for current CPU, @@ -66,10 +70,10 @@ Please, feel free to fill an issue or make pull request. | `t1ha1_be()` | 64-bit big-endian | -`t1ha1` = 64 bits, fast portable hash +`t1ha1` = 64 bits, baseline fast portable hash ------------------------------------- - The main generic version of "Fast Positive Hash" with reasonable quality + The first version of "Fast Positive Hash" with reasonable quality for checksum, hash tables and thin fingerprinting. It is stable, e.g. returns same result on all architectures and CPUs. @@ -77,18 +81,33 @@ Please, feel free to fill an issue or make pull request. 2. Efficiency on modern 64-bit CPUs, but not in a hardware. 3. 
Strong as possible, until no penalties on performance.

- The main version is intended for little-endian systems and will run
+ Unfortunately, [Yves Orton](https://github.com/demerphq/smhasher) discovered
+ that `t1ha1()` fails the strict avalanche criterion in some cases.
+ This flaw is insignificant for the purposes of `t1ha1()` and imperceptible
+ from a practical point of view.
+ However, this issue has now been resolved in the next `t1ha2()` function,
+ which was initially planned to provide a bit more quality.
+
+ The basic version of `t1ha1()` is intended for little-endian systems and will run
 slowly on big-endian. Therefore a dedicated big-endian version is also
- provided, but returns the different result than the main version.
+ provided, but returns a different result than the basic version.

-`t1ha2` = 64 bits, little more attention for quality and strength
+`t1ha2` = 64 and 128 bits, slightly more attention to quality and strength
 -----------------------------------------------------------------
- The next-step version of "Fast Positive Hash",
- but not yet finished and therefore not available.
+ The recommended version of "Fast Positive Hash" with good quality
+ for checksum, hash tables and fingerprinting. It is stable, i.e.
+ returns the same result on all architectures and CPUs.
+ 1. Portable and extremely efficient on modern 64-bit CPUs.
+ 2. Great quality of hashing and still faster than other non-t1ha hashes.
+ 3. Provides a streaming mode and a 128-bit result.

-`t1ha3` = 128 bits, fast non-cryptographic fingerprinting
+ `t1ha2()` is intended for little-endian systems and will run
+ slightly slower on big-endian systems.
+
+
+`t1ha3` = 128 and 256 bits, fast non-cryptographic fingerprinting
 ---------------------------------------------------------
 The next-step version of "Fast Positive Hash",
 but not yet finished and therefore not available.
@@ -124,6 +143,57 @@ for _The 1Hippeus project - zerocopy messaging in the spirit of Sparta!_
 ********************************************************************************
 ## Benchmarking and Testing
+
+The current version of the t1ha library includes a tool for basic testing and benchmarking.
+Just run `make check` from the t1ha directory.
+
+For comparison, the benchmark also includes 32- and 64-bit versions of the `xxhash()` function.
+For example:
+```
+$ CC=clang-5.0 make all && sudo make check
+...
+Preparing to benchmarking...
+ - suggest enable rdpmc for usermode (echo 2 | sudo tee /sys/devices/cpu/rdpmc)
+ - running on CPU#3
+ - use RDPMC_perf as clock source for benchmarking
+ - assume it cheap and stable
+ - measure granularity and overhead: 53 cycle, 0.0188679 iteration/cycle
+
+Bench for tiny keys (5 bytes):
+t1ha2_atonce : 13.070 cycle/hash, 2.614 cycle/byte, 0.383 byte/cycle, 1.148 Gb/s @3GHz
+t1ha1_64le   : 14.055 cycle/hash, 2.811 cycle/byte, 0.356 byte/cycle, 1.067 Gb/s @3GHz
+t1ha0        : 14.070 cycle/hash, 2.814 cycle/byte, 0.355 byte/cycle, 1.066 Gb/s @3GHz
+xxhash64     : 17.203 cycle/hash, 3.441 cycle/byte, 0.291 byte/cycle, 0.872 Gb/s @3GHz
+
+Bench for medium keys (1024 bytes):
+t1ha2_atonce : 266.500 cycle/hash, 0.260 cycle/byte, 3.842 byte/cycle, 11.527 Gb/s @3GHz
+t1ha1_64le   : 245.750 cycle/hash, 0.240 cycle/byte, 4.167 byte/cycle, 12.501 Gb/s @3GHz
+t1ha0        : 86.625 cycle/hash, 0.085 cycle/byte, 11.821 byte/cycle, 35.463 Gb/s @3GHz
+xxhash64     : 283.000 cycle/hash, 0.276 cycle/byte, 3.618 byte/cycle, 10.855 Gb/s @3GHz
+```
+
+The `test` tool supports a set of command-line options for selecting functions and key sizes for benchmarking.
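+
+(Note: the throughput column appears to be simply byte/cycle multiplied by the
+assumed 3 GHz clock, e.g. 3.842 byte/cycle × 3 GHz ≈ 11.527 Gb/s for `t1ha2_atonce`
+on 1024-byte keys.)
+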
+For more info please run `./test --help`. + + +One noteable option is `--hash-stdin-strings`, it intended to estimate hash collisions on your custom data. +With this option `test` tool will hash each line from standard input and print its hash to standard output. + +For instance, you could count collisions for lines from some `words.list` file by bash's command: +``` + ./t1ha/test --hash-stdin-strings < words.list | sort | uniq -c -d | wc -l +``` + +More complex example - count `xxhash()` collisions for lines from `words.list` and 0...10000 numbers, +with distinction only in 32 bit of hash values: +``` + (cat words.list && seq 0 10000) | \ + ./t1ha/test --xxhash --hash-stdin-strings | \ + cut --bytes=-8 | sort | uniq -c -d | wc -l +``` + + +### SMHasher [_SMHasher_](https://github.com/aappleby/smhasher/wiki) is a wellknown test suite designed to test the distribution, collision, and performance properties of non-cryptographic hash functions. diff --git a/src/t1ha0.c b/src/t1ha0.c index f567985..c50fa60 100644 --- a/src/t1ha0.c +++ b/src/t1ha0.c @@ -1,8 +1,8 @@ -/* - * Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com, +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, * Fast Positive Hash. * - * Portions Copyright (c) 2010-2017 Leonid Yuriev , + * Portions Copyright (c) 2010-2018 Leonid Yuriev , * The 1Hippeus project (t1h). * * This software is provided 'as-is', without any express or implied @@ -41,14 +41,9 @@ * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! */ -#if defined(_MSC_VER) && _MSC_VER > 1800 -#pragma warning(disable : 4464) /* relative include path contains '..' */ -#endif - -#include "../t1ha.h" #include "t1ha_bits.h" -static __inline uint32_t tail32_le(const void *v, size_t tail) { +static __always_inline uint32_t tail32_le(const void *v, size_t tail) { const uint8_t *p = (const uint8_t *)v; #ifdef can_read_underside /* On some systems (e.g. 
x86) we can perform a 'oneshot' read, which @@ -99,7 +94,7 @@ static __inline uint32_t tail32_le(const void *v, size_t tail) { unreachable(); } -static __inline uint32_t tail32_be(const void *v, size_t tail) { +static __always_inline uint32_t tail32_be(const void *v, size_t tail) { const uint8_t *p = (const uint8_t *)v; #ifdef can_read_underside /* On some systems we can perform a 'oneshot' read, which is little bit @@ -146,36 +141,39 @@ static __inline uint32_t tail32_be(const void *v, size_t tail) { /***************************************************************************/ #ifndef rot32 -static maybe_unused __inline uint32_t rot32(uint32_t v, unsigned s) { +static __maybe_unused __always_inline uint32_t rot32(uint32_t v, unsigned s) { return (v >> s) | (v << (32 - s)); } #endif /* rot32 */ -static __inline uint64_t remix32(uint32_t a, uint32_t b) { - a ^= rot32(b, 13); - uint64_t l = a | (uint64_t)b << 32; - l *= p0; - l ^= l >> 41; - return l; -} - -static __inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v, uint32_t p) { - uint64_t l = mul_32x32_64(*b + v, p); +static __always_inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v, + uint32_t prime) { + uint64_t l = mul_32x32_64(*b + v, prime); *a ^= (uint32_t)l; *b += (uint32_t)(l >> 32); } +static __always_inline uint64_t final32(uint32_t a, uint32_t b) { + uint64_t l = (b ^ rot32(a, 13)) | (uint64_t)a << 32; + l *= prime_0; + l ^= l >> 41; + l *= prime_4; + l ^= l >> 47; + l *= prime_6; + return l; +} + /* 32-bit 'magic' primes */ -static const uint32_t q0 = UINT32_C(0x92D78269); -static const uint32_t q1 = UINT32_C(0xCA9B4735); -static const uint32_t q2 = UINT32_C(0xA4ABA1C3); -static const uint32_t q3 = UINT32_C(0xF6499843); -static const uint32_t q4 = UINT32_C(0x86F0FD61); -static const uint32_t q5 = UINT32_C(0xCA2DA6FB); -static const uint32_t q6 = UINT32_C(0xC4BB3575); +static const uint32_t prime32_0 = UINT32_C(0x92D78269); +static const uint32_t prime32_1 = UINT32_C(0xCA9B4735); +static const uint32_t prime32_2 = UINT32_C(0xA4ABA1C3); +static const uint32_t prime32_3 = UINT32_C(0xF6499843); +static const uint32_t prime32_4 = UINT32_C(0x86F0FD61); +static const uint32_t prime32_5 = UINT32_C(0xCA2DA6FB); +static const uint32_t prime32_6 = UINT32_C(0xC4BB3575); uint64_t t1ha0_32le(const void *data, size_t len, uint64_t seed) { - uint32_t a = rot32((uint32_t)len, s1) + (uint32_t)seed; + uint32_t a = rot32((uint32_t)len, 17) + (uint32_t)seed; uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; @@ -196,19 +194,19 @@ uint64_t t1ha0_32le(const void *data, size_t len, uint64_t seed) { uint32_t w3 = fetch32_le(v + 3); uint32_t c02 = w0 ^ rot32(w2 + c, 11); - uint32_t d13 = w1 + rot32(w3 + d, s1); + uint32_t d13 = w1 + rot32(w3 + d, 17); c ^= rot32(b + w1, 7); d ^= rot32(a + w0, 3); - b = q1 * (c02 + w3); - a = q0 * (d13 ^ w2); + b = prime32_1 * (c02 + w3); + a = prime32_0 * (d13 ^ w2); data = (const uint32_t *)data + 4; } while (likely(data < detent)); c += a; d += b; - a ^= q6 * (rot32(c, 16) + d); - b ^= q5 * (c + rot32(d, 16)); + a ^= prime32_6 * (rot32(c, 16) + d); + b ^= prime32_5 * (c + rot32(d, 16)); len &= 15; } @@ -219,36 +217,36 @@ uint64_t t1ha0_32le(const void *data, size_t len, uint64_t seed) { switch (len) { default: - mixup32(&a, &b, fetch32_le(v), q4); + mixup32(&a, &b, fetch32_le(v), prime32_4); v += 4; /* fall through */ case 12: case 11: case 10: case 9: - mixup32(&b, &a, fetch32_le(v), q3); + mixup32(&b, &a, fetch32_le(v), prime32_3); 
v += 4; /* fall through */ case 8: case 7: case 6: case 5: - mixup32(&a, &b, fetch32_le(v), q2); + mixup32(&a, &b, fetch32_le(v), prime32_2); v += 4; /* fall through */ case 4: case 3: case 2: case 1: - mixup32(&b, &a, tail32_le(v, len), q1); + mixup32(&b, &a, tail32_le(v, len), prime32_1); /* fall through */ case 0: - return remix32(a, b); + return final32(a, b); } } uint64_t t1ha0_32be(const void *data, size_t len, uint64_t seed) { - uint32_t a = rot32((uint32_t)len, s1) + (uint32_t)seed; + uint32_t a = rot32((uint32_t)len, 17) + (uint32_t)seed; uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; @@ -269,19 +267,19 @@ uint64_t t1ha0_32be(const void *data, size_t len, uint64_t seed) { uint32_t w3 = fetch32_be(v + 3); uint32_t c02 = w0 ^ rot32(w2 + c, 11); - uint32_t d13 = w1 + rot32(w3 + d, s1); + uint32_t d13 = w1 + rot32(w3 + d, 17); c ^= rot32(b + w1, 7); d ^= rot32(a + w0, 3); - b = q1 * (c02 + w3); - a = q0 * (d13 ^ w2); + b = prime32_1 * (c02 + w3); + a = prime32_0 * (d13 ^ w2); data = (const uint32_t *)data + 4; } while (likely(data < detent)); c += a; d += b; - a ^= q6 * (rot32(c, 16) + d); - b ^= q5 * (c + rot32(d, 16)); + a ^= prime32_6 * (rot32(c, 16) + d); + b ^= prime32_5 * (c + rot32(d, 16)); len &= 15; } @@ -292,41 +290,37 @@ uint64_t t1ha0_32be(const void *data, size_t len, uint64_t seed) { switch (len) { default: - mixup32(&a, &b, fetch32_be(v), q4); + mixup32(&a, &b, fetch32_be(v), prime32_4); v += 4; /* fall through */ case 12: case 11: case 10: case 9: - mixup32(&b, &a, fetch32_be(v), q3); + mixup32(&b, &a, fetch32_be(v), prime32_3); v += 4; /* fall through */ case 8: case 7: case 6: case 5: - mixup32(&a, &b, fetch32_be(v), q2); + mixup32(&a, &b, fetch32_be(v), prime32_2); v += 4; /* fall through */ case 4: case 3: case 2: case 1: - mixup32(&b, &a, tail32_be(v, len), q1); + mixup32(&b, &a, tail32_be(v, len), prime32_1); /* fall through */ case 0: - return remix32(a, b); + return final32(a, b); } } /***************************************************************************/ -#undef T1HA_ia32aes_AVAILABLE -#if defined(__x86_64__) || (defined(_M_IX86) && _MSC_VER > 1800) || \ - defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ - defined(_X86_64_) - +#ifdef __ia32__ static uint64_t x86_cpu_features(void) { uint32_t features = 0; uint32_t extended = 0; @@ -353,23 +347,19 @@ static uint64_t x86_cpu_features(void) { #endif return features | (uint64_t)extended << 32; } - -#define T1HA_ia32aes_AVAILABLE - -uint64_t t1ha0_ia32aes_avx(const void *data, size_t len, uint64_t seed); -uint64_t t1ha0_ia32aes_noavx(const void *data, size_t len, uint64_t seed); - -#endif /* __i386__ || __x86_64__ */ +#endif /* __ia32__ */ /***************************************************************************/ +#ifdef T1HA0_RUNTIME_SELECT + static #if __GNUC_PREREQ(4, 0) || __has_attribute(used) __attribute__((used)) #endif uint64_t (*t1ha0_resolve(void))(const void *, size_t, uint64_t) { -#ifdef T1HA_ia32aes_AVAILABLE +#ifdef __ia32__ uint64_t features = x86_cpu_features(); if (features & UINT32_C(0x02000000) /* check for AES-NI */) { if ((features & UINT32_C(0x1A000000)) == @@ -378,16 +368,21 @@ static return ((features >> 32) & 32) ? t1ha0_ia32aes_avx2 : t1ha0_ia32aes_avx; return t1ha0_ia32aes_noavx; } -#endif /* T1HA_ia32aes_AVAILABLE */ +#endif /* __ia32__ */ - return (sizeof(size_t) >= 8) #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - ? 
t1ha1_be - : t1ha0_32be; +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul + return t1ha1_be; +#else + return t1ha0_32be; +#endif +#else /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul + return t1ha1_le; #else - ? t1ha1_le - : t1ha0_32le; + return t1ha0_32le; #endif +#endif /* __BYTE_ORDER__ */ } #ifdef __ELF__ @@ -417,4 +412,5 @@ static uint64_t t1ha0_proxy(const void *data, size_t len, uint64_t seed) { uint64_t (*t1ha0_funcptr)(const void *, size_t, uint64_t) = t1ha0_proxy; -#endif +#endif /* !ELF */ +#endif /* T1HA0_RUNTIME_SELECT */ diff --git a/src/t1ha0_ia32aes_a.h b/src/t1ha0_ia32aes_a.h index acdcecb..91c06a5 100644 --- a/src/t1ha0_ia32aes_a.h +++ b/src/t1ha0_ia32aes_a.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com, + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, * Fast Positive Hash. * - * Portions Copyright (c) 2010-2017 Leonid Yuriev , + * Portions Copyright (c) 2010-2018 Leonid Yuriev , * The 1Hippeus project (t1h). * * This software is provided 'as-is', without any express or implied @@ -41,16 +41,9 @@ * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! */ -#if defined(_MSC_VER) && _MSC_VER > 1800 -#pragma warning(disable : 4464) /* relative include path contains '..' */ -#endif - -#include "../t1ha.h" #include "t1ha_bits.h" -#if defined(_X86_64_) || defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || (defined(_M_IX86) && _MSC_VER > 1800) || \ - defined(i386) || defined(_X86_) +#ifdef T1HA0_AESNI_AVAILABLE uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { uint64_t a = seed; @@ -58,7 +51,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { if (unlikely(len > 32)) { __m128i x = _mm_set_epi64x(a, b); - __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(p0, p1)); + __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_5, prime_6)); const __m128i *__restrict v = (const __m128i *)data; const __m128i *__restrict const detent = @@ -146,7 +139,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { const uint64_t *v = (const uint64_t *)data; switch (len) { default: - b += mux64(*v++, p4); + mixup64(&a, &b, *v++, prime_4); /* fall through */ case 24: case 23: @@ -156,7 +149,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { case 19: case 18: case 17: - a += mux64(*v++, p3); + mixup64(&b, &a, *v++, prime_3); /* fall through */ case 16: case 15: @@ -166,7 +159,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { case 11: case 10: case 9: - b += mux64(*v++, p2); + mixup64(&a, &b, *v++, prime_2); /* fall through */ case 8: case 7: @@ -176,12 +169,12 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { case 3: case 2: case 1: - a += mux64(tail64_le(v, len), p1); + mixup64(&b, &a, tail64_le(v, len), prime_1); /* fall through */ case 0: - return mux64(rot64(a + b, s1), p4) + mix64(a ^ b, p0); + return final64(a, b); } } -#endif /* __i386__ || __x86_64__ */ +#endif /* T1HA0_AESNI_AVAILABLE */ #undef T1HA_IA32AES_NAME diff --git a/src/t1ha0_ia32aes_b.h b/src/t1ha0_ia32aes_b.h index da90f0e..573736e 100644 --- a/src/t1ha0_ia32aes_b.h +++ b/src/t1ha0_ia32aes_b.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com, + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, * Fast Positive Hash. 
* - * Portions Copyright (c) 2010-2017 Leonid Yuriev , + * Portions Copyright (c) 2010-2018 Leonid Yuriev , * The 1Hippeus project (t1h). * * This software is provided 'as-is', without any express or implied @@ -41,16 +41,9 @@ * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! */ -#if defined(_MSC_VER) && _MSC_VER > 1800 -#pragma warning(disable : 4464) /* relative include path contains '..' */ -#endif - -#include "../t1ha.h" #include "t1ha_bits.h" -#if defined(_X86_64_) || defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || (defined(_M_IX86) && _MSC_VER > 1800) || \ - defined(i386) || defined(_X86_) +#ifdef T1HA0_AESNI_AVAILABLE uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { uint64_t a = seed; @@ -58,7 +51,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { if (unlikely(len > 32)) { __m128i x = _mm_set_epi64x(a, b); - __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(p0, p1)); + __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_0, prime_1)); const __m128i *v = (const __m128i *)data; const __m128i *const detent = @@ -84,7 +77,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); - salt = _mm_add_epi64(salt, _mm_set_epi64x(p2, p3)); + salt = _mm_add_epi64(salt, _mm_set_epi64x(prime_5, prime_6)); t = _mm_aesenc_si128(x, t); x = _mm_add_epi64(y, x); y = t; @@ -131,7 +124,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { const uint64_t *v = (const uint64_t *)data; switch (len) { default: - b += mux64(*v++, p4); + mixup64(&a, &b, *v++, prime_4); /* fall through */ case 24: case 23: @@ -141,7 +134,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { case 19: case 18: case 17: - a += mux64(*v++, p3); + mixup64(&b, &a, *v++, prime_3); /* fall through */ case 16: case 15: @@ -151,7 +144,7 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { case 11: case 10: case 9: - b += mux64(*v++, p2); + mixup64(&a, &b, *v++, prime_2); /* fall through */ case 8: case 7: @@ -161,12 +154,12 @@ uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { case 3: case 2: case 1: - a += mux64(tail64_le(v, len), p1); + mixup64(&b, &a, tail64_le(v, len), prime_1); /* fall through */ case 0: - return mux64(rot64(a + b, s1), p4) + mix64(a ^ b, p0); + return final64(a, b); } } -#endif /* __i386__ || __x86_64__ */ +#endif /* T1HA0_AESNI_AVAILABLE */ #undef T1HA_IA32AES_NAME diff --git a/src/t1ha1.c b/src/t1ha1.c index 8c8e498..6bc4adc 100644 --- a/src/t1ha1.c +++ b/src/t1ha1.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com, + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, * Fast Positive Hash. * - * Portions Copyright (c) 2010-2017 Leonid Yuriev , + * Portions Copyright (c) 2010-2018 Leonid Yuriev , * The 1Hippeus project (t1h). * * This software is provided 'as-is', without any express or implied @@ -41,13 +41,22 @@ * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! */ -#if defined(_MSC_VER) && _MSC_VER > 1800 -#pragma warning(disable : 4464) /* relative include path contains '..' 
*/ -#endif - -#include "../t1ha.h" #include "t1ha_bits.h" +/* xor-mul-xor mixer */ +static __inline uint64_t mix64(uint64_t v, uint64_t p) { + v *= p; + return v ^ rot64(v, 41); +} + +static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) { + /* LY: for performance reason on a some not high-end CPUs + * I replaced the second mux64() operation by mix64(). + * Unfortunately this approach fails the "strict avalanche criteria", + * see test results at https://github.com/demerphq/smhasher. */ + return mux64(rot64(a + b, 17), prime_4) + mix64(a ^ b, prime_0); +} + uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { uint64_t a = seed; uint64_t b = len; @@ -56,8 +65,8 @@ uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { uint64_t align[4]; if (unlikely(len > 32)) { - uint64_t c = rot64(len, s1) + seed; - uint64_t d = len ^ rot64(seed, s1); + uint64_t c = rot64(len, 17) + seed; + uint64_t d = len ^ rot64(seed, 17); const void *detent = (const uint8_t *)data + len - 31; do { const uint64_t *v = (const uint64_t *)data; @@ -69,17 +78,17 @@ uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { uint64_t w2 = fetch64_le(v + 2); uint64_t w3 = fetch64_le(v + 3); - uint64_t d02 = w0 ^ rot64(w2 + d, s1); - uint64_t c13 = w1 ^ rot64(w3 + c, s1); - c += a ^ rot64(w0, s0); - d -= b ^ rot64(w1, s2); - a ^= p1 * (d02 + w3); - b ^= p0 * (c13 + w2); + uint64_t d02 = w0 ^ rot64(w2 + d, 17); + uint64_t c13 = w1 ^ rot64(w3 + c, 17); + c += a ^ rot64(w0, 41); + d -= b ^ rot64(w1, 31); + a ^= prime_1 * (d02 + w3); + b ^= prime_0 * (c13 + w2); data = (const uint64_t *)data + 4; } while (likely(data < detent)); - a ^= p6 * (rot64(c, s1) + d); - b ^= p5 * (c + rot64(d, s1)); + a ^= prime_6 * (rot64(c, 17) + d); + b ^= prime_5 * (c + rot64(d, 17)); len &= 31; } @@ -89,7 +98,7 @@ uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { switch (len) { default: - b += mux64(fetch64_le(v++), p4); + b += mux64(fetch64_le(v++), prime_4); /* fall through */ case 24: case 23: @@ -99,7 +108,7 @@ uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { case 19: case 18: case 17: - a += mux64(fetch64_le(v++), p3); + a += mux64(fetch64_le(v++), prime_3); /* fall through */ case 16: case 15: @@ -109,7 +118,7 @@ uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { case 11: case 10: case 9: - b += mux64(fetch64_le(v++), p2); + b += mux64(fetch64_le(v++), prime_2); /* fall through */ case 8: case 7: @@ -119,10 +128,10 @@ uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { case 3: case 2: case 1: - a += mux64(tail64_le(v, len), p1); + a += mux64(tail64_le(v, len), prime_1); /* fall through */ case 0: - return mux64(rot64(a + b, s1), p4) + mix64(a ^ b, p0); + return final_weak_avalanche(a, b); } } @@ -134,8 +143,8 @@ uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { uint64_t align[4]; if (unlikely(len > 32)) { - uint64_t c = rot64(len, s1) + seed; - uint64_t d = len ^ rot64(seed, s1); + uint64_t c = rot64(len, 17) + seed; + uint64_t d = len ^ rot64(seed, 17); const void *detent = (const uint8_t *)data + len - 31; do { const uint64_t *v = (const uint64_t *)data; @@ -147,17 +156,17 @@ uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { uint64_t w2 = fetch64_be(v + 2); uint64_t w3 = fetch64_be(v + 3); - uint64_t d02 = w0 ^ rot64(w2 + d, s1); - uint64_t c13 = w1 ^ rot64(w3 + c, s1); - c += a ^ rot64(w0, s0); - d -= b ^ rot64(w1, s2); - a ^= p1 * (d02 + w3); - b ^= p0 * (c13 + w2); + uint64_t d02 = w0 ^ rot64(w2 + d, 17); + 
uint64_t c13 = w1 ^ rot64(w3 + c, 17); + c += a ^ rot64(w0, 41); + d -= b ^ rot64(w1, 31); + a ^= prime_1 * (d02 + w3); + b ^= prime_0 * (c13 + w2); data = (const uint64_t *)data + 4; } while (likely(data < detent)); - a ^= p6 * (rot64(c, s1) + d); - b ^= p5 * (c + rot64(d, s1)); + a ^= prime_6 * (rot64(c, 17) + d); + b ^= prime_5 * (c + rot64(d, 17)); len &= 31; } @@ -167,7 +176,7 @@ uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { switch (len) { default: - b += mux64(fetch64_be(v++), p4); + b += mux64(fetch64_be(v++), prime_4); /* fall through */ case 24: case 23: @@ -177,7 +186,7 @@ uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { case 19: case 18: case 17: - a += mux64(fetch64_be(v++), p3); + a += mux64(fetch64_be(v++), prime_3); /* fall through */ case 16: case 15: @@ -187,7 +196,7 @@ uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { case 11: case 10: case 9: - b += mux64(fetch64_be(v++), p2); + b += mux64(fetch64_be(v++), prime_2); /* fall through */ case 8: case 7: @@ -197,9 +206,9 @@ uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { case 3: case 2: case 1: - a += mux64(tail64_be(v, len), p1); + a += mux64(tail64_be(v, len), prime_1); /* fall through */ case 0: - return mux64(rot64(a + b, s1), p4) + mix64(a ^ b, p0); + return final_weak_avalanche(a, b); } } diff --git a/src/t1ha2.c b/src/t1ha2.c new file mode 100644 index 0000000..e3b5168 --- /dev/null +++ b/src/t1ha2.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev , + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* + * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } + * by [Positive Technologies](https://www.ptsecurity.ru) + * + * Briefly, it is a 64-bit Hash Function: + * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, + * but portable and without penalties it can run on any 64-bit CPU. + * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash + * and all others portable hash-functions (which do not use specific + * hardware tricks). + * 3. Not suitable for cryptography. + * + * The Future will Positive. Всё будет хорошо. + * + * ACKNOWLEDGEMENT: + * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) + * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! 
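+ *
+ * This file adds the t1ha2 family: the one-shot t1ha2_atonce() and
+ * t1ha2_atonce128(), plus the streaming t1ha2_init() / t1ha2_update() /
+ * t1ha2_final() interface.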
+ */ + +#include "t1ha_bits.h" + +static __always_inline void init_ab(t1ha_state256_t *s, uint64_t x, + uint64_t y) { + s->n.a = x; + s->n.b = y; +} + +static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x, + uint64_t y) { + s->n.c = rot64(y, 23) + ~x; + s->n.d = ~y + rot64(x, 19); +} + +static __always_inline void update(t1ha_state256_t *__restrict s, + const uint64_t *__restrict v) { + uint64_t w0 = fetch64_le(v + 0); + uint64_t w1 = fetch64_le(v + 1); + uint64_t w2 = fetch64_le(v + 2); + uint64_t w3 = fetch64_le(v + 3); + + uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); + uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); +#ifdef __e2k__ + /* FIXME: temporary workaround for lcc's ELBRUS scheduling bug (LY) */ + s->n.c ^= s->n.a + rot64(w0, 57); + s->n.d ^= s->n.b + rot64(w1, 38); +#else + s->n.d ^= s->n.b + rot64(w1, 38); + s->n.c ^= s->n.a + rot64(w0, 57); +#endif + s->n.b ^= prime_6 * (c13 + w2); + s->n.a ^= prime_5 * (d02 + w3); +} + +static __always_inline void squash(t1ha_state256_t *s) { + s->n.a ^= prime_6 * (s->n.c + rot64(s->n.d, 23)); + s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d); +} + +static __always_inline const void * +loop(bool need_copy4align, uint64_t *__restrict buffer4align, + t1ha_state256_t *__restrict s, const void *__restrict data, size_t len) { + const void *detent = (const uint8_t *)data + len - 31; + do { + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_copy4align)) + v = (const uint64_t *)memcpy(buffer4align, v, 32); + update(s, v); + data = (const uint64_t *)data + 4; + } while (likely(data < detent)); + return data; +} + +static __always_inline void tail_ab(t1ha_state256_t *__restrict s, + const uint64_t *__restrict v, size_t len) { + switch (len) { + default: + mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_4); + /* fall through */ + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3); + /* fall through */ + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + mixup64(&s->n.b, &s->n.a, tail64_le(v, len), prime_1); + /* fall through */ + case 0: + return; + } +} + +static __always_inline void tail_abcd(t1ha_state256_t *__restrict s, + const uint64_t *__restrict v, + size_t len) { + switch (len) { + default: + mixup64(&s->n.a, &s->n.d, fetch64_le(v++), prime_4); + /* fall through */ + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3); + /* fall through */ + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + mixup64(&s->n.c, &s->n.b, fetch64_le(v++), prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + mixup64(&s->n.d, &s->n.c, tail64_le(v, len), prime_1); + /* fall through */ + case 0: + return; + } +} + +static __always_inline uint64_t final128(uint64_t a, uint64_t b, uint64_t c, + uint64_t d, uint64_t *h) { + mixup64(&a, &b, rot64(c, 41) ^ d, prime_0); + mixup64(&b, &c, rot64(d, 23) ^ a, prime_6); + mixup64(&c, &d, rot64(a, 19) ^ b, prime_5); + mixup64(&d, &a, rot64(b, 31) ^ c, prime_4); + *h = c + d; + return a ^ b; +} + +//------------------------------------------------------------------------------ + +uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) { 
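+  /* One-shot hashing: seed and length are mixed into the (a, b) halves of
+   * the state; inputs longer than 32 bytes additionally initialize (c, d),
+   * are consumed in 32-byte blocks by loop()/update() and folded back via
+   * squash(); the remaining tail is absorbed by tail_ab() before final64()
+   * produces the result. */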
+ t1ha_state256_t state; + init_ab(&state, seed, length); + + const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + uint64_t buffer4align[4]; + + if (unlikely(length > 32)) { + init_cd(&state, seed, length); + data = loop(need_copy4align, buffer4align, &state, data, length); + squash(&state); + length &= 31; + } + + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_copy4align) && length > 8) + v = (const uint64_t *)memcpy(&buffer4align, v, length); + + tail_ab(&state, v, length); + return final64(state.n.a, state.n.b); +} + +uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, + const void *__restrict data, size_t length, + uint64_t seed) { + t1ha_state256_t state; + init_ab(&state, seed, length); + init_cd(&state, seed, length); + + const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + uint64_t buffer4align[4]; + + if (unlikely(length > 32)) { + data = loop(need_copy4align, buffer4align, &state, data, length); + length &= 31; + } + + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_copy4align) && length > 8) + v = (const uint64_t *)memcpy(&buffer4align, v, length); + + tail_abcd(&state, v, length); + return final128(state.n.a, state.n.b, state.n.c, state.n.d, extra_result); +} + +//------------------------------------------------------------------------------ + +void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y) { + init_ab(&ctx->state, seed_x, seed_y); + init_cd(&ctx->state, seed_x, seed_y); + ctx->partial = 0; + ctx->total = 0; +} + +void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data, + size_t length) { + ctx->total += length; + + if (ctx->partial) { + const size_t left = 32 - ctx->partial; + const size_t chunk = (length >= left) ? left : length; + memcpy(ctx->buffer.bytes + ctx->partial, data, chunk); + ctx->partial += chunk; + if (ctx->partial < 32) { + assert(left >= length); + return; + } + ctx->partial = 0; + data = (const uint8_t *)data + chunk; + length -= chunk; + update(&ctx->state, ctx->buffer.u64); + } + + if (length >= 32) { + const bool need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + if (need_copy4align) + data = loop(true, ctx->buffer.u64, &ctx->state, data, length); + else + data = loop(false, NULL, &ctx->state, data, length); + length &= 31; + } + + if (length) + memcpy(ctx->buffer.bytes, data, ctx->partial = length); +} + +uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, + uint64_t *__restrict extra_result) { + uint64_t bytes = (ctx->total << 3) ^ (UINT64_C(1) << 63); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + bytes = bswap64(bytes); +#endif + t1ha2_update(ctx, &bytes, 8); + + if (likely(!extra_result)) { + squash(&ctx->state); + tail_ab(&ctx->state, ctx->buffer.u64, ctx->partial); + return final64(ctx->state.n.a, ctx->state.n.b); + } + + tail_abcd(&ctx->state, ctx->buffer.u64, ctx->partial); + return final128(ctx->state.n.a, ctx->state.n.b, ctx->state.n.c, + ctx->state.n.d, extra_result); +} diff --git a/src/t1ha_bits.h b/src/t1ha_bits.h index aa42c16..1059b82 100644 --- a/src/t1ha_bits.h +++ b/src/t1ha_bits.h @@ -1,8 +1,8 @@ -/* - * Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com, +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, * Fast Positive Hash. * - * Portions Copyright (c) 2010-2017 Leonid Yuriev , + * Portions Copyright (c) 2010-2018 Leonid Yuriev , * The 1Hippeus project (t1h). 
* * This software is provided 'as-is', without any express or implied @@ -42,49 +42,28 @@ */ #pragma once -#ifndef T1HA_USE_FAST_ONESHOT_READ +#if defined(_MSC_VER) +#pragma warning(disable : 4201) /* nameless struct/union */ +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif /* 1800 */ +#endif /* MSVC */ +#include "../t1ha.h" + +#ifndef T1HA_USE_FAST_ONESHOT_READ /* Define it to 1 for little bit faster code. * Unfortunately this may triggering a false-positive alarms from Valgrind, * AddressSanitizer and other similar tool. * So, define it to 0 for calmness if doubt. */ #define T1HA_USE_FAST_ONESHOT_READ 1 - #endif /* T1HA_USE_FAST_ONESHOT_READ */ /*****************************************************************************/ -#include /* for memcpy() */ - -#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ - !defined(__ORDER_BIG_ENDIAN__) -#ifndef _MSC_VER -#include /* for endianness */ -#endif -#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) -#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN -#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN -#define __BYTE_ORDER__ __BYTE_ORDER -#else -#define __ORDER_LITTLE_ENDIAN__ 1234 -#define __ORDER_BIG_ENDIAN__ 4321 -#if defined(__LITTLE_ENDIAN__) || defined(_LITTLE_ENDIAN) || \ - defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ - defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ - defined(__i386) || defined(__x86_64__) || defined(_M_IX86) || \ - defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ - defined(_X86_64_) || defined(_M_ARM) || defined(_M_ARM64) || \ - defined(__e2k__) -#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ -#elif defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(__ARMEB__) || \ - defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(__MIPSEB__) || \ - defined(_MIPSEB) || defined(__MIPSEB) || defined(_M_IA64) -#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ -#else -#error __BYTE_ORDER__ should be defined. 
-#endif -#endif -#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +#include /* for assert() */ +#include /* for bool */ +#include /* for memcpy() */ #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ @@ -92,9 +71,7 @@ #endif #if !defined(UNALIGNED_OK) -#if defined(__i386) || defined(__x86_64__) || defined(_M_IX86) || \ - defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ - defined(_X86_64_) +#if defined(__ia32__) || defined(__e2k__) #define UNALIGNED_OK 1 #define PAGESIZE 4096 #else @@ -102,28 +79,47 @@ #endif #endif +/***************************************************************************/ + #ifndef __has_builtin #define __has_builtin(x) (0) #endif #if __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__i386__) || defined(__x86_64__) -#include +#if defined(__ia32__) || defined(__e2k__) #include #endif + +#if defined(__ia32__) +#include +#endif + +#ifndef likely #define likely(cond) __builtin_expect(!!(cond), 1) +#endif + +#ifndef unlikely #define unlikely(cond) __builtin_expect(!!(cond), 0) -#if __GNUC_PREREQ(4, 5) || defined(__clang__) +#endif + +#if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable) #define unreachable() __builtin_unreachable() #endif + #define bswap64(v) __builtin_bswap64(v) #define bswap32(v) __builtin_bswap32(v) #if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) #define bswap16(v) __builtin_bswap16(v) #endif -#if __GNUC_PREREQ(4, 3) || __has_attribute(unused) -#define maybe_unused __attribute__((unused)) + +#if !defined(__maybe_unused) && (__GNUC_PREREQ(4, 3) || __has_attribute(unused)) +#define __maybe_unused __attribute__((unused)) +#endif + +#if !defined(__always_inline) && \ + (__GNUC_PREREQ(3, 2) || __has_attribute(always_inline)) +#define __always_inline __inline __attribute__((always_inline)) #endif #elif defined(_MSC_VER) @@ -152,11 +148,14 @@ #define bswap16(v) _byteswap_ushort(v) #define rot64(v, s) _rotr64(v, s) #define rot32(v, s) _rotr(v, s) -#define __inline __forceinline +#define __always_inline __forceinline -#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64) +#if defined(_M_X64) || defined(_M_IA64) #pragma intrinsic(_umul128) #define mul_64x64_128(a, b, ph) _umul128(a, b, ph) +#endif + +#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64) #pragma intrinsic(__umulh) #define mul_64x64_high(a, b) __umulh(a, b) #endif @@ -184,8 +183,11 @@ #ifndef unlikely #define unlikely(cond) (cond) #endif -#ifndef maybe_unused -#define maybe_unused +#ifndef __maybe_unused +#define __maybe_unused +#endif +#ifndef __always_inline +#define __always_inline __inline #endif #ifndef unreachable #define unreachable() \ @@ -199,7 +201,7 @@ #elif defined(__bswap_64) #define bswap64 __bswap_64 #else -static __inline uint64_t bswap64(uint64_t v) { +static __always_inline uint64_t bswap64(uint64_t v) { return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | ((v << 24) & UINT64_C(0x0000ff0000000000)) | ((v << 8) & UINT64_C(0x000000ff00000000)) | @@ -216,7 +218,7 @@ static __inline uint64_t bswap64(uint64_t v) { #elif defined(__bswap_32) #define bswap32 __bswap_32 #else -static __inline uint32_t bswap32(uint32_t v) { +static __always_inline uint32_t bswap32(uint32_t v) { return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | ((v >> 8) & UINT32_C(0x0000ff00)); } @@ -229,13 +231,13 @@ static __inline uint32_t bswap32(uint32_t v) { #elif defined(__bswap_16) #define bswap16 __bswap_16 #else -static __inline uint16_t 
bswap16(uint16_t v) { return v << 8 | v >> 8; } +static __always_inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } #endif #endif /* bswap16 */ /***************************************************************************/ -static __inline uint64_t fetch64_le(const void *v) { +static __always_inline uint64_t fetch64_le(const void *v) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return *(const uint64_t *)v; #else @@ -243,7 +245,7 @@ static __inline uint64_t fetch64_le(const void *v) { #endif } -static __inline uint32_t fetch32_le(const void *v) { +static __always_inline uint32_t fetch32_le(const void *v) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return *(const uint32_t *)v; #else @@ -251,7 +253,7 @@ static __inline uint32_t fetch32_le(const void *v) { #endif } -static __inline uint16_t fetch16_le(const void *v) { +static __always_inline uint16_t fetch16_le(const void *v) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return *(const uint16_t *)v; #else @@ -265,7 +267,7 @@ static __inline uint16_t fetch16_le(const void *v) { ((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) #endif /* can_fast_read */ -static __inline uint64_t tail64_le(const void *v, size_t tail) { +static __always_inline uint64_t tail64_le(const void *v, size_t tail) { const uint8_t *p = (const uint8_t *)v; #ifdef can_read_underside /* On some systems (e.g. x86) we can perform a 'oneshot' read, which @@ -344,7 +346,7 @@ static __inline uint64_t tail64_le(const void *v, size_t tail) { unreachable(); } -static maybe_unused __inline uint64_t fetch64_be(const void *v) { +static __maybe_unused __always_inline uint64_t fetch64_be(const void *v) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ return *(const uint64_t *)v; #else @@ -352,7 +354,7 @@ static maybe_unused __inline uint64_t fetch64_be(const void *v) { #endif } -static maybe_unused __inline uint32_t fetch32_be(const void *v) { +static __maybe_unused __always_inline uint32_t fetch32_be(const void *v) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ return *(const uint32_t *)v; #else @@ -360,7 +362,7 @@ static maybe_unused __inline uint32_t fetch32_be(const void *v) { #endif } -static maybe_unused __inline uint16_t fetch16_be(const void *v) { +static __maybe_unused __always_inline uint16_t fetch16_be(const void *v) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ return *(const uint16_t *)v; #else @@ -368,7 +370,8 @@ static maybe_unused __inline uint16_t fetch16_be(const void *v) { #endif } -static maybe_unused __inline uint64_t tail64_be(const void *v, size_t tail) { +static __maybe_unused __always_inline uint64_t tail64_be(const void *v, + size_t tail) { const uint8_t *p = (const uint8_t *)v; #ifdef can_read_underside /* On some systems we can perform a 'oneshot' read, which is little bit @@ -438,33 +441,34 @@ static maybe_unused __inline uint64_t tail64_be(const void *v, size_t tail) { /***************************************************************************/ #ifndef rot64 -static __inline uint64_t rot64(uint64_t v, unsigned s) { +static __always_inline uint64_t rot64(uint64_t v, unsigned s) { return (v >> s) | (v << (64 - s)); } #endif /* rot64 */ #ifndef mul_32x32_64 -static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { +static __always_inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { return a * (uint64_t)b; } #endif /* mul_32x32_64 */ #ifndef mul_64x64_128 -static maybe_unused __inline unsigned add_with_carry(uint64_t *sum, - uint64_t addend) { +static __maybe_unused __always_inline unsigned 
add_with_carry(uint64_t *sum, + uint64_t addend) { *sum += addend; return *sum < addend; } -static maybe_unused __inline uint64_t mul_64x64_128(uint64_t a, uint64_t b, - uint64_t *h) { +static __maybe_unused __always_inline uint64_t mul_64x64_128(uint64_t a, + uint64_t b, + uint64_t *h) { #if defined(__SIZEOF_INT128__) || \ (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) __uint128_t r = (__uint128_t)a * (__uint128_t)b; /* modern GCC could nicely optimize this */ - *h = r >> 64; - return r; + *h = (uint64_t)(r >> 64); + return (uint64_t)r; #elif defined(mul_64x64_high) *h = mul_64x64_high(a, b); return a * b; @@ -486,7 +490,8 @@ static maybe_unused __inline uint64_t mul_64x64_128(uint64_t a, uint64_t b, #endif /* mul_64x64_128() */ #ifndef mul_64x64_high -static maybe_unused __inline uint64_t mul_64x64_high(uint64_t a, uint64_t b) { +static __maybe_unused __always_inline uint64_t mul_64x64_high(uint64_t a, + uint64_t b) { uint64_t h; mul_64x64_128(a, b, &h); return h; @@ -496,28 +501,151 @@ static maybe_unused __inline uint64_t mul_64x64_high(uint64_t a, uint64_t b) { /***************************************************************************/ /* 'magic' primes */ -static const uint64_t p0 = UINT64_C(0xEC99BF0D8372CAAB); -static const uint64_t p1 = UINT64_C(0x82434FE90EDCEF39); -static const uint64_t p2 = UINT64_C(0xD4F06DB99D67BE4B); -static const uint64_t p3 = UINT64_C(0xBD9CACC22C6E9571); -static const uint64_t p4 = UINT64_C(0x9C06FAF4D023E3AB); -static const uint64_t p5 = UINT64_C(0xC060724A8424F345); -static const uint64_t p6 = UINT64_C(0xCB5AF53AE3AAAC31); - -/* rotations */ -static const unsigned s0 = 41; -static const unsigned s1 = 17; -static const unsigned s2 = 31; +static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB); +static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39); +static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B); +static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571); +static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB); +static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345); +static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31); /* xor high and low parts of full 128-bit product */ -static maybe_unused __inline uint64_t mux64(uint64_t v, uint64_t p) { +static __maybe_unused __always_inline uint64_t mux64(uint64_t v, + uint64_t prime) { uint64_t l, h; - l = mul_64x64_128(v, p, &h); + l = mul_64x64_128(v, prime, &h); return l ^ h; } -/* xor-mul-xor mixer */ -static maybe_unused __inline uint64_t mix64(uint64_t v, uint64_t p) { - v *= p; - return v ^ rot64(v, s0); +static __always_inline uint64_t final64(uint64_t a, uint64_t b) { + uint64_t x = (a + rot64(b, 41)) * prime_0; + uint64_t y = (rot64(a, 23) + b) * prime_6; + return mux64(x ^ y, prime_5); +} + +static __always_inline void mixup64(uint64_t *__restrict a, + uint64_t *__restrict b, uint64_t v, + uint64_t prime) { + uint64_t h; + *a ^= mul_64x64_128(*b + v, prime, &h); + *b += h; +} + +/***************************************************************************/ + +typedef union t1ha_uint128 { +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + __uint128_t v; +#endif + struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint64_t l, h; +#else + uint64_t h, l; +#endif + }; +} t1ha_uint128_t; + +static __always_inline t1ha_uint128_t not128(const t1ha_uint128_t v) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = ~v.v; 
+#else + r.l = ~v.l; + r.h = ~v.h; +#endif + return r; +} + +static __always_inline t1ha_uint128_t left128(const t1ha_uint128_t v, + unsigned s) { + t1ha_uint128_t r; + assert(s < 128); +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = v.v << s; +#else + r.l = (s < 64) ? v.l << s : 0; + r.h = (s < 64) ? (v.h << s) | (s ? v.l >> (64 - s) : 0) : v.l << (s - 64); +#endif + return r; +} + +static __always_inline t1ha_uint128_t right128(const t1ha_uint128_t v, + unsigned s) { + t1ha_uint128_t r; + assert(s < 128); +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = v.v >> s; +#else + r.l = (s < 64) ? (s ? v.h << (64 - s) : 0) | (v.l >> s) : v.h >> (s - 64); + r.h = (s < 64) ? v.h >> s : 0; +#endif + return r; +} + +static __always_inline t1ha_uint128_t or128(t1ha_uint128_t x, + t1ha_uint128_t y) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = x.v | y.v; +#else + r.l = x.l | y.l; + r.h = x.h | y.h; +#endif + return r; +} + +static __always_inline t1ha_uint128_t xor128(t1ha_uint128_t x, + t1ha_uint128_t y) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = x.v ^ y.v; +#else + r.l = x.l ^ y.l; + r.h = x.h ^ y.h; +#endif + return r; +} + +static __always_inline t1ha_uint128_t rot128(t1ha_uint128_t v, unsigned s) { + s &= 127; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + v.v = (v.v << (128 - s)) | (v.v >> s); + return v; +#else + return s ? or128(left128(v, 128 - s), right128(v, s)) : v; +#endif +} + +static __always_inline t1ha_uint128_t add128(t1ha_uint128_t x, + t1ha_uint128_t y) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = x.v + y.v; +#else + r.l = x.l + y.l; + r.h = (r.l < x.l) + x.h + y.h; +#endif + return r; +} + +static __always_inline t1ha_uint128_t mul128(t1ha_uint128_t x, + t1ha_uint128_t y) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = x.v * y.v; +#else + r.l = mul_64x64_128(x.l, y.l, &r.h); + r.h += x.l * y.h + y.l * x.h; +#endif + return r; } diff --git a/t1ha-dll.vcxproj b/t1ha-dll.vcxproj index 9bed918..174b156 100644 --- a/t1ha-dll.vcxproj +++ b/t1ha-dll.vcxproj @@ -5,6 +5,10 @@ Debug ARM + + Debug + ARM64 + Debug Win32 @@ -13,6 +17,10 @@ Release ARM + + Release + ARM64 + Release Win32 @@ -29,7 +37,7 @@ {E243D17B-648E-429B-ABEB-113B9175FB6C} ManagedCProj - 10.0.14393.0 + 8.1 @@ -50,10 +58,17 @@ true v140 + + DynamicLibrary + true + v140 + true + DynamicLibrary true v140 + true DynamicLibrary @@ -61,11 +76,19 @@ v140 true + + DynamicLibrary + false + v140 + true + true + DynamicLibrary false v140 true + true @@ -81,12 +104,18 @@ + + + + + + @@ -106,6 +135,10 @@ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ + + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ @@ -115,6 +148,11 @@ $(Platform)\$(Configuration)\$(ProjectName)\ false + + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + $(Platform)\$(Configuration)\$(ProjectName)\ + false + 
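For illustration only (not part of the patch): the portable #else branches in the src/t1ha_bits.h hunk above implement the 128-bit operations on top of two 64-bit halves whenever __uint128_t is unavailable, with mul_64x64_128() as the basic building block. The standalone sketch below shows the usual schoolbook splitting for a 64x64->128 multiplication and, where the compiler does provide __uint128_t, cross-checks it against the native product; the name mul_64x64_portable and the test values are hypothetical and are not taken from the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* split both operands into 32-bit halves and sum the partial products */
static void mul_64x64_portable(uint64_t a, uint64_t b, uint64_t *lo,
                               uint64_t *hi) {
  const uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
  const uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
  const uint64_t ll = a_lo * b_lo;
  const uint64_t lh = a_lo * b_hi;
  const uint64_t hl = a_hi * b_lo;
  const uint64_t hh = a_hi * b_hi;
  /* accumulate the two middle products together with the carry out of `ll` */
  const uint64_t mid = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
  *lo = (mid << 32) | (uint32_t)ll;
  *hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
}

int main(void) {
  uint64_t x = UINT64_C(0xDEADBEEFCAFEBABE);
  uint64_t y = UINT64_C(0x0123456789ABCDEF);
  for (int i = 0; i < 1000; ++i) {
    uint64_t lo, hi;
    mul_64x64_portable(x, y, &lo, &hi);
#ifdef __SIZEOF_INT128__
    const __uint128_t r = (__uint128_t)x * y;
    assert(lo == (uint64_t)r && hi == (uint64_t)(r >> 64));
#endif
    /* vary the inputs with simple LCG steps */
    x = x * UINT64_C(6364136223846793005) + 1;
    y = y * UINT64_C(1442695040888963407) + 1;
  }
  puts("portable 64x64->128 multiplication agrees with the native product");
  return 0;
}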
$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ @@ -177,6 +215,19 @@ MultiThreadedDebugDLL + + + false + false + true + false + t1ha_EXPORTS;WIN64;_DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) + false + Level4 + true + MultiThreadedDebugDLL + + false @@ -209,6 +260,26 @@ true + + + false + Full + true + Speed + true + All + true + false + t1ha_EXPORTS;WIN64;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) + AnySuitable + Level4 + MultiThreadedDLL + + + UseLinkTimeCodeGeneration + true + + false @@ -234,11 +305,15 @@ NoExtensions NoExtensions NotSet + NotSet NotSet + NotSet AdvancedVectorExtensions + NotSet AdvancedVectorExtensions + NotSet AdvancedVectorExtensions AdvancedVectorExtensions @@ -246,19 +321,33 @@ AdvancedVectorExtensions2 AdvancedVectorExtensions2 AdvancedVectorExtensions2 + NotSet AdvancedVectorExtensions2 + NotSet StreamingSIMDExtensions2 StreamingSIMDExtensions2 NotSet + NotSet NotSet + NotSet NoExtensions NoExtensions NotSet + NotSet + NotSet + NotSet + + + NoExtensions + NoExtensions + NotSet + NotSet NotSet + NotSet diff --git a/t1ha-static.vcxproj b/t1ha-static.vcxproj index a9e8083..826ee49 100644 --- a/t1ha-static.vcxproj +++ b/t1ha-static.vcxproj @@ -5,6 +5,10 @@ Debug ARM + + Debug + ARM64 + Debug Win32 @@ -13,6 +17,10 @@ Release ARM + + Release + ARM64 + Release Win32 @@ -29,38 +37,52 @@ {EF987F12-27EC-4300-8B20-63A2C7156AFC} Win32Proj - 10.0.14393.0 + 8.1 StaticLibrary true - v140 + v141 StaticLibrary false - v140 + v141 StaticLibrary true - v140 + v141 + + + StaticLibrary + true + v141 + true StaticLibrary true v140 + true StaticLibrary false v140 + + StaticLibrary + false + v140 + true + StaticLibrary false v140 + true @@ -76,12 +98,18 @@ + + + + + + @@ -101,6 +129,10 @@ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ + + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ @@ -109,6 +141,10 @@ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ + + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ @@ -159,6 +195,11 @@ Full + + + Full + + Full @@ -169,6 +210,11 @@ true + + + true + + true @@ -179,6 +225,11 @@ Speed + + + Speed + + Speed @@ -189,6 +240,11 @@ true + + + true + + true @@ -205,6 +261,17 @@ MultiThreadedDLL + + + true + WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions) + true + false + AnySuitable + Level4 + MultiThreadedDLL + + true @@ -228,6 +295,18 @@ MultiThreadedDebugDLL + + + WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions) + true + false + true + false + Level4 + true + MultiThreadedDebugDLL + + WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions) @@ -244,11 +323,15 @@ NoExtensions NoExtensions NotSet + NotSet NotSet + NotSet AdvancedVectorExtensions + NotSet AdvancedVectorExtensions + NotSet AdvancedVectorExtensions AdvancedVectorExtensions @@ -256,11 +339,15 @@ AdvancedVectorExtensions2 AdvancedVectorExtensions2 AdvancedVectorExtensions2 + NotSet AdvancedVectorExtensions2 + NotSet NotSet + NotSet NotSet + NotSet StreamingSIMDExtensions2 StreamingSIMDExtensions2 @@ -268,7 +355,17 @@ NoExtensions NoExtensions NotSet + NotSet + NotSet + NotSet + + + NoExtensions + NoExtensions 
+ NotSet + NotSet NotSet + NotSet diff --git a/t1ha-test.vcxproj b/t1ha-test.vcxproj index b0601a6..6159731 100644 --- a/t1ha-test.vcxproj +++ b/t1ha-test.vcxproj @@ -5,6 +5,10 @@ Debug ARM + + Debug + ARM64 + Debug Win32 @@ -13,6 +17,10 @@ Release ARM + + Release + ARM64 + Release Win32 @@ -29,7 +37,7 @@ {BEF30B4A-6826-4166-B11E-5BB8E6FB8682} Win32Proj - 10.0.14393.0 + 8.1 @@ -48,10 +56,17 @@ true v140 + + Application + true + v140 + true + Application true v140 + true Application @@ -59,11 +74,19 @@ v140 true + + Application + false + v140 + true + true + Application false v140 true + true @@ -79,12 +102,18 @@ + + + + + + @@ -104,6 +133,10 @@ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ + + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ @@ -113,6 +146,11 @@ $(Platform)\$(Configuration)\$(ProjectName)\ false + + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + $(Platform)\$(Configuration)\$(ProjectName)\ + false + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ @@ -175,6 +213,27 @@ MultiThreadedDLL + + + UseLinkTimeCodeGeneration + + + Full + + + true + + + Speed + + + true + WIN64;NDEBUG;_CONSOLE;T1HA_TESTING;%(PreprocessorDefinitions) + AnySuitable + Level4 + MultiThreadedDLL + + UseLinkTimeCodeGeneration @@ -206,6 +265,15 @@ MultiThreadedDebugDLL + + + WIN64;_DEBUG;_CONSOLE;T1HA_TESTING;%(PreprocessorDefinitions) + false + Level4 + true + MultiThreadedDebugDLL + + WIN64;_DEBUG;_CONSOLE;T1HA_TESTING;%(PreprocessorDefinitions) @@ -219,24 +287,37 @@ AdvancedVectorExtensions AdvancedVectorExtensions AdvancedVectorExtensions + NotSet AdvancedVectorExtensions + NotSet AdvancedVectorExtensions2 AdvancedVectorExtensions2 AdvancedVectorExtensions2 + NotSet AdvancedVectorExtensions2 + NotSet - - + + + + + + NoExtensions NoExtensions NotSet + NotSet NotSet + NotSet + + + diff --git a/t1ha.creator b/t1ha.creator new file mode 100644 index 0000000..e94cbbd --- /dev/null +++ b/t1ha.creator @@ -0,0 +1 @@ +[General] diff --git a/t1ha.files b/t1ha.files new file mode 100644 index 0000000..25e8726 --- /dev/null +++ b/t1ha.files @@ -0,0 +1,26 @@ +tests/xxhash/xxhash.h +tests/xxhash/xxhash.c +tests/common.h +tests/test.c +.travis.yml +LICENSE +Makefile +README.md +circle.yml +src/t1ha0.c +src/t1ha0_ia32aes_a.h +src/t1ha0_ia32aes_avx.c +src/t1ha0_ia32aes_avx2.c +src/t1ha0_ia32aes_b.h +src/t1ha0_ia32aes_noavx.c +src/t1ha1.c +src/t1ha2.c +src/t1ha_bits.h +t1ha.h +tests/4bench_t1ha0_ia32aes_avx.c +tests/4bench_t1ha0_ia32aes_avx2.c +tests/4bench_t1ha0_ia32aes_noavx.c +tests/bench.c +tests/main.c +tests/mera.h +tests/mera.c diff --git a/t1ha.h b/t1ha.h index 70edc66..bf4a5e8 100644 --- a/t1ha.h +++ b/t1ha.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com, + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, * Fast Positive Hash. * - * Portions Copyright (c) 2010-2017 Leonid Yuriev , + * Portions Copyright (c) 2010-2018 Leonid Yuriev , * The 1Hippeus project (t1h). 
* * This software is provided 'as-is', without any express or implied @@ -42,13 +42,15 @@ */ #pragma once -#include -#include #ifndef __has_attribute #define __has_attribute(x) (0) #endif +#ifndef __has_include +#define __has_include(x) (0) +#endif + #ifndef __GNUC_PREREQ #if defined(__GNUC__) && defined(__GNUC_MINOR__) #define __GNUC_PREREQ(maj, min) \ @@ -67,8 +69,111 @@ #endif #endif /* __CLANG_PREREQ */ +/*****************************************************************************/ + +#if defined(__cplusplus) && __cplusplus >= 201103L +#include +#include +#include +#else +#include +#include +#include +#endif + +/*****************************************************************************/ + +#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ + defined(i486) || defined(__i486) || defined(__i486__) || \ + defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ + defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ + defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ + defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ + defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) +#ifndef __ia32__ +/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +#define __ia32__ 1 +#endif /* __ia32__ */ +#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ + defined(__amd64) || defined(_M_X64)) +/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ +#define __amd64__ 1 +#endif /* __amd64__ */ +#endif /* all x86 */ + +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ + !defined(__ORDER_BIG_ENDIAN__) + +/* *INDENT-OFF* */ +/* clang-format off */ + +#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID__) || \ + __has_include() +#include +#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \ + __has_include() +#include +#elif __has_include() +#include +#elif __has_include() && __has_include() +#include +#include +#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ + defined(__NETBSD__) || defined(__NetBSD__) || \ + __has_include() +#include +#endif /* OS */ + +/* *INDENT-ON* */ +/* clang-format on */ + +#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) +#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN +#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN +#define __BYTE_ORDER__ __BYTE_ORDER +#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) +#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN +#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN +#define __BYTE_ORDER__ _BYTE_ORDER +#else +#define __ORDER_LITTLE_ENDIAN__ 1234 +#define __ORDER_BIG_ENDIAN__ 4321 + +#if defined(__LITTLE_ENDIAN__) || \ + (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \ + defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ + defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ + defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \ + defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \ + defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \ + defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \ + defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \ + defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ + +#elif defined(__BIG_ENDIAN__) || \ 
+ (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \ + defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \ + defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \ + defined(__m68k__) || defined(M68000) || defined(__hppa__) || \ + defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \ + defined(__sparc) || defined(__370__) || defined(__THW_370__) || \ + defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__) +#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ + +#else +#error __BYTE_ORDER__ should be defined. +#endif /* Arch */ + +#endif +#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ + +/*****************************************************************************/ + #ifndef __dll_export -#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) #if defined(__GNUC__) || __has_attribute(dllexport) #define __dll_export __attribute__((dllexport)) #elif defined(_MSC_VER) @@ -84,7 +189,7 @@ #endif /* __dll_export */ #ifndef __dll_import -#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) #if defined(__GNUC__) || __has_attribute(dllimport) #define __dll_import __attribute__((dllimport)) #elif defined(_MSC_VER) @@ -105,36 +210,122 @@ #define T1HA_API #endif /* T1HA_API */ +#if defined(_MSC_VER) && defined(__ia32__) +#define T1HA_ALIGN_PREFIX __declspec(align(32)) /* required only for SIMD */ +#else +#define T1HA_ALIGN_PREFIX +#endif /* _MSC_VER */ + +#if defined(__GNUC__) && defined(__ia32__) +#define T1HA_ALIGN_SUFFIX \ + __attribute__((aligned(32))) /* required only for SIMD */ +#else +#define T1HA_ALIGN_SUFFIX +#endif /* GCC x86 */ + #ifdef __cplusplus extern "C" { #endif -/* The main low-endian version. - * - runs faster on 64-bit low-endian platforms, - * in other cases may runs slowly. - * - returns same result on all architectures and CPUs, - * but it is differs from t1ha0(). */ +typedef union T1HA_ALIGN_PREFIX t1ha_state256 { + uint8_t bytes[32]; + uint32_t u32[8]; + uint64_t u64[4]; + struct { + uint64_t a, b, c, d; + } n; +} t1ha_state256_t T1HA_ALIGN_SUFFIX; + +typedef struct t1ha_context { + t1ha_state256_t state; + t1ha_state256_t buffer; + size_t partial; + uint64_t total; +} t1ha_context_t; + +/****************************************************************************** + * + * t1ha2 = 64 and 128-bit, SLIGHTLY MORE ATTENTION FOR QUALITY AND STRENGTH. + * + * - The recommended version of "Fast Positive Hash" with good quality + * for checksum, hash tables and fingerprinting. + * - Portable and extremely efficient on modern 64-bit CPUs. + * Designed for 64-bit little-endian platforms, + * in other cases will run slowly. + * - Great quality of hashing and still faster than other non-t1ha hashes. + * Provides streaming mode and 128-bit result. + * + * Note: Due to performance reasons the 64- and 128-bit results are completely + * different from each other, i.e. the 64-bit result is NOT any part of the 128-bit one. + */ + +/* The at-once variant with 64-bit result */ +T1HA_API uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed); + +/* The at-once variant with 128-bit result. + * Argument `extra_result` is NOT optional and MUST be valid. + * The high 64-bit part of the 128-bit hash will always be unconditionally + * stored to the address given by the `extra_result` argument.
*/ +T1HA_API uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, + const void *__restrict data, size_t length, + uint64_t seed); + +/* The init/update/final trinity for streaming. + * Returns a 64 or 128-bit result depending on the `extra_result` argument. */ +T1HA_API void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y); +T1HA_API void t1ha2_update(t1ha_context_t *__restrict ctx, + const void *__restrict data, size_t length); + +/* Argument `extra_result` is optional and MAY be NULL. + * - If `extra_result` is NOT NULL then the 128-bit hash will be calculated, + * and the high 64-bit part of it will be stored to the address given + * by the `extra_result` argument. + * - Otherwise the 64-bit hash will be calculated + * and returned from the function directly. + * + * Note: Due to performance reasons the 64- and 128-bit results are completely + * different from each other, i.e. the 64-bit result is NOT any part of the 128-bit one. */ +T1HA_API uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, + uint64_t *__restrict extra_result /* optional */); + +/****************************************************************************** + * + * t1ha1 = 64-bit, BASELINE FAST PORTABLE HASH: + * + * - Runs faster on 64-bit platforms, in other cases may run slowly. + * - Portable and stable, returns the same 64-bit result + * on all architectures and CPUs. + * - Unfortunately it fails the "strict avalanche criteria", + * see test results at https://github.com/demerphq/smhasher. + * + * This flaw is insignificant for the t1ha1() purposes and imperceptible + * from a practical point of view. + * However, nowadays this issue has been resolved in the subsequent t1ha2(), + * which was initially planned to provide a bit more quality. + */ + +/* The little-endian variant. */ T1HA_API uint64_t t1ha1_le(const void *data, size_t length, uint64_t seed); -/* The big-endian version. - * - runs faster on 64-bit big-endian platforms, - * in other cases may runs slowly. - * - returns same result on all architectures and CPUs, - * but it is differs from t1ha0(). */ +/* The big-endian variant. */ T1HA_API uint64_t t1ha1_be(const void *data, size_t length, uint64_t seed); -/* The nicname for generic version of "Fast Positive Hash". - * - returns same result on all architectures and CPUs. - * - created for 64-bit little-endian platforms, - * in other cases may runs slowly. */ +/* The historical nickname for the generic little-endian variant. */ static __inline uint64_t t1ha(const void *data, size_t length, uint64_t seed) { return t1ha1_le(data, length, seed); } -/* t1ha0() is a facade that selects most quick-and-dirty hash - * for the current processor. +/****************************************************************************** + * + * t1ha0 = 64-bit, JUST ONLY FASTER: * - * BE CAREFUL!!! This is means: + * - Provides fast-as-possible hashing for the current CPU, including + * 32-bit systems and engaging the available hardware acceleration. + * - It is a facade that selects the most quick-and-dirty hash + * for the current processor. For instance, on IA32 (x86) the actual function + * will be selected at runtime, depending on the current CPU capabilities. + * + * BE CAREFUL!!! THIS MEANS: * * 1. The quality of hash is a subject for tradeoffs with performance.
* So, the quality and strength of t1ha0() may be lower than t1ha1(), @@ -146,11 +337,34 @@ static __inline uint64_t t1ha(const void *data, size_t length, uint64_t seed) { * * Briefly, such hash-results and their derivatives, should be * used only in runtime, but should not be persist or transferred - * over a network. */ + * over a network. + */ + +/* The little-endian variant for 32-bit CPU. */ +uint64_t t1ha0_32le(const void *data, size_t length, uint64_t seed); +/* The big-endian variant for 32-bit CPU. */ +uint64_t t1ha0_32be(const void *data, size_t length, uint64_t seed); +#if defined(__e2k__) +#define T1HA0_AESNI_AVAILABLE +uint64_t t1ha0_ia32aes_noavx(const void *data, size_t length, uint64_t seed); +uint64_t t1ha0_ia32aes_avx(const void *data, size_t length, uint64_t seed); +#elif defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800) +#define T1HA0_AESNI_AVAILABLE +#define T1HA0_RUNTIME_SELECT +uint64_t t1ha0_ia32aes_noavx(const void *data, size_t length, uint64_t seed); +uint64_t t1ha0_ia32aes_avx(const void *data, size_t length, uint64_t seed); +uint64_t t1ha0_ia32aes_avx2(const void *data, size_t length, uint64_t seed); +#endif /* __ia32__ */ + +#ifdef T1HA0_RUNTIME_SELECT #ifdef __ELF__ +/* ifunc/gnu_indirect_function will be used on ELF. + * Please see https://en.wikipedia.org/wiki/Executable_and_Linkable_Format */ T1HA_API uint64_t t1ha0(const void *data, size_t length, uint64_t seed); #else +/* Otherwise function pointer will be used. + * Unfortunately this may cause some overhead calling. */ T1HA_API extern uint64_t (*t1ha0_funcptr)(const void *data, size_t length, uint64_t seed); static __inline uint64_t t1ha0(const void *data, size_t length, uint64_t seed) { @@ -158,16 +372,23 @@ static __inline uint64_t t1ha0(const void *data, size_t length, uint64_t seed) { } #endif /* __ELF__ */ -uint64_t t1ha0_32le(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_32be(const void *data, size_t length, uint64_t seed); - -#if defined(__x86_64__) || (defined(_M_IX86) && _MSC_VER > 1800) || \ - defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ - defined(_X86_64_) -uint64_t t1ha0_ia32aes_noavx(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_ia32aes_avx(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_ia32aes_avx2(const void *data, size_t length, uint64_t seed); -#endif /* __i386__ || __x86_64__ */ +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +static __inline uint64_t t1ha0(const void *data, size_t length, uint64_t seed) { +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul + return t1ha1_be(data, length, seed); +#else + return t1ha0_32be(data, length, seed); +#endif +} +#else +static __inline uint64_t t1ha0(const void *data, size_t length, uint64_t seed) { +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul + return t1ha1_le(data, length, seed); +#else + return t1ha0_32le(data, length, seed); +#endif +} +#endif /* !T1HA0_RUNTIME_SELECT */ #ifdef __cplusplus } diff --git a/t1ha.sln b/t1ha.sln index 92f17ee..018a46f 100644 --- a/t1ha.sln +++ b/t1ha.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 -VisualStudioVersion = 15.0.26228.10 +VisualStudioVersion = 15.0.27130.2024 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "t1ha-test", "t1ha-test.vcxproj", "{BEF30B4A-6826-4166-B11E-5BB8E6FB8682}" EndProject @@ -12,45 +12,43 @@ EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution 
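For reference, a minimal usage sketch (not part of the patch) for the t1ha2 entry points declared in the t1ha.h hunk above: the one-shot 64-bit and 128-bit variants plus the init/update/final streaming trinity. It assumes the patched t1ha.h is on the include path and libt1ha is linked; the payload and seed are arbitrary example values.

#include "t1ha.h"
#include <stdio.h>
#include <string.h>

int main(void) {
  const char payload[] = "an example payload";
  const size_t length = strlen(payload);
  const uint64_t seed = 42;

  /* one-shot, 64-bit result */
  const uint64_t h64 = t1ha2_atonce(payload, length, seed);

  /* one-shot, 128-bit result: the low half is returned,
   * the high half is stored through `extra_result` */
  uint64_t hi128;
  const uint64_t lo128 = t1ha2_atonce128(&hi128, payload, length, seed);

  /* streaming: feed the data in pieces, then finalize;
   * passing a non-NULL `extra_result` requests the 128-bit result */
  t1ha_context_t ctx;
  uint64_t stream_hi;
  t1ha2_init(&ctx, seed, 0);
  t1ha2_update(&ctx, payload, length / 2);
  t1ha2_update(&ctx, payload + length / 2, length - length / 2);
  const uint64_t stream_lo = t1ha2_final(&ctx, &stream_hi);

  printf("t1ha2_atonce    %016llx\n", (unsigned long long)h64);
  printf("t1ha2_atonce128 %016llx%016llx\n", (unsigned long long)hi128,
         (unsigned long long)lo128);
  printf("t1ha2_stream128 %016llx%016llx\n", (unsigned long long)stream_hi,
         (unsigned long long)stream_lo);
  return 0;
}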
Debug|ARM = Debug|ARM + Debug|ARM64 = Debug|ARM64 Debug|x64 = Debug|x64 Debug|x86 = Debug|x86 Release|ARM = Release|ARM + Release|ARM64 = Release|ARM64 Release|x64 = Release|x64 Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution - {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Debug|ARM.ActiveCfg = Debug|ARM - {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Debug|ARM.Build.0 = Debug|ARM {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Debug|x64.ActiveCfg = Debug|x64 {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Debug|x64.Build.0 = Debug|x64 {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Debug|x86.ActiveCfg = Debug|Win32 {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Debug|x86.Build.0 = Debug|Win32 - {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Release|ARM.ActiveCfg = Release|ARM - {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Release|ARM.Build.0 = Release|ARM {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Release|x64.ActiveCfg = Release|x64 {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Release|x64.Build.0 = Release|x64 {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Release|x86.ActiveCfg = Release|Win32 {BEF30B4A-6826-4166-B11E-5BB8E6FB8682}.Release|x86.Build.0 = Release|Win32 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Debug|ARM.ActiveCfg = Debug|ARM {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Debug|ARM.Build.0 = Debug|ARM + {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Debug|ARM64.Build.0 = Debug|ARM64 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Debug|x64.ActiveCfg = Debug|x64 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Debug|x64.Build.0 = Debug|x64 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Debug|x86.ActiveCfg = Debug|Win32 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Debug|x86.Build.0 = Debug|Win32 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Release|ARM.ActiveCfg = Release|ARM {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Release|ARM.Build.0 = Release|ARM + {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Release|ARM64.ActiveCfg = Release|ARM64 + {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Release|ARM64.Build.0 = Release|ARM64 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Release|x64.ActiveCfg = Release|x64 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Release|x64.Build.0 = Release|x64 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Release|x86.ActiveCfg = Release|Win32 {EF987F12-27EC-4300-8B20-63A2C7156AFC}.Release|x86.Build.0 = Release|Win32 - {E243D17B-648E-429B-ABEB-113B9175FB6C}.Debug|ARM.ActiveCfg = Debug|ARM - {E243D17B-648E-429B-ABEB-113B9175FB6C}.Debug|ARM.Build.0 = Debug|ARM {E243D17B-648E-429B-ABEB-113B9175FB6C}.Debug|x64.ActiveCfg = Debug|x64 {E243D17B-648E-429B-ABEB-113B9175FB6C}.Debug|x64.Build.0 = Debug|x64 {E243D17B-648E-429B-ABEB-113B9175FB6C}.Debug|x86.ActiveCfg = Debug|Win32 {E243D17B-648E-429B-ABEB-113B9175FB6C}.Debug|x86.Build.0 = Debug|Win32 - {E243D17B-648E-429B-ABEB-113B9175FB6C}.Release|ARM.ActiveCfg = Release|ARM - {E243D17B-648E-429B-ABEB-113B9175FB6C}.Release|ARM.Build.0 = Release|ARM {E243D17B-648E-429B-ABEB-113B9175FB6C}.Release|x64.ActiveCfg = Release|x64 {E243D17B-648E-429B-ABEB-113B9175FB6C}.Release|x64.Build.0 = Release|x64 {E243D17B-648E-429B-ABEB-113B9175FB6C}.Release|x86.ActiveCfg = Release|Win32 @@ -59,4 +57,7 @@ Global GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {C88E6722-10B0-4766-B350-D6A3C66B5943} + EndGlobalSection EndGlobal diff --git a/tests/bench.c b/tests/bench.c new file mode 100644 index 0000000..3028e77 --- /dev/null +++ b/tests/bench.c @@ -0,0 +1,122 @@ +/* + * 
Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev , + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#include "common.h" +#include +#include + +double bench_mats(void) { return mera_bench(NULL, NULL, 0, 0); } + +void bench(const char *caption, + uint64_t (*hash)(const void *, size_t, uint64_t), const void *data, + unsigned len, uint64_t seed) { + + printf("%-24s: ", caption); + fflush(NULL); + + double value = mera_bench(hash, data, len, seed); + printf("%10.3f %s/hash, %6.3f %s/byte, %6.3f byte/%s", value, mera.units, + value / len, mera.units, len / value, mera.units); + + if (mera.flags & timestamp_cycles) { + printf(", %6.3f Gb/s @3GHz", 3.0 * len / value); + } else if ((mera.flags & timestamp_ticks) == 0) { + printf(", %6.3f Gb/s", len / value); + } + printf(" %s\n", (mera.flags & timestamp_clock_stable) ? 
"" : "roughly"); + + if (is_option_set(bench_verbose)) { + printf(" - convergence: "); + if (mera_bci.retry_count) + printf("retries %u, ", mera_bci.retry_count); + printf("restarts %u, accounted-loops %u, worthless-loops %u, spent <%us\n", + mera_bci.restart_count, mera_bci.target_accounted_loops, + mera_bci.target_worthless_loops, mera_bci.spent_seconds); + printf(" - mats/overhead: best %" PRIu64 ", gate %" PRIu64 + ", inner-loops-max %u, best-count %u\n", + mera_bci.overhead_best, mera_bci.overhead_gate, + mera_bci.overhead_loops_max, mera_bci.overhead_best_count); + printf(" - hash: loops %u, best %" PRIu64 ", gate %" PRIu64 + ", tailloops-max %u, best-count %u\n\n", + mera_bci.target_loops, mera_bci.target_best, mera_bci.target_gate, + mera_bci.tail_loops_max, mera_bci.target_best_count); + } + fflush(NULL); +} + +static uint64_t thunk_XXH32(const void *input, size_t length, uint64_t seed) { + return XXH32(input, length, (uint32_t)seed); +} + +void bench_size(const unsigned size, const char *caption) { + printf("\nBench for %s keys (%u bytes):\n", caption, size); + const uint64_t seed = 42; + char *buffer = malloc(size); + for (unsigned i = 0; i < size; ++i) + buffer[i] = (char)(rand() + i); + + if (is_selected(bench_64 | bench_2)) { + bench("t1ha2_atonce", t1ha2_atonce, buffer, size, seed); + bench("t1ha2_atonce128*", thunk_t1ha2_atonce128, buffer, size, seed); + bench("t1ha2_stream*", thunk_t1ha2_stream, buffer, size, seed); + bench("t1ha2_stream128*", thunk_t1ha2_stream128, buffer, size, seed); + } + + if (is_selected(bench_64 | bench_le | bench_1)) + bench("t1ha1_64le", t1ha1_le, buffer, size, seed); + if (is_selected(bench_64 | bench_be | bench_1)) + bench("t1ha1_64be", t1ha1_be, buffer, size, seed); + if (is_selected(bench_0)) + bench("t1ha0", t1ha0, buffer, size, seed); + if (is_selected(bench_32 | bench_le | bench_0)) + bench("t1ha0_32le", t1ha0_32le, buffer, size, seed); + if (is_selected(bench_32 | bench_be | bench_0)) + bench("t1ha0_32be", t1ha0_32be, buffer, size, seed); + +#ifdef T1HA0_AESNI_AVAILABLE + if (is_selected(bench_aes)) { + bench("t1ha0_ia32aes_noavx_a", t1ha0_ia32aes_noavx_a, buffer, size, seed); + bench("t1ha0_ia32aes_noavx_b", t1ha0_ia32aes_noavx_b, buffer, size, seed); + bench("t1ha0_ia32aes_noavx", t1ha0_ia32aes_noavx, buffer, size, seed); + if (is_selected(bench_avx)) { + bench("t1ha0_ia32aes_avx_a", t1ha0_ia32aes_avx_a, buffer, size, seed); + bench("t1ha0_ia32aes_avx_b", t1ha0_ia32aes_avx_b, buffer, size, seed); + bench("t1ha0_ia32aes_avx", t1ha0_ia32aes_avx, buffer, size, seed); + } +#ifndef __e2k__ + if (is_selected(bench_avx2)) { + bench("t1ha0_ia32aes_avx2_a", t1ha0_ia32aes_avx2_a, buffer, size, seed); + bench("t1ha0_ia32aes_avx2_b", t1ha0_ia32aes_avx2_b, buffer, size, seed); + bench("t1ha0_ia32aes_avx2", t1ha0_ia32aes_avx2, buffer, size, seed); + } +#endif /* !__e2k__ */ + } +#endif /* T1HA0_AESNI_AVAILABLE */ + + if (is_selected(bench_xxhash)) { + bench("xxhash32", thunk_XXH32, buffer, size, seed); + bench("xxhash64", XXH64, buffer, size, (uint32_t)seed); + } + free(buffer); +} diff --git a/tests/common.h b/tests/common.h new file mode 100644 index 0000000..6e655ca --- /dev/null +++ b/tests/common.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev , + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#pragma once + +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable : 4127) /* conditional expression is constant */ +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#endif /* MSVC */ + +#include "../t1ha.h" /* for T1HA0_AESNI_AVAILABLE, __ia32__, etc */ +#include "mera.h" /* for ia32_cpu_features */ + +enum test_flags { + test_verbose = 1u << 0, + test_quiet = 1u << 1, + hash_stdin_strings = 1u << 2, + bench_verbose = 1u << 3, + bench_xxhash = 1u << 4, + /* 5, 6, 7 */ + + bench_0 = 1u << 8, + bench_1 = 1u << 9, + bench_2 = 1u << 10, + bench_3 = 1u << 11, + bench_4 = 1u << 12, + bench_5 = 1u << 13, + bench_6 = 1u << 14, + bench_7 = 1u << 15, + + bench_tiny = 1u << 16, + bench_small = 1u << 17, + bench_medium = 1u << 18, + bench_large = 1u << 19, + bench_huge = 1u << 20, + /* 21, 22, 23 */ + bench_size_flags = + bench_tiny | bench_small | bench_medium | bench_large | bench_huge, + + bench_32 = 1u << 24, + bench_64 = 1u << 25, + bench_le = 1u << 26, + bench_be = 1u << 27, +#ifdef T1HA0_AESNI_AVAILABLE + bench_aes = 1u << 28, + bench_avx = 1u << 29, +#ifndef __e2k__ + bench_avx2 = 1u << 30, +#endif /* !__e2k__ */ + user_wanna_aes = 1u << 31, +#endif /* T1HA0_AESNI_AVAILABLE */ + + bench_funcs_flags = bench_0 | bench_1 | bench_2 | bench_3 | bench_4 | + bench_5 | bench_6 | bench_7 | bench_32 | bench_64 | + bench_le | bench_be | 1u << 28 | 1u << 29 | 1u << 30 | + 1u << 31 | bench_xxhash +}; + +extern unsigned option_flags, disabled_option_flags; + +static __inline bool is_option_set(unsigned mask) { + return (option_flags & mask) != 0; +} + +static __inline bool is_selected(unsigned mask) { + return is_option_set(mask) && (disabled_option_flags & mask) == 0; +} + +extern const uint64_t refval_2atonce[]; +extern const uint64_t refval_2atonce128[]; +extern const uint64_t refval_2stream[]; +extern const uint64_t refval_2stream128[]; +extern const uint64_t refval_64le[]; +extern const uint64_t refval_64be[]; +extern const uint64_t refval_32le[]; +extern const uint64_t refval_32be[]; + +#ifdef T1HA0_AESNI_AVAILABLE +uint64_t t1ha0_ia32aes_noavx_a(const void *data, size_t length, uint64_t seed); +uint64_t t1ha0_ia32aes_avx_a(const void *data, size_t length, uint64_t seed); +#ifndef __e2k__ +uint64_t t1ha0_ia32aes_avx2_a(const void *data, size_t length, uint64_t seed); +#endif /* !__e2k__ */ +extern const uint64_t refval_ia32aes_a[]; + +uint64_t t1ha0_ia32aes_noavx_b(const void *data, size_t length, uint64_t seed); +uint64_t t1ha0_ia32aes_avx_b(const void *data, size_t length, uint64_t seed); +#ifndef __e2k__ +uint64_t t1ha0_ia32aes_avx2_b(const void *data, size_t length, uint64_t seed); +#endif /* !__e2k__ */ +extern const uint64_t 
refval_ia32aes_b[]; +#endif /* T1HA0_AESNI_AVAILABLE */ + +bool verify(const char *title, uint64_t (*hash)(const void *, size_t, uint64_t), + const uint64_t *reference_values); + +uint64_t thunk_t1ha2_atonce128(const void *data, size_t len, uint64_t seed); +uint64_t thunk_t1ha2_stream(const void *data, size_t len, uint64_t seed); +uint64_t thunk_t1ha2_stream128(const void *data, size_t len, uint64_t seed); + +double bench_mats(void); +void bench(const char *caption, + uint64_t (*hash)(const void *, size_t, uint64_t), const void *data, + unsigned len, uint64_t seed); + +void bench_size(const unsigned size, const char *caption); + +/*****************************************************************************/ +/* xxHash - just for comparison */ +uint64_t XXH64(const void *input, size_t length, uint64_t seed); +uint32_t XXH32(const void *input, size_t length, uint32_t seed); diff --git a/tests/main.c b/tests/main.c index 4749dec..de2f77f 100644 --- a/tests/main.c +++ b/tests/main.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com, + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, * Fast Positive Hash. * - * Portions Copyright (c) 2010-2017 Leonid Yuriev , + * Portions Copyright (c) 2010-2018 Leonid Yuriev , * The 1Hippeus project (t1h). * * This software is provided 'as-is', without any express or implied @@ -22,438 +22,378 @@ * 3. This notice may not be removed or altered from any source distribution. */ -#include "../t1ha.h" - -#include +#include "common.h" #include #include +#include + +const unsigned default_option_flags = + bench_0 | bench_1 | bench_2 | bench_xxhash | bench_tiny | bench_medium; + +const unsigned available_eas_flags = +#ifdef T1HA0_AESNI_AVAILABLE + bench_aes | bench_avx | user_wanna_aes | +#ifndef __e2k__ + bench_avx2 | +#endif /* !__e2k__ */ +#endif /* T1HA0_AESNI_AVAILABLE */ + 0u; + +const unsigned default_disabled_option_flags = +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + bench_le +#else + bench_be +#endif /* BIG_ENDIAN */ + | ((UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) ? bench_32 : 0); + +unsigned option_flags, disabled_option_flags; + +void usage(void) { + printf( + "By default - run reasonable tests and benchmarks\n" + " for the current platform\n" + "Generic options:\n" + " --test-quiet - quietly perform tests and exit with status\n" + " --hash-stdin-strings - output hash for each line from stdin\n" + " --test-only, --no-bench - perform tests, but don't benchmark\n" + " --test-verbose - be verbose while testing\n" + " --bench-verbose - be verbose while benchmarking\n" + " --verbose - turn on both --test-verbose\n" + " and --bench-verbose\n" + "Key size choices:\n" + " --tiny, --no-tiny - include/exclude 5 bytes, i.e. tiny keys\n" + " --small, --no-small - include/exclude 31 bytes, i.e. small keys\n" + " --medium, --no-medium - include/exclude 1K, i.e. medium keys\n" + " --large, --no-large - include/exclude 16K, i.e. large keys\n" + " --huge, --no-huge - include/exclude 256K, i.e. huge keys\n" + " --all-sizes - run benchmark for all sizes of keys\n" + "\n" + "Function choices:\n" + " --all-funcs - run benchmark for all functions\n" + " --0, --no-0 - include/exclude t1ha0\n" + " --1, --no-1 - include/exclude t1ha1\n" + " --2, --no-2 - include/exclude t1ha2\n" + " --32, --no-32 - include/exclude 32-bit targets,\n" + " i.e. t1ha0_32le(), t1ha0_32be()...\n" + " --64, --no-64 - include/exclude 64-bit targets,\n" + " i.e.
t1ha1_64le(), t1ha1_64be()...\n" + " --le, --no-le - include/exclude little-endian targets,\n" + " i.e. t1ha0_32le(), t1ha2...\n" + " --be, --no-be - include/exclude big-endian targets,\n" + " i.e. t1ha0_32be(), t1ha1_64be()...\n" +#ifdef T1HA0_AESNI_AVAILABLE + " --aes, --no-aes - include/exclude AES-NI accelerated,\n" + " i.e. t1ha0_ia32aes_avx(), etc...\n" +#endif /* T1HA0_AESNI_AVAILABLE */ + " --xxhash, --no-xxhash - include/exclude xxhash32() and xxhash64()," + " just for comparison.\n"); +} -#if defined(_MSC_VER) -#pragma warning(disable : 4711) /* function 'xyz' selected for \ - automatic inline expansion */ -#pragma warning(disable : 4127) /* conditional expression is constant */ -#if _MSC_VER < 1900 -#define snprintf _snprintf -#pragma warning(disable : 4996) /* '_snprintf': This function or variable \ - may be unsafe */ -#endif -#endif /* MSVC */ - -/* *INDENT-OFF* */ -/* clang-format off */ -static const uint8_t pattern[64] = { - 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0x7F, 0x3F, 0x1F, 0xF, 8, 16, 32, 64, 0x80, 0xFE, - 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x55, 0xAA, - 11, 17, 19, 23, 29, 37, 42, 43, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', - 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x' -}; -/* *INDENT-ON* */ -/* clang-format on */ - -int verbose; - -int probe(uint64_t (*hash)(const void *, size_t, uint64_t), const char *caption, - const uint64_t check, const void *data, unsigned len, uint64_t seed) { - uint64_t value = hash(data, len, seed); - if (verbose || value != check) - printf("Pattern '%s', reference value %08X%08X: ", caption, - (uint32_t)(check >> 32), (uint32_t)check); - if (check == value) { - if (verbose) - printf("Passed\n"); - return 0; +static bool option(const char *arg, const char *opt, unsigned flag) { + if (strncmp(arg, "--", 2) == 0 && strcmp(arg + 2, opt) == 0) { + option_flags |= flag; + return true; + } + if (strncmp(arg, "--no-", 5) == 0 && strcmp(arg + 5, opt) == 0) { + disabled_option_flags |= flag; + return true; + } - printf("Failed! Got %08X%08X\n", (uint32_t)(value >> 32), (uint32_t)value); - return -1; + return false; } -int test(const char *title, uint64_t (*hash)(const void *, size_t, uint64_t), - const uint64_t *reference_values) { - printf("Testing %s...%s", title, verbose ?
"\n" : ""); - - const uint64_t zero = 0; - int failed = 0; - failed |= probe(hash, "empty-zero", 0, NULL, 0, zero); - failed |= probe(hash, "empty-all1", *reference_values++, NULL, 0, ~zero); - failed |= probe(hash, "bin64-zero", *reference_values++, pattern, 64, zero); +int main(int argc, const char *argv[]) { + if (argc > 1) { + for (int i = 1; i < argc; ++i) { + if (strcmp("--test-quiet", argv[i]) == 0) { + option_flags = test_quiet; + continue; + } + if (strcmp("--hash-stdin-strings", argv[i]) == 0) { + option_flags = (option_flags & bench_funcs_flags) | test_quiet | + hash_stdin_strings; + continue; + } + if (strcmp("--test-only", argv[i]) == 0 || + strcmp("--no-bench", argv[i]) == 0) { + option_flags &= test_verbose; + continue; + } + if (strcmp("--test-verbose", argv[i]) == 0) { + option_flags |= test_verbose; + continue; + } + if (strcmp("--bench-verbose", argv[i]) == 0) { + option_flags |= bench_verbose; + continue; + } + if (strcmp("--verbose", argv[i]) == 0 || strcmp("-v", argv[i]) == 0) { + option_flags |= bench_verbose | test_verbose; + continue; + } + if (strcmp("--bench-all", argv[i]) == 0) { + option_flags |= ~(test_verbose | bench_verbose); + disabled_option_flags = 0; + continue; + } + if (strcmp("--all-funcs", argv[i]) == 0) { + option_flags |= bench_funcs_flags; + disabled_option_flags &= ~bench_funcs_flags; + continue; + } + if (strcmp("--all-sizes", argv[i]) == 0) { + option_flags |= bench_size_flags; + disabled_option_flags &= ~bench_size_flags; + continue; + } + if (strcmp("--aes", argv[i]) == 0) { + if (available_eas_flags) { + option_flags |= available_eas_flags; + continue; + } + fprintf(stderr, "%s: AES-NI not available for '%s', bailout\n", argv[0], + argv[i]); + return EXIT_FAILURE; + } + if (strcmp("--no-aes", argv[i]) == 0) { + if (available_eas_flags) { + disabled_option_flags |= available_eas_flags; + } else { + fprintf(stderr, "%s: AES-NI not available for '%s', ignore\n", + argv[0], argv[i]); + } + continue; + } - char caption[32]; - uint64_t seed = 1; - for (int i = 1; i < 64; i++) { - snprintf(caption, sizeof(caption), "bin%02i-1p%02u", i, i & 63); - failed |= probe(hash, caption, *reference_values++, pattern, i, seed); - seed <<= 1; + if (option(argv[i], "xxhash", bench_xxhash)) + continue; + + if (option(argv[i], "0", bench_0)) + continue; + if (option(argv[i], "1", bench_1)) + continue; + if (option(argv[i], "2", bench_2)) + continue; + if (option(argv[i], "le", bench_le)) + continue; + if (option(argv[i], "be", bench_be)) + continue; + if (option(argv[i], "32", bench_32)) + continue; + if (option(argv[i], "64", bench_64)) + continue; + if (option(argv[i], "tiny", bench_tiny)) + continue; + if (option(argv[i], "small", bench_small)) + continue; + if (option(argv[i], "medium", bench_medium)) + continue; + if (option(argv[i], "large", bench_large)) + continue; + if (option(argv[i], "huge", bench_huge)) + continue; + + if (strcmp("--help", argv[i]) == 0 || strcmp("-h", argv[i]) == 0) { + usage(); + return EXIT_SUCCESS; + } else { + fprintf(stderr, "%s: unknown option '%s'\n\n", argv[0], argv[i]); + usage(); + return EXIT_FAILURE; + } + } + if ((option_flags & bench_funcs_flags) == 0) + option_flags |= default_option_flags & bench_funcs_flags; + if ((option_flags & bench_size_flags) == 0) + option_flags |= default_option_flags & bench_size_flags; + } else { + option_flags = default_option_flags; + disabled_option_flags = default_disabled_option_flags; } - seed = ~zero; - for (int i = 1; i <= 7; i++) { - seed <<= 1; - snprintf(caption, sizeof(caption), 
"align%i_F%u", i, 64 - i); + /*************************************************************************/ + + bool failed = false; + /* Nowadays t1ha2 not frozen */ + failed |= verify("t1ha2_atonce", t1ha2_atonce, refval_2atonce); + failed |= verify("t1ha2_atonce128", thunk_t1ha2_atonce128, refval_2atonce128); + failed |= verify("t1ha2_stream", thunk_t1ha2_stream, refval_2stream); + failed |= verify("t1ha2_stream128", thunk_t1ha2_stream128, refval_2stream128); + + /* Stable t1ha1 and t1ha0 */ + failed |= verify("t1ha1_64le", t1ha1_le, refval_64le); + failed |= verify("t1ha1_64be", t1ha1_be, refval_64be); + failed |= verify("t1ha0_32le", t1ha0_32le, refval_32le); + failed |= verify("t1ha0_32be", t1ha0_32be, refval_32be); + +#ifdef __e2k__ + failed |= + verify("t1ha0_ia32aes_noavx", t1ha0_ia32aes_noavx, refval_ia32aes_a); + failed |= verify("t1ha0_ia32aes_avx", t1ha0_ia32aes_avx, refval_ia32aes_a); +#elif defined(T1HA0_AESNI_AVAILABLE) + ia32_fetch_cpu_features(); + if (ia32_cpu_features.basic.ecx & UINT32_C(0x02000000)) { failed |= - probe(hash, caption, *reference_values++, pattern + i, 64 - i, seed); - } - - uint8_t pattern_long[512]; - for (size_t i = 0; i < sizeof(pattern_long); ++i) - pattern_long[i] = (uint8_t)i; - for (int i = 0; i <= 7; i++) { - snprintf(caption, sizeof(caption), "long-%05u", 128 + i * 17); - failed |= probe(hash, caption, *reference_values++, pattern_long + i, - 128 + i * 17, seed); + verify("t1ha0_ia32aes_noavx", t1ha0_ia32aes_noavx, refval_ia32aes_a); + if ((ia32_cpu_features.basic.ecx & UINT32_C(0x1A000000)) == + UINT32_C(0x1A000000)) { + failed |= + verify("t1ha0_ia32aes_avx", t1ha0_ia32aes_avx, refval_ia32aes_a); + if (ia32_cpu_features.extended_7.ebx & 32) + failed |= + verify("t1ha0_ia32aes_avx2", t1ha0_ia32aes_avx2, refval_ia32aes_b); + } + } else { + if (option_flags & user_wanna_aes) + printf(" - AES-NI not available on the current CPU\n"); + option_flags &= ~bench_aes; } + if ((ia32_cpu_features.basic.ecx & UINT32_C(0x1A000000)) != + UINT32_C(0x1A000000)) + option_flags &= ~bench_avx; + if ((ia32_cpu_features.extended_7.ebx & 32) == 0) + option_flags &= ~bench_avx2; +#endif /* T1HA0_AESNI_AVAILABLE */ + + if (failed) + return EXIT_FAILURE; + + /*************************************************************************/ + + if (is_option_set(hash_stdin_strings)) { + uint64_t (*hash_function)(const void *data, size_t length, uint64_t seed) = + NULL; + const char *hash_name = NULL; + if (is_selected(bench_64 | bench_2)) { + hash_function = t1ha2_atonce; + hash_name = "t1ha2_atonce"; + } else if (is_selected(bench_64 | bench_le | bench_1)) { + hash_function = t1ha1_le; + hash_name = "t1ha1_le"; + } else if (is_selected(bench_64 | bench_be | bench_1)) { + hash_function = t1ha1_be; + hash_name = "t1ha1_be"; + } else if (is_selected(bench_32 | bench_le | bench_0)) { + hash_function = t1ha0_32le; + hash_name = "t1ha0_32le"; + } else if (is_selected(bench_32 | bench_be | bench_0)) { + hash_function = t1ha0_32be; + hash_name = "t1ha0_32be"; + } else if (is_selected(bench_0)) { + hash_function = t1ha0; + hash_name = "t1ha0"; + } else if (is_selected(bench_xxhash)) { + hash_function = XXH64; + hash_name = "xxhash64"; + } else { + hash_function = t1ha; + hash_name = "t1ha-default"; + } - printf(" %s\n", (!verbose && !failed) ? 
"Ok" : ""); - return failed; -} - -/* *INDENT-OFF* */ -/* clang-format off */ -static const uint64_t refval_64le[80] = { - 0x6A580668D6048674, 0xA2FE904AFF0D0879, 0xE3AB9C06FAF4D023, 0x6AF1C60874C95442, - 0xB3557E561A6C5D82, 0x0AE73C696F3D37C0, 0x5EF25F7062324941, 0x9B784F3B4CE6AF33, - 0x6993BB206A74F070, 0xF1E95DF109076C4C, 0x4E1EB70C58E48540, 0x5FDD7649D8EC44E4, - 0x559122C706343421, 0x380133D58665E93D, 0x9CE74296C8C55AE4, 0x3556F9A5757AB6D0, - 0xF62751F7F25C469E, 0x851EEC67F6516D94, 0xED463EE3848A8695, 0xDC8791FEFF8ED3AC, - 0x2569C744E1A282CF, 0xF90EB7C1D70A80B9, 0x68DFA6A1B8050A4C, 0x94CCA5E8210D2134, - 0xF5CC0BEABC259F52, 0x40DBC1F51618FDA7, 0x0807945BF0FB52C6, 0xE5EF7E09DE70848D, - 0x63E1DF35FEBE994A, 0x2025E73769720D5A, 0xAD6120B2B8A152E1, 0x2A71D9F13959F2B7, - 0x8A20849A27C32548, 0x0BCBC9FE3B57884E, 0x0E028D255667AEAD, 0xBE66DAD3043AB694, - 0xB00E4C1238F9E2D4, 0x5C54BDE5AE280E82, 0x0E22B86754BC3BC4, 0x016707EBF858B84D, - 0x990015FBC9E095EE, 0x8B9AF0A3E71F042F, 0x6AA56E88BD380564, 0xAACE57113E681A0F, - 0x19F81514AFA9A22D, 0x80DABA3D62BEAC79, 0x715210412CABBF46, 0xD8FA0B9E9D6AA93F, - 0x6C2FC5A4109FD3A2, 0x5B3E60EEB51DDCD8, 0x0A7C717017756FE7, 0xA73773805CA31934, - 0x4DBD6BB7A31E85FD, 0x24F619D3D5BC2DB4, 0x3E4AF35A1678D636, 0x84A1A8DF8D609239, - 0x359C862CD3BE4FCD, 0xCF3A39F5C27DC125, 0xC0FF62F8FD5F4C77, 0x5E9F2493DDAA166C, - 0x17424152BE1CA266, 0xA78AFA5AB4BBE0CD, 0x7BFB2E2CEF118346, 0x647C3E0FF3E3D241, - 0x0352E4055C13242E, 0x6F42FC70EB660E38, 0x0BEBAD4FABF523BA, 0x9269F4214414D61D, - 0x1CA8760277E6006C, 0x7BAD25A859D87B5D, 0xAD645ADCF7414F1D, 0xB07F517E88D7AFB3, - 0xB321C06FB5FFAB5C, 0xD50F162A1EFDD844, 0x1DFD3D1924FBE319, 0xDFAEAB2F09EF7E78, - 0xA7603B5AF07A0B1E, 0x41CD044C0E5A4EE3, 0xF64D2F86E813BF33, 0xFF9FDB99305EB06A -}; - -static const uint64_t refval_64be[80] = { - 0x6A580668D6048674, 0xDECC975A0E3B8177, 0xE3AB9C06FAF4D023, 0xE401FA8F1B6AF969, - 0x67DB1DAE56FB94E3, 0x1106266A09B7A073, 0x550339B1EF2C7BBB, 0x290A2BAF590045BB, - 0xA182C1258C09F54A, 0x137D53C34BE7143A, 0xF6D2B69C6F42BEDC, 0x39643EAF2CA2E4B4, - 0x22A81F139A2C9559, 0x5B3D6AEF0AF33807, 0x56E3F80A68643C08, 0x9E423BE502378780, - 0xCDB0986F9A5B2FD5, 0xD5B3C84E7933293F, 0xE5FB8C90399E9742, 0x5D393C1F77B2CF3D, - 0xC8C82F5B2FF09266, 0xACA0230CA6F7B593, 0xCB5805E2960D1655, 0x7E2AD5B704D77C95, - 0xC5E903CDB8B9EB5D, 0x4CC7D0D21CC03511, 0x8385DF382CFB3E93, 0xF17699D0564D348A, - 0xF77EE7F8274A4C8D, 0xB9D8CEE48903BABE, 0xFE0EBD2A82B9CFE9, 0xB49FB6397270F565, - 0x173735C8C342108E, 0xA37C7FBBEEC0A2EA, 0xC13F66F462BB0B6E, 0x0C04F3C2B551467E, - 0x76A9CB156810C96E, 0x2038850919B0B151, 0xCEA19F2B6EED647B, 0x6746656D2FA109A4, - 0xF05137F221007F37, 0x892FA9E13A3B4948, 0x4D57B70D37548A32, 0x1A7CFB3D566580E6, - 0x7CB30272A45E3FAC, 0x137CCFFD9D51423F, 0xB87D96F3B82DF266, 0x33349AEE7472ED37, - 0x5CC0D3C99555BC07, 0x4A8F4FA196D964EF, 0xE82A0D64F281FBFA, 0x38A1BAC2C36823E1, - 0x77D197C239FD737E, 0xFB07746B4E07DF26, 0xC8A2198E967672BD, 0x5F1A146D143FA05A, - 0x26B877A1201AB7AC, 0x74E5B145214723F8, 0xE9CE10E3C70254BC, 0x299393A0C05B79E8, - 0xFD2D2B9822A5E7E2, 0x85424FEA50C8E50A, 0xE6839E714B1FFFE5, 0x27971CCB46F9112A, - 0xC98695A2E0715AA9, 0x338E1CBB4F858226, 0xFC6B5C5CF7A8D806, 0x8973CAADDE8DA50C, - 0x9C6D47AE32EBAE72, 0x1EBF1F9F21D26D78, 0x80A9704B8E153859, 0x6AFD20A939F141FB, - 0xC35F6C2B3B553EEF, 0x59529E8B0DC94C1A, 0x1569DF036EBC4FA1, 0xDA32B88593C118F9, - 0xF01E4155FF5A5660, 0x765A2522DCE2B185, 0xCEE95554128073EF, 0x60F072A5CA51DE2F -}; - -static const uint64_t refval_32le[80] = { - 0x7C8D3555003E469A, 0x3681F9C3F1127CC8, 
0xDBB713D2028227C2, 0x78771E7D21E489DA, - 0x8D659791EF3374FE, 0xCE9E6B054AB1C4A5, 0x846D50F82D595D82, 0x3639538046797CAA, - 0xB37E122F7392DE0A, 0xEE257CB10C794844, 0xF18B3919E8453962, 0x784AE8942A3E9904, - 0x2F80DD72243E2A0D, 0x1BD8419D553B6BED, 0x5ED2C2CFCE6B4E66, 0x979F14108B53422C, - 0x962DA10D015440AF, 0xB4AD7CEAFDFCBD6E, 0x226326258DD37B81, 0x565B0201832935A5, - 0x68373C98B9575D69, 0x29D4922ADD046615, 0xCD07D2669E26D2E8, 0x06FA9DCDC4828761, - 0x0BE3138F25EC4F45, 0x7A69F05F71894D63, 0xEF1F662FDBF2783D, 0x98C17BE571F52A51, - 0xD0500DD17A0366B5, 0x35AB2ABB09EEE627, 0xE0816D30DEC7987C, 0x9818488B7BC7B41C, - 0x8E7065C5518524DD, 0x20C65F2C8CBC9B3E, 0x7D08B202F425C39E, 0x60DC18CD911CAFC7, - 0x84CB42A883D23167, 0x6BFF2CF8AB705839, 0x41B644EF1101DE4B, 0x7A6944C48F818F25, - 0x7AA67961B1E8FF2C, 0x5BCA8BF67B3D2A11, 0x7F66C0B16E4A160B, 0x35DA1BEC148712A3, - 0x537715EDF8A0622C, 0xC34B43559C5D5440, 0x37D76AC5F07242C7, 0xBA4CB32425DD6BEB, - 0xBEA8FE3B935B8458, 0x88949A6B717DBD3F, 0x4B72D4A47CDC9341, 0xD792D3A694B1B0FE, - 0x186EF1351E6A0750, 0x81F4CDC9D6BB1DAC, 0x6AA7EDC1C2AE2E2C, 0x9CAAB63533410035, - 0x3014C6BF94AC4C77, 0x2CFCCBC761FA75F3, 0xF84BEF163C40D24F, 0x23BBDFAF810055DB, - 0xB936C93055260C8A, 0x5EF24667ECB9775F, 0x0CEC06141BE37147, 0x18FECAB3CB1F7DEA, - 0x1209B660972B0A88, 0xD19351CFD7E1A47C, 0xFF3BF60513833757, 0xDFE09FDAD9B2F85A, - 0x211A4745E3A2AF4B, 0xE3A33A114BE38F28, 0x5BCBB517074EED3A, 0xCC93F5820563E184, - 0xFEB29183724FC3A8, 0xCD99FE922F479963, 0xA38994893FF9CBBE, 0x60F593A497767EC0, - 0xF15203894864B213, 0x4DDB3C121175DF69, 0xFC102F9EDAA30ACE, 0x94E3531CBC1DDC97 -}; - -static const uint64_t refval_32be[80] = { - 0x7C8D3555003E469A, 0xB67182BCAA37BD35, 0xDBB713D2028227C2, 0x29E8C60B04158480, - 0xC8301E0ABB6CA72A, 0x61A789243B057150, 0x7561E8B59EFDCCD3, 0x7CE51F527B4700BE, - 0xCB262ABA944284F3, 0xB2445D0304B96987, 0xBE3A0261E1346214, 0x84326AE0563FA723, - 0x7104EDC3683BE307, 0x5F6A6A51B826861E, 0x5C083F08DAD26389, 0x610AA7EC1E5629BF, - 0x5BC88B64C74823DA, 0x722C0E061B6ADF8D, 0xDEB26B204D5AF889, 0x01D35CA90DFCFC61, - 0xC4F667388834FD3A, 0xE1529168302D0DE7, 0x019D6BCD77C4D807, 0x1BDDADE9D492EFE0, - 0x993F06BA69041D9C, 0x4416CB009DCFB2A0, 0x9FC987E7DBE79F80, 0x3A76B9F2DC24376A, - 0x2C6DEB49516E30A1, 0x2205AD9041F8D9F3, 0x0E7058CA06F227F4, 0x0A6EB0AF8CE58789, - 0x7B72205F87E9ADD1, 0x870EA29548B10850, 0x8A815A513926CC37, 0x898374B5CDC36F49, - 0xBA24138146806BB2, 0x4FBC2261B5F71556, 0x769E1CCADF547147, 0x583DA9C726E5CD8E, - 0xE09BA92D16DA99B8, 0x9B5CC797FA7B7C1D, 0x3D79273B2D39668E, 0x05909A21D5C58AD1, - 0x9BB4DEDD3976D0FB, 0x755230444108C09E, 0xC75EFBE69A37494B, 0x4DA948AE8C0BC5E3, - 0x96F9A10FD5E355DA, 0x488A07BE48A68924, 0x93D65FA824F6D10A, 0x7D2C2CA3FD16143D, - 0xE9ACF05F50B3B631, 0x7F97964287F55F15, 0xD73EE29D102CD84E, 0x8F9F79D13C6475B1, - 0x34AA97BB089DAA38, 0xA3ECA0BC09D5708B, 0x2F3DF1A9F059E0D3, 0x18DC64B7CEB1CD14, - 0x7CE4E707AFA7E618, 0x109B40CDC5F1022C, 0xC52F79564FFF4C99, 0x9654AC2E296F1978, - 0x1C6F0C38B283B7C2, 0x6BF445DC9604BE69, 0x0D1BEEFB0421E124, 0xC12C8C8A95D98EBB, - 0xB96859EF9DB42DBA, 0xD0DFA46371271713, 0x233C6AF600EA3220, 0xD5588780552C6565, - 0x401F3751F212070D, 0xE6138263788254F7, 0x774E523D7F8FFA2F, 0xFE89384CE912D12C, - 0xA485F44080CDAB50, 0x07485C1AB5D2831D, 0x4AF2E5B8CA8EA0D7, 0x5918F4ED3485462E -}; - -#if defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) || \ - defined(i386) || defined(_X86_) || defined(__i386__) || defined(_X86_64_) -static const uint64_t refval_ia32aes_a[80] = { - 0x6a580668d6048674, 0x8400eaa9d99a9005, 0xe3ab9c06faf4d023, 
0x6af1c60874c95442, - 0xb3557e561a6c5d82, 0x0ae73c696f3d37c0, 0x5ef25f7062324941, 0x9b784f3b4ce6af33, - 0x6993bb206a74f070, 0xf1e95df109076c4c, 0x4e1eb70c58e48540, 0x5fdd7649d8ec44e4, - 0x559122c706343421, 0x380133d58665e93d, 0x9ce74296c8c55ae4, 0x3556f9a5757ab6d0, - 0xf62751f7f25c469e, 0x851eec67f6516d94, 0xed463ee3848a8695, 0xdc8791feff8ed3ac, - 0x2569c744e1a282cf, 0xf90eb7c1d70a80b9, 0x68dfa6a1b8050a4c, 0x94cca5e8210d2134, - 0xf5cc0beabc259f52, 0x40dbc1f51618fda7, 0x0807945bf0fb52c6, 0xe5ef7e09de70848d, - 0x63e1df35febe994a, 0x2025e73769720d5a, 0xad6120b2b8a152e1, 0x2a71d9f13959f2b7, - 0x8a20849a27c32548, 0x0bcbc9fe3b57884e, 0xa7bf2ddd8f00efc0, 0xb080ba4ffe8c091b, - 0x95c723d82e9e5642, 0xde3e2155d51a2b97, 0xa797bebfea95c7b6, 0x66a04b1c6fcbe618, - 0x0c56ab810681a051, 0x8d1121337a565265, 0x862a3c70eeb20df6, 0xdeb9b38a4989407f, - 0xdba1cf225470e4d0, 0x5f5d52d3885dd1c6, 0xd8a842b32a2480ab, 0x9107908035f2c6de, - 0x9c129a478ca541c2, 0xedec764bfac4bab7, 0xa13dba75b355e511, 0x831dd972eb408603, - 0x2dbb16bf2d928bc9, 0xe3d796db0d12d23a, 0xdf5404c52cf35e52, 0x6748b200122b76cc, - 0x4b8149aafdaea1cf, 0xa01bb26c5f447179, 0x72c97ff21010d6bb, 0x3e6fef0a984a2095, - 0xeb77ebfc0a478c74, 0xf4350a4102478864, 0xbcdfb3555789d1ff, 0x6246e4f758e508da, - 0x8cf2f2d389542441, 0x3e695ca1865d2208, 0x6aaab8f6a7e8382f, 0xfeb2b25ac5d377ee, - 0xd71cb9ef6e6ad9dd, 0x25e50673c0339c0f, 0x1ad9a860235a74a2, 0xac2164169775843e, - 0xa5248411f9e2ffd6, 0xfe6873b7d696b46f, 0x7cebac5d4f9b4a1a, 0x5ca6312e4199250c, - 0x7a27e4ca25d951a6, 0x4986a4d2835186e4, 0x839d0b22d7782adf, 0xa87a89fa41833a00 -}; - -static const uint64_t refval_ia32aes_b[80] = { - 0x6A580668D6048674, 0x8400EAA9D99A9005, 0xE3AB9C06FAF4D023, 0x6AF1C60874C95442, - 0xB3557E561A6C5D82, 0x0AE73C696F3D37C0, 0x5EF25F7062324941, 0x9B784F3B4CE6AF33, - 0x6993BB206A74F070, 0xF1E95DF109076C4C, 0x4E1EB70C58E48540, 0x5FDD7649D8EC44E4, - 0x559122C706343421, 0x380133D58665E93D, 0x9CE74296C8C55AE4, 0x3556F9A5757AB6D0, - 0xF62751F7F25C469E, 0x851EEC67F6516D94, 0xED463EE3848A8695, 0xDC8791FEFF8ED3AC, - 0x2569C744E1A282CF, 0xF90EB7C1D70A80B9, 0x68DFA6A1B8050A4C, 0x94CCA5E8210D2134, - 0xF5CC0BEABC259F52, 0x40DBC1F51618FDA7, 0x0807945BF0FB52C6, 0xE5EF7E09DE70848D, - 0x63E1DF35FEBE994A, 0x2025E73769720D5A, 0xAD6120B2B8A152E1, 0x2A71D9F13959F2B7, - 0x8A20849A27C32548, 0x0BCBC9FE3B57884E, 0xA7BF2DDD8F00EFC0, 0xB080BA4FFE8C091B, - 0x95C723D82E9E5642, 0xDE3E2155D51A2B97, 0xA797BEBFEA95C7B6, 0x66A04B1C6FCBE618, - 0x0C56AB810681A051, 0x8D1121337A565265, 0x862A3C70EEB20DF6, 0xDEB9B38A4989407F, - 0xDBA1CF225470E4D0, 0x5F5D52D3885DD1C6, 0xD8A842B32A2480AB, 0x9107908035F2C6DE, - 0x9C129A478CA541C2, 0x96BE74D0648425CF, 0x799411A7DEE1A5AA, 0x7DD3DAFB6FFA9FA1, - 0x6254D1E910037853, 0x0E7D66F901A0A28D, 0x7512F4034DEEB83E, 0xA98100FA36D06E9D, - 0x7BBC7C13961558CC, 0xD29283DF1F786E8A, 0x461BADAD5A64870B, 0x505CF0561F37E048, - 0x5A15964158B3BF1C, 0x870F80F9507259B6, 0x11DA16EE0507803B, 0xDF9FB89ED586FFAC, - 0x40EA802A0DC6EAF2, 0x7384D5FED96810B0, 0x3DAB55948E3CFA18, 0x961B9DF053FB6226, - 0xD5F398497BD71F91, 0xC6D30AC214F9C53E, 0xCB2966DE966D790A, 0x6AB7D42460A2D9AF, - 0xE53736761CD11758, 0xEB60C15D45991CC8, 0x2C4CE10BBA1F6330, 0x02F5B484E4AA8805, - 0xD671ED579D6185CF, 0x125700F2EFD42D3F, 0x0F8746461407741F, 0xC8878D76F1C0FCB6 -}; -#endif /* Any x86 */ - -/* *INDENT-ON* */ -/* clang-format on */ - -#if defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) || \ - defined(i386) || defined(_X86_) || defined(__i386__) || defined(_X86_64_) - -uint64_t t1ha0_ia32aes_noavx_a(const void *data, size_t length, 
uint64_t seed); -uint64_t t1ha0_ia32aes_noavx_b(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_ia32aes_avx_a(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_ia32aes_avx_b(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_ia32aes_avx2_a(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_ia32aes_avx2_b(const void *data, size_t length, uint64_t seed); - -#ifdef __GNUC__ -#include -#include -#elif defined(_MSC_VER) -#include + size_t buffer_size = +#if defined(_POSIX2_LINE_MAX) + _POSIX2_LINE_MAX +#elif defined(LINE_MAX) + LINE_MAX +#else + 4096 #endif + ; -int rdtscp_available; - -static uint64_t x86_cpu_features(void) { - uint32_t features = 0; - uint32_t extended = 0; - rdtscp_available = 0; -#ifdef __GNUC__ - uint32_t eax, ebx, ecx, edx; - const unsigned cpuid_max = __get_cpuid_max(0, NULL); - if (cpuid_max >= 1) { - __cpuid(0x80000001, eax, ebx, ecx, edx); - rdtscp_available = edx & (1 << 27); - __cpuid_count(1, 0, eax, ebx, features, edx); - if (cpuid_max >= 7) - __cpuid_count(7, 0, eax, extended, ecx, edx); - } -#elif defined(_MSC_VER) - int info[4]; - __cpuid(info, 0); - const unsigned cpuid_max = info[0]; - if (cpuid_max >= 1) { - __cpuid(info, 0x80000001); - rdtscp_available = info[3] & (1 << 27); - __cpuidex(info, 1, 0); - features = info[2]; - if (cpuid_max >= 7) { - __cpuidex(info, 7, 0); - extended = info[1]; + char *buffer = malloc(buffer_size); + if (!buffer) { + perror("malloc()"); + return EXIT_FAILURE; } - } -#endif - return features | (uint64_t)extended << 32; -} -#endif - -/***************************************************************************/ - -#if defined(_X86_64_) || defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) || defined(i386) || defined(_X86_) -unsigned bench(const char *caption, - uint64_t (*hash)(const void *, size_t, uint64_t), - const void *data, unsigned len, uint64_t seed) { - - printf("%24s: ", caption); - fflush(NULL); - - uint64_t min_ticks = UINT64_MAX; - unsigned stable_counter = 0; - - unsigned start_cpu, stop_cpu; - uint64_t start_tsc, stop_tsc; - - while (1) { - int unused[4]; -#ifdef _MSC_VER - __cpuid(unused, 0); -#else - __cpuid(0, unused[0], unused[1], unused[2], unused[3]); -#endif + if (1 > printf("# %s '--hash-stdin' using %s()\n", argv[0], hash_name)) { + perror("printf(stdout)"); + return EXIT_FAILURE; + } - start_tsc = __rdtscp(&start_cpu); - hash(data, len, seed); - stop_tsc = __rdtscp(&stop_cpu); -#ifdef _MSC_VER - __cpuid(unused, 0); + while (!feof(stdin)) { +#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200809L) || \ + (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 700) + ssize_t bytes = getline(&buffer, &buffer_size, stdin); + if (bytes < 0) { + if (feof(stdin)) + break; + perror("getline(stdin)"); + return EXIT_FAILURE; + } #else - __cpuid(0, unused[0], unused[1], unused[2], unused[3]); + if (!fgets(buffer, (int)buffer_size, stdin)) { + if (feof(stdin)) + break; + perror("fgets(stdin)"); + return EXIT_FAILURE; + } + size_t bytes = strlen(buffer); #endif - if (start_cpu != stop_cpu || stop_tsc <= start_tsc) - continue; - - uint64_t ticks = stop_tsc - start_tsc; - if (min_ticks > ticks) { - min_ticks = ticks; - stable_counter = 0; - continue; + if (1 > printf("%016" PRIx64 "\n", + hash_function(buffer, bytes, 42 /* seed */))) { + perror("printf(stdout)"); + return EXIT_FAILURE; + } } - if (++stable_counter == 10000) - break; + free(buffer); + return EXIT_SUCCESS; } - printf("%7" PRIu64 " ticks, %7.4f clk/byte, %7.3f Gb/s 
@3GHz\n", min_ticks, - (double)min_ticks / len, 3.0 * len / min_ticks); - fflush(NULL); - - return (min_ticks < INT32_MAX) ? (unsigned)min_ticks : UINT32_MAX; -} - -#endif /* x86 for t1ha_ia32aes */ + /*************************************************************************/ -/***************************************************************************/ - -int main(int argc, const char *argv[]) { - (void)argc; - (void)argv; - int failed = 0; - failed |= test("t1ha1_64le", t1ha1_le, refval_64le); - failed |= test("t1ha1_64be", t1ha1_be, refval_64be); - failed |= test("t1ha0_32le", t1ha0_32le, refval_32le); - failed |= test("t1ha0_32be", t1ha0_32be, refval_32be); + printf("\nPreparing to benchmarking...\n"); + fflush(NULL); + if (!mera_init()) { + printf(" - sorry, usable clock-source unavailable\n"); + return EXIT_SUCCESS; + } -#if defined(_X86_64_) || defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || (defined(_M_IX86) && _MSC_VER > 1800) || \ - defined(i386) || defined(_X86_) + if (mera.cpunum >= 0) + printf(" - running on CPU#%d\n", mera.cpunum); + printf(" - use %s as clock source for benchmarking\n", mera.source); + printf(" - assume it %s and %s\n", + (mera.flags & timestamp_clock_cheap) ? "cheap" : "costly", + (mera.flags & timestamp_clock_stable) + ? "stable" + : "floating (RESULTS MAY VARY AND BE USELESS)"); - const uint64_t features = x86_cpu_features(); - if (features & UINT32_C(0x02000000)) { - failed |= - test("t1ha0_ia32aes_noavx", t1ha0_ia32aes_noavx, refval_ia32aes_a); - if ((features & UINT32_C(0x1A000000)) == UINT32_C(0x1A000000)) { - failed |= test("t1ha0_ia32aes_avx", t1ha0_ia32aes_avx, refval_ia32aes_a); - if ((features >> 32) & 32) - failed |= - test("t1ha0_ia32aes_avx2", t1ha0_ia32aes_avx2, refval_ia32aes_b); - } + printf(" - measure granularity and overhead: "); + fflush(NULL); + double mats /* MeasurAble TimeSlice */ = bench_mats(); + printf("%g %s, %g iteration/%s\n", mats, mera.units, 1 / mats, mera.units); + + if (is_option_set(bench_verbose)) { + printf(" - convergence: "); + if (mera_bci.retry_count) + printf("retries %u, ", mera_bci.retry_count); + printf("restarts %u, accounted-loops %u, worthless-loops %u, spent <%us\n", + mera_bci.restart_count, mera_bci.overhead_accounted_loops, + mera_bci.overhead_worthless_loops, mera_bci.spent_seconds); + printf(" - mats/overhead: best %" PRIu64 ", gate %" PRIu64 + ", inner-loops-max %u, best-count %u\n", + mera_bci.overhead_best, mera_bci.overhead_gate, + mera_bci.overhead_loops_max, mera_bci.overhead_best_count); } + fflush(NULL); #if !defined(__OPTIMIZE__) && (defined(_MSC_VER) && defined(_DEBUG)) + bench_size(1, "Non-optimized/Debug"); printf("\nNon-optimized/Debug build, skip benchmark\n"); #else - if (!rdtscp_available) { - printf("\nNo RDTSCP available on CPU, skip benchmark\n"); - } else { - const unsigned large = 1024 * 256; - const unsigned medium = 127; - const unsigned small = 31; - char *buffer = malloc(large); - for (unsigned i = 0; i < large; ++i) - buffer[i] = (char)(rand() + i); - - printf("\nSimple bench for x86 (large keys, %u bytes):\n", large); - bench("t1ha1_64le", t1ha1_le, buffer, large, 42); - bench("t1ha1_64be", t1ha1_be, buffer, large, 42); - bench("t1ha0_32le", t1ha0_32le, buffer, large, 42); - bench("t1ha0_32be", t1ha0_32be, buffer, large, 42); - - printf("\nSimple bench for x86 (small keys, %u bytes):\n", small); - bench("t1ha1_64le", t1ha1_le, buffer, small, 42); - bench("t1ha1_64be", t1ha1_be, buffer, small, 42); - bench("t1ha0_32le", t1ha0_32le, buffer, small, 42); - 
bench("t1ha0_32be", t1ha0_32be, buffer, small, 42); - - if (features & UINT32_C(0x02000000)) { - printf("\nSimple bench for AES-NI (medium keys, %u bytes):\n", medium); - bench("t1ha0_ia32aes_noavx_a", t1ha0_ia32aes_noavx_a, buffer, medium, 42); - bench("t1ha0_ia32aes_noavx_b", t1ha0_ia32aes_noavx_b, buffer, medium, 42); - bench("t1ha0_ia32aes_noavx", t1ha0_ia32aes_noavx, buffer, medium, 42); - if ((features & UINT32_C(0x1A000000)) == UINT32_C(0x1A000000)) { - bench("t1ha0_ia32aes_avx_a", t1ha0_ia32aes_avx_a, buffer, medium, 42); - bench("t1ha0_ia32aes_avx_b", t1ha0_ia32aes_avx_b, buffer, medium, 42); - bench("t1ha0_ia32aes_avx", t1ha0_ia32aes_avx, buffer, medium, 42); - if ((features >> 32) & 32) { - bench("t1ha0_ia32aes_avx2_a", t1ha0_ia32aes_avx2_a, buffer, medium, - 42); - bench("t1ha0_ia32aes_avx2_b", t1ha0_ia32aes_avx2_b, buffer, medium, - 42); - bench("t1ha0_ia32aes_avx2", t1ha0_ia32aes_avx2, buffer, medium, 42); - } - } - - printf("\nSimple bench for AES-NI (large keys, %u bytes):\n", large); - bench("t1ha0_ia32aes_noavx_a", t1ha0_ia32aes_noavx_a, buffer, large, 42); - bench("t1ha0_ia32aes_noavx_b", t1ha0_ia32aes_noavx_b, buffer, large, 42); - bench("t1ha0_ia32aes_noavx", t1ha0_ia32aes_noavx, buffer, large, 42); - if ((features & UINT32_C(0x1A000000)) == UINT32_C(0x1A000000)) { - bench("t1ha0_ia32aes_avx_a", t1ha0_ia32aes_avx_a, buffer, large, 42); - bench("t1ha0_ia32aes_avx_b", t1ha0_ia32aes_avx_b, buffer, large, 42); - bench("t1ha0_ia32aes_avx", t1ha0_ia32aes_avx, buffer, large, 42); - if ((features >> 32) & 32) { - bench("t1ha0_ia32aes_avx2_a", t1ha0_ia32aes_avx2_a, buffer, large, - 42); - bench("t1ha0_ia32aes_avx2_b", t1ha0_ia32aes_avx2_b, buffer, large, - 42); - bench("t1ha0_ia32aes_avx2", t1ha0_ia32aes_avx2, buffer, large, 42); - } - } - } - - free(buffer); - } + if (is_option_set(bench_tiny)) + bench_size(5, "tiny"); + if (is_option_set(bench_small)) + bench_size(31, "small"); + if (is_option_set(bench_medium)) + bench_size(1024, "medium"); + if (is_option_set(bench_large)) + bench_size(1024 * 16, "large"); + if (is_option_set(bench_huge)) + bench_size(1024 * 256, "huge"); #endif /* __OPTIMIZE__ */ -#endif /* x86 for t1ha_ia32aes */ - return failed ? EXIT_FAILURE : EXIT_SUCCESS; + return EXIT_SUCCESS; } diff --git a/tests/mera.c b/tests/mera.c new file mode 100644 index 0000000..966302e --- /dev/null +++ b/tests/mera.c @@ -0,0 +1,1635 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev , + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#ifndef NOMINMAX +#define NOMINMAX +#endif + +#ifndef _ISOC99_SOURCE +#define _ISOC99_SOURCE 1 +#endif + +#ifndef _ISOC11_SOURCE +#define _ISOC11_SOURCE 1 +#endif + +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif + +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#ifndef _THREAD_SAFE +#define _THREAD_SAFE 1 +#endif + +#ifndef _REENTRANT +#define _REENTRANT 1 +#endif + +#if defined(_MSC_VER) +#pragma warning(disable : 4711) /* function 'xyz' selected for \ automatic inline expansion */ +#pragma warning(disable : 4127) /* conditional expression is constant */ +#pragma warning(disable : 4702) /* unreachable code */ +#if _MSC_VER < 1900 +#define snprintf _snprintf +#pragma warning(disable : 4996) /* '_snprintf': This function or variable \ may be unsafe */ +#endif +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#endif /* MSVC */ + +/* OS's includes for time/clock */ +#if defined(__linux__) || defined(__gnu_linux__) +#include +#include +#include +#include +#include +#endif /* Linux */ + +#if defined(EMSCRIPTEN) +#include +#elif defined(__APPLE__) || defined(__MACH__) +#include +#include +#endif + +#if defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif /* OS */ + +#include "mera.h" +#include +#include +#include + +#include "common.h" + +/*****************************************************************************/ + +/* Compiler's includes for builtins/intrinsics */ +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +#include +#elif __GNUC_PREREQ(4, 4) || defined(__clang__) +#if defined(__ia32__) || defined(__e2k__) +#include +#endif /* __ia32__ */ +#if defined(__ia32__) +#include +#endif /* __ia32__ */ +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) +#include +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) +#include +#elif defined(__IBMC__) && defined(__powerpc) +#include +#elif defined(_AIX) +#include +#include +#elif (defined(__osf__) && defined(__DECC)) || defined(__alpha) +#include +#include +#elif defined(__MWERKS__) +/* CodeWarrior - troubles ? */ +#pragma gcc_extensions +#elif defined(__SNC__) +/* Sony PS3 - troubles ? */ +#elif defined(__hppa__) || defined(__hppa) +#include +#else +#error Unsupported C compiler, please use GNU C 4.4 or newer +#endif /* Compiler */ + +static __inline void compiler_barrier(void) { +#if defined(__clang__) || defined(__GNUC__) + __asm__ __volatile__("" ::: "memory"); +#elif defined(_MSC_VER) + _ReadWriteBarrier(); +#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ + __memory_barrier(); +#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) + __mf(); +#elif defined(__i386__) || defined(__x86_64__) + _mm_mfence(); +#else +#error "Unknown target for Intel Compiler, please report to us." +#endif +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) + __compiler_barrier(); +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) + _Asm_sched_fence(/* LY: no-arg meaning 'all except ALU', e.g.
0x3D3D */); +#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \ + defined(__ppc64__) || defined(__powerpc64__) + __fence(); +#else +#error "Could not guess the kind of compiler, please report to us." +#endif +} + +#ifndef likely +#if defined(__GNUC__) || defined(__clang__) +#define likely(cond) __builtin_expect(!!(cond), 1) +#else +#define likely(x) (x) +#endif +#endif /* likely */ + +#ifndef unlikely +#if defined(__GNUC__) || defined(__clang__) +#define unlikely(cond) __builtin_expect(!!(cond), 0) +#else +#define unlikely(x) (x) +#endif +#endif /* unlikely */ + +/*****************************************************************************/ + +#if defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) +static unsigned seh_filter(unsigned exception_code) { + switch (exception_code) { + case EXCEPTION_ILLEGAL_INSTRUCTION: + case EXCEPTION_PRIV_INSTRUCTION: + case EXCEPTION_ACCESS_VIOLATION: + return EXCEPTION_EXECUTE_HANDLER; + } + return EXCEPTION_CONTINUE_SEARCH; +} +#else +static sigjmp_buf sigaction_jump; +static void sigaction_handler(int signum, siginfo_t *info, void *context) { + (void)context; + (void)info; + siglongjmp(sigaction_jump, signum); +} +#endif + +/* LY: dedicated function to avoid clobber args by ‘longjmp’ */ +static int do_probe(unsigned (*start)(timestamp_t *), + unsigned (*finish)(timestamp_t *)) { +#if defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) + __try { +#else + struct sigaction act, prev_sigsegv, prev_sigill, prev_sigbus; + memset(&act, 0, sizeof(act)); + act.sa_sigaction = sigaction_handler; + if (sigaction(SIGSEGV, &act, &prev_sigsegv)) { + perror(MERA_PERROR_PREFIX "sigaction(SIGSEGV)"); + return -1; + } + if (sigaction(SIGILL, &act, &prev_sigill)) { + perror(MERA_PERROR_PREFIX "sigaction(SIGILL)"); + return -1; + } + if (sigaction(SIGBUS, &act, &prev_sigbus)) { + perror(MERA_PERROR_PREFIX "sigaction(SIGBUS)"); + return -1; + } + + if (sigsetjmp(sigaction_jump, 0) != 0) { + sigaction(SIGSEGV, &prev_sigsegv, NULL); + sigaction(SIGILL, &prev_sigill, NULL); + sigaction(SIGBUS, &prev_sigbus, NULL); + return -2; + } +#endif + + for (unsigned n = 0; n < 42; ++n) { + timestamp_t timestamp_start, timestamp_finish; + unsigned coreid = start(×tamp_start); +#if defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) + Sleep(1); +#else + usleep(42); +#endif + if (coreid != finish(×tamp_finish)) + continue; + if (timestamp_finish > timestamp_start) + return 1; + if (timestamp_finish == timestamp_start || n > 5) + break; + } + +#if defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) + } __except (seh_filter(GetExceptionCode())) { + return -2; + } +#else + sigaction(SIGSEGV, &prev_sigsegv, NULL); + sigaction(SIGILL, &prev_sigill, NULL); + sigaction(SIGBUS, &prev_sigbus, NULL); +#endif + return 0; +} + +static bool probe(unsigned (*start)(timestamp_t *), + unsigned (*finish)(timestamp_t *), + double (*convert)(timestamp_t), unsigned flags, + const char *source_name, const char *time_units) { + + if (is_option_set(bench_verbose)) { + printf(" - probe for %s", source_name); + fflush(stdout); + } + + flags |= timestamp_clock_have; + if (mera.flags >= flags) { + if (is_option_set(bench_verbose)) + printf(": Skip (already have)\n"); + return false; + } + + int rc = do_probe(start, finish); + switch (rc) { + case 1: + if (is_option_set(bench_verbose)) + printf(": Ok\n"); + mera.start = start; + mera.finish = finish; + 
mera.source = source_name; + mera.convert = convert; + if (flags & timestamp_cycles) + mera.units = "cycle"; + else if (flags & timestamp_ticks) + mera.units = "tick"; + else + mera.units = time_units; + mera.flags = flags; + return true; + case 0: + if (is_option_set(bench_verbose)) + printf(": Doesn't work\n"); + break; + case -2: + if (is_option_set(bench_verbose)) + printf(": Not available (SIGSEGV/SIGILL)\n"); + break; + } + return false; +} + +/*****************************************************************************/ + +static int set_single_affinity(void) { +#if defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) + return -1; +#elif defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID__) + const int current_cpu = sched_getcpu(); + if (current_cpu < 0) { + perror(MERA_PERROR_PREFIX "sched_getcpu()"); + return -1; + } + const int ncpu = sysconf(_SC_NPROCESSORS_CONF); + const unsigned cpuset_size = CPU_ALLOC_SIZE(ncpu); + cpu_set_t *affinity = CPU_ALLOC(ncpu); + if (!affinity) { + perror(MERA_PERROR_PREFIX "CPU_ALLOC()"); + return -1; + } + CPU_ZERO_S(cpuset_size, affinity); + CPU_SET_S(current_cpu, cpuset_size, affinity); + if (sched_setaffinity(0, cpuset_size, affinity)) { + perror(MERA_PERROR_PREFIX "sched_setaffinity()"); + CPU_FREE(affinity); + return -1; + } + CPU_FREE(affinity); + return current_cpu; +#elif defined(__APPLE__) || defined(__MACH__) + return -1; +#else + return -1; +#endif +} + +/*****************************************************************************/ + +union timestamp { + uint64_t u64; + struct { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + uint32_t h, l; +#else + uint32_t l, h; +#endif + } u32; +}; + +#if defined(EMSCRIPTEN) +static unsigned clock_emscripten(timestamp_t *now) { + compiler_barrier(); + *now = (timestamp_t)(emscripten_get_now() * 1e6); + compiler_barrier(); + return 0; +} +#endif /* EMSCRIPTEN */ + +#if defined(__APPLE__) || defined(__MACH__) +static unsigned clock_mach(timestamp_t *now) { + compiler_barrier(); + *now = mach_absolute_time(); + compiler_barrier(); + return 0; +} + +static double convert_mach(timestamp_t timestamp) { + static double ratio /* from mach_absolute_time() to nanoseconds */; + if (!ratio) { + mach_timebase_info_data_t ti; + if (mach_timebase_info(&ti) != 0) { + perror(MERA_PERROR_PREFIX "mach_timebase_info()"); + return -1; + } + ratio = (double)ti.numer / ti.denom; + } + return ratio * (double)timestamp; +} +#endif /* defined(__APPLE__) || defined(__MACH__) */ + +#if defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) +static unsigned clock_windows(timestamp_t *now) { + compiler_barrier(); + if (!QueryPerformanceCounter((LARGE_INTEGER *)now)) { + perror(MERA_PERROR_PREFIX "QueryPerformanceCounter()"); + *now = 42; + } + compiler_barrier(); + return 0; +} + +static double convert_windows(timestamp_t timestamp) { + static double ratio /* from QueryPerformanceCounter() to nanoseconds */; + if (!ratio) { + LARGE_INTEGER frequency; + if (!QueryPerformanceFrequency(&frequency)) { + perror(MERA_PERROR_PREFIX "QueryPerformanceFrequency()"); + return -1; + } + ratio = 1e9 / frequency.QuadPart; + } + return ratio * (double)timestamp; +} + +#else /* Windows */ + +static unsigned clock_gettimeofday(timestamp_t *now) { + compiler_barrier(); + struct timeval tv; + if (gettimeofday(&tv, NULL)) { + perror(MERA_PERROR_PREFIX "gettimeofday()"); + tv.tv_sec = tv.tv_usec = 0; + } + *now = tv.tv_sec * UINT64_C(1000000) + tv.tv_usec; +
compiler_barrier(); + return 0; +} + +static double convert_us2ns(timestamp_t timestamp) { return 1e3 * timestamp; } + +#endif /* ! Windows */ + +#if defined(TIMEBASE_SZ) || defined(__OS400__) +static unsigned clock_os400(timestamp_t *now) { + compiler_barrier(); + timebasestruct_t tb; + if (read_wall_time(&tb, TIMEBASE_SZ) != 0) { + perror(MERA_PERROR_PREFIX "read_wall_time(TIMEBASE_SZ)"); + abort(); + } + union timestamp *u = (union timestamp *)now; + u->u32.h = tb.tb_high; + u->u32.l = tb.tb_low; + compiler_barrier(); + return 0; +} + +static double convert_os400(timestamp_t timestamp) { + static double ratio /* from read_wall_time() to nanoseconds */; + if (!ratio) { + timebasestruct_t tb; + tb.tb_high = 0x7fff; + tb.tb_low = 0; + if (time_base_to_time(&tb, TIMEBASE_SZ) != 0) { + perror(MERA_PERROR_PREFIX "time_base_to_time()"); + abort(); + } + ratio = (tb.tb_high * 1e9 + tb.tb_low) / UINT64_C(0x7fff00000000); + } + return ratio * (double)timestamp; +} +#endif /* __OS400__ */ + +#if defined(CLOCK_MONOTONIC) || defined(CLOCK_MONOTONIC_RAW) || \ + defined(CLOCK_SGI_CYCLE) +static clockid_t posix_clockid = CLOCK_REALTIME; +static unsigned clock_posix(timestamp_t *now) { + compiler_barrier(); + struct timespec ts; + if (clock_gettime(posix_clockid, &ts)) { + perror(MERA_PERROR_PREFIX "clock_gettime()"); + ts.tv_sec = ts.tv_nsec = 0; + } + *now = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; + compiler_barrier(); + return 0; +} +#endif /* CLOCK_MONOTONIC || CLOCK_MONOTONIC_RAW || CLOCK_SGI_CYCLE */ + +#if defined(__sun__) || defined(__sun) +static unsigned clock_solaris(timestamp_t *now) { + compiler_barrier(); + *now = gethrtime(); + compiler_barrier(); + return 0; +} +#endif /* __sun__ */ + +/*****************************************************************************/ + +#if defined(__e2k__) || defined(__elbrus__) +static unsigned clock_elbrus(timestamp_t *now) { + compiler_barrier(); + unsigned coreid; + *now = __rdtscp(&coreid); + compiler_barrier(); + return coreid; +} +#endif /* __e2k__ || __elbrus__ */ + +#if (defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) || \ + defined(__powerpc64)) +static unsigned clock_powerpc64_mfspr268(timestamp_t *now) { + compiler_barrier(); +#if defined(__GNUC__) + uint64_t ticks; + __asm __volatile("mfspr %0, 268" : "=r"(ticks)); + *now = ticks; +#else + *now = 42 /* FIXME */; +#endif + compiler_barrier(); + return 0; +} +#endif /* __powerpc64__ */ + +#if (defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) || \ + defined(__ppc)) +static unsigned clock_powerpc_mftb(timestamp_t *now) { + /* A time-base timer, which is not always precisely a cycle-count.
*/ + compiler_barrier(); +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul +#if defined(__GNUC__) + uint64_t ticks; + __asm __volatile("mftb %0" : "=r"(ticks)); + *now = ticks; +#else + *now = 42 /* FIXME */; +#endif + +#else + +#if defined(__GNUC__) + uint32_t low, high_before, high_after; + __asm __volatile("mftbu %0; mftb %1; mftbu %2" + : "=r"(high_before), "=r"(low), "=r"(high_after)); + union timestamp *u = (union timestamp *)now; + u->u32.h = high_after; + u->u32.l = low & /* zeroes if high part has changed */ + ~(high_before - high_after); +#else + *now = 42 /* FIXME */; +#endif +#endif + compiler_barrier(); + return 0; +} +#endif /* __powerpc__ */ + +#if defined(__sparc__) || defined(__sparc) || defined(__sparc64__) || \ + defined(__sparc64) || defined(__sparc_v8plus__) || \ + defined(__sparc_v8plus) || defined(__sparc_v8plusa__) || \ + defined(__sparc_v8plusa) || defined(__sparc_v9__) || defined(__sparc_v9) +static unsigned clock_sparc(timestamp_t *now) { + compiler_barrier(); + union { + uint64_t i64; + struct { + uint32_t high; + uint32_t low; + } i32; + } cycles; +#ifndef __GNUC__ +#warning FIXME +#else + +#if defined(__sparc_v8plus__) || defined(__sparc_v8plusa__) || \ + defined(__sparc_v9__) || defined(__sparc_v8plus) || \ + defined(__sparc_v8plusa) || defined(__sparc_v9) + +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || \ + defined(__sparc64__) || defined(__sparc64) + __asm __volatile("rd %%tick, %0" : "=r"(cycles.i64)); +#else + __asm __volatile("rd %%tick, %1; srlx %1, 32, %0" + : "=r"(cycles.i32.high), "=r"(cycles.i32.low)); +#endif /* __sparc64__ */ + +#else + __asm __volatile(".byte 0x83, 0x41, 0x00, 0x00; mov %%g1, %0" + : "=r"(cycles.i64) + : + : "%g1"); +#endif /* __sparc_v8plus__ || __sparc_v9__ */ +#endif /* GCC */ + *now = cycles.i64; + compiler_barrier(); + return 0; +} +#endif /* __sparc__ */ + +#if defined(__ia64__) || defined(__ia64) +static unsigned clock_ia64(timestamp_t *now) { + compiler_barrier(); +#if defined(__GNUC__) + uint64_t ticks; + __asm __volatile("mov %0 = ar.itc" : "=r"(ticks)); + *now = ticks; +#elif defined(__EDG_VERSION) || defined(__ECC) || defined(IA64_REG_AR_ITC) + *now = __getReg(_IA64_REG_AR_ITC); +#elif defined(__hpux) || defined(_AREG_ITC) + *now = _Asm_mov_from_ar(_AREG_ITC); +#else + *now = 42 /* FIXME */; +#endif + compiler_barrier(); + return 0; +} +#endif /* __ia64__ */ + +#if (defined(__hppa__) || defined(__hppa) || defined(__hppa64__) || \ + defined(__hppa64)) +static unsigned clock_hppa(timestamp_t *now) { + compiler_barrier(); + uint64_t cycles; +#ifdef __GNUC__ + __asm __volatile("mfctl 16, %0" : "=r"(cycles)); +#else + _MFCTL(16, cycles); +#endif + *now = cycles; + compiler_barrier(); + return 0; +} +#endif /* __hppa__ */ + +#if defined(__s390__) || defined(__s390) +static unsigned clock_stcke(timestamp_t *now) { + compiler_barrier(); + uint8_t clk[16]; +#ifdef __GNUC__ + __asm __volatile("stcke %0" : "=Q"(clk) : : "cc"); +#else +#warning FIXME +#endif + *now = *((unsigned long long *)&clk[1]) >> 2; + compiler_barrier(); + return (clk[14] << 8) | (clk[15]); +} + +static unsigned clock_stckf(timestamp_t *now) { + compiler_barrier(); +#ifdef __GNUC__ + __asm __volatile("stckf 0(%1)" : "=m"(*now) : "a"(now) : "cc"); +#else +#warning FIXME +#endif + compiler_barrier(); + return 0; +} + +static unsigned clock_stck(timestamp_t *now) { + compiler_barrier(); +#ifdef __GNUC__ + __asm __volatile("stck 0(%1)" : "=m"(*now) : "a"(now) : "cc"); +#else +#warning FIXME +#endif + compiler_barrier(); + return 0; +}
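+ +/* Note: the unsigned result of these clock_* readers is an opaque core/clock-domain id (clock_stcke() derives it from the trailing bytes of the stored clock value, the others simply return 0); do_probe() and mera_bench() compare the ids returned at the start and at the finish of each sample and retry when they differ. */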
+#endif /* __s390__ */ + +#if defined(__alpha__) || defined(__alpha) +static unsigned clock_alpha(timestamp_t *now) { + compiler_barrier(); +#ifdef __GNUC__ + unsigned long cycles; + __asm__ __volatile("rpcc %0" : "=r"(cycles)); + *now = cycles & 0xFFFFfffful; +#else + *now = 42 /* FIXME */; +#endif + compiler_barrier(); + return 0; +} +#endif /* __alpha__ */ + +/*****************************************************************************/ + +static double convert_1to1(timestamp_t timestamp) { return (double)timestamp; } + +#if (defined(__ARM_ARCH) && __ARM_ARCH > 5 && __ARM_ARCH < 8) || defined(_M_ARM) +static unsigned clock_pmccntr(timestamp_t *now) { + compiler_barrier(); +#ifdef _M_ARM + *now = __rdpmccntr64(); +#else + unsigned long pmccntr; + __asm __volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); + *now = (uint64_t)pmccntr; +#endif + compiler_barrier(); + return 0; +} + +static double convert_pmccntr_x64(timestamp_t timestamp) { + /* The counter is set up to count every 64th cycle */ + return timestamp * 64.0; +} +#endif /* __ARM_ARCH >= 6 || _M_ARM */ + +#if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 7) || \ + defined(_M_ARM64) +static unsigned clock_cntvct_el0(timestamp_t *now) { + compiler_barrier(); +/* System timer of ARMv8 runs at a different frequency than the CPU's. + * The frequency is fixed, typically in the range 1-50MHz. It can be + * read at CNTFRQ special register. We assume the OS has set up + * the virtual timer properly. */ +#ifdef _M_ARM64 + *now = _ReadStatusReg(42 /* FIXME: cntvct_el0 */); +#else + uint64_t virtual_timer; + __asm __volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer)); + *now = virtual_timer; +#endif + compiler_barrier(); + return 0; +} +#endif /* __aarch64__ || __ARM_ARCH > 7 || _M_ARM64 */ + +#if defined(__mips__) || defined(__mips) || defined(_R4000) + +#if defined(PROT_READ) && defined(MAP_SHARED) +static volatile uint64_t *mips_tsc_addr; +static unsigned clock_zbustimer(timestamp_t *now) { + compiler_barrier(); + *now = *mips_tsc_addr; + compiler_barrier(); + return 0; +} +#endif /* PROT_READ && MAP_SHARED */ + +#if (defined(_MIPS_ISA) && defined(_MIPS_ISA_MIPS2) && \ + _MIPS_ISA >= _MIPS_ISA_MIPS2) || \ + (defined(__mips) && __mips >= 2 && __mips < 16) || defined(_R4000) || \ + defined(__MIPS_ISA2) || defined(__MIPS_ISA3) || defined(__MIPS_ISA4) || \ + (defined(__mips_isa_rev) && __mips_isa_rev >= 2) + +static unsigned clock_mfc0_25_1(timestamp_t *now) { + compiler_barrier(); +#if (defined(_MIPS_SIM) && defined(_ABI64) && _MIPS_SIM == _ABI64) || \ + (defined(_MIPS_SIM) && defined(_ABIO64) && _MIPS_SIM == _ABIO64) || \ + defined(__mips64) || defined(__mips64__) || \ + (defined(__mips) && (__mips >= 64)) + uint64_t count; + __asm __volatile("dmfc0 %0, $25, 1" : "=r"(count)); +#else + uint32_t count; + __asm __volatile("mfc0 %0, $25, 1" : "=r"(count)); +#endif + *now = count; + compiler_barrier(); + return 0; +} + +static unsigned clock_mfc0_9_0(timestamp_t *now) { + compiler_barrier(); +#if (defined(_MIPS_SIM) && defined(_ABI64) && _MIPS_SIM == _ABI64) || \ + (defined(_MIPS_SIM) && defined(_ABIO64) && _MIPS_SIM == _ABIO64) || \ + defined(__mips64) || defined(__mips64__) || \ + (defined(__mips) && (__mips >= 64)) + uint64_t count; + __asm __volatile("dmfc0 %0, $9, 0" : "=r"(count)); +#else + uint32_t count; + __asm __volatile("mfc0 %0, $9, 0" : "=r"(count)); +#endif + *now = count; + compiler_barrier(); + return 0; +} + +static unsigned mips_rdhwr_resolution; +static unsigned clock_rdhwr(timestamp_t *now) { + 
compiler_barrier(); + unsigned count, coreid; + __asm __volatile("rdhwr %0, $2; rdhwr %1, $0" : "=r"(count), "=r"(coreid)); + *now = count; + compiler_barrier(); + return coreid; +} + +static double convert_rdhwr(timestamp_t timestamp) { + return (double)timestamp * mips_rdhwr_resolution; +} +#endif /* MIPS >= 2 */ + +#endif /* MIPS */ + +#if defined(__ia32__) + +enum ia32_fixed_perfomance_counters { + /* count of retired instructions on the current core in the low-order 48 bits + of an unsigned 64-bit integer */ + ia32_COUNT_HW_INSTRUCTIONS = 1 << 30, + + /* count of actual CPU core cycles executed by the current core. Core cycles + are not accumulated while the processor is in the "HALT" state, which is + used when the operating system has no task(s) to run on a processor core. + */ + ia32_COUNT_HW_CPU_CYCLES = (1 << 30) + 1, + + /* count of "reference" (or "nominal") CPU core cycles executed by the current + core. This counts at the same rate as the TSC, but does not count when the + core is in the "HALT" state. If a timed section of code shows a larger + change in TSC than in rdpmc_reference_cycles, the processor probably spent + some time in a HALT state. */ + ia32_COUNT_HW_REF_CPU_CYCLES = (1 << 30) + 2, +}; + +static unsigned clock_rdpmc_start(timestamp_t *now) { + compiler_barrier(); +#if __GNUC__ + uint32_t low, high; + __asm __volatile("cpuid" ::: "%eax", "%ebx", "%ecx", "%edx"); + __asm __volatile("rdpmc" + : "=a"(low), "=d"(high) + : "c"(ia32_COUNT_HW_CPU_CYCLES)); + union timestamp *u = (union timestamp *)now; + u->u32.l = low; + u->u32.h = high; +#elif defined(_MSC_VER) + int unused[4]; + __cpuid(unused, 0); + *now = __readpmc(ia32_COUNT_HW_CPU_CYCLES); +#else +#error "FIXME: Unsupported compiler" +#endif + compiler_barrier(); + return 0; +} + +static unsigned clock_rdpmc_finish(timestamp_t *now) { + compiler_barrier(); +#if __GNUC__ + uint32_t low, high; + __asm __volatile("mov %2, %%ecx; rdpmc; mov %%eax, %0; mov %%edx, %1; cpuid" + : "=r"(low), "=r"(high) + : "i"(ia32_COUNT_HW_CPU_CYCLES) + : "%eax", "%ebx", "%ecx", "%edx"); + union timestamp *u = (union timestamp *)now; + u->u32.l = low; + u->u32.h = high; +#elif defined(_MSC_VER) + *now = __readpmc(ia32_COUNT_HW_CPU_CYCLES); + int unused[4]; + __cpuid(unused, 0); +#else +#error "FIXME: Unsupported compiler" +#endif + return 0; +} + +static unsigned clock_rdtscp_start(timestamp_t *now) { + compiler_barrier(); + unsigned coreid; + *now = __rdtscp(&coreid); + return coreid; +} + +static unsigned clock_rdtscp_finish(timestamp_t *now) { + compiler_barrier(); +#if __GNUC__ + uint32_t low, high, coreid; + __asm __volatile("rdtscp; mov %%eax, %0; mov %%edx, %1; mov %%ecx, %2; cpuid" + : "=r"(low), "=r"(high), "=r"(coreid) + : + : "%eax", "%ebx", "%ecx", "%edx"); + union timestamp *u = (union timestamp *)now; + u->u32.l = low; + u->u32.h = high; + return coreid; +#elif defined(_MSC_VER) + unsigned coreid; + *now = __rdtscp(&coreid); + int unused[4]; + __cpuid(unused, 0); + return coreid; +#else +#error "FIXME: Unsupported compiler" +#endif +} + +static unsigned clock_rdtsc_start(timestamp_t *now) { + compiler_barrier(); +#if __GNUC__ + uint32_t low, high; + __asm __volatile("cpuid; rdtsc" : "=a"(low), "=d"(high) : : "%ebx", "%ecx"); + union timestamp *u = (union timestamp *)now; + u->u32.l = low; + u->u32.h = high; +#elif defined(_MSC_VER) + int unused[4]; + __cpuid(unused, 0); + *now = __rdtsc(); +#else +#error "FIXME: Unsupported compiler" +#endif + compiler_barrier(); + return 0; +} + +static unsigned 
clock_rdtsc_finish(timestamp_t *now) { + compiler_barrier(); +#if __GNUC__ + uint32_t low, high; + __asm __volatile("rdtsc; mov %%eax, %0; mov %%edx, %1; cpuid" + : "=r"(low), "=r"(high) + : + : "%eax", "%ebx", "%ecx", "%edx"); + union timestamp *u = (union timestamp *)now; + u->u32.l = low; + u->u32.h = high; +#elif defined(_MSC_VER) + int unused[4]; + __cpuid(unused, 0); + *now = __rdtsc(); +#else +#error "FIXME: Unsupported compiler" +#endif + compiler_barrier(); + return 0; +} + +ia32_cpu_features_t ia32_cpu_features; + +void ia32_fetch_cpu_features(void) { + memset(&ia32_cpu_features, 0, sizeof(ia32_cpu_features)); +#ifdef __GNUC__ + uint32_t unused_eax, unused_ebx, cpuid_max; + + cpuid_max = __get_cpuid_max(0, NULL); + if (cpuid_max >= 1) { + __cpuid_count(1, 0, unused_eax, ia32_cpu_features.basic.ebx, + ia32_cpu_features.basic.ecx, ia32_cpu_features.basic.edx); + if (cpuid_max >= 7) + __cpuid_count(7, 0, unused_eax, ia32_cpu_features.extended_7.ebx, + ia32_cpu_features.extended_7.ecx, + ia32_cpu_features.extended_7.edx); + } + cpuid_max = __get_cpuid_max(0x80000000, NULL); + if (cpuid_max >= 0x80000001) { + __cpuid_count(0x80000001, 0, unused_eax, unused_ebx, + ia32_cpu_features.extended_80000001.ecx, + ia32_cpu_features.extended_80000001.edx); + if (cpuid_max >= 0x80000007) + __cpuid_count(0x80000007, 0, unused_eax, unused_ebx, + ia32_cpu_features.extended_80000007.ecx, + ia32_cpu_features.extended_80000007.edx); + } + +#elif defined(_MSC_VER) + int info[4]; + __cpuid(info, 0); + unsigned cpuid_max = info[0]; + if (cpuid_max >= 1) { + __cpuidex(info, 1, 0); + ia32_cpu_features.basic.ebx = info[1]; + ia32_cpu_features.basic.ecx = info[2]; + ia32_cpu_features.basic.edx = info[3]; + if (cpuid_max >= 7) { + __cpuidex(info, 7, 0); + ia32_cpu_features.extended_7.ebx = info[1]; + ia32_cpu_features.extended_7.ecx = info[2]; + ia32_cpu_features.extended_7.edx = info[3]; + } + } + + __cpuid(info, 0x80000000); + cpuid_max = info[0]; + if (cpuid_max >= 0x80000001) { + __cpuidex(info, 0x80000001, 0); + ia32_cpu_features.extended_80000001.ecx = info[2]; + ia32_cpu_features.extended_80000001.edx = info[3]; + if (cpuid_max >= 0x80000007) { + __cpuidex(info, 0x80000007, 0); + ia32_cpu_features.extended_80000007.ecx = info[2]; + ia32_cpu_features.extended_80000007.edx = info[3]; + } + } +#else +#error "FIXME: Unsupported compiler" +#endif +} + +#endif /* __ia32__ */ + +/*****************************************************************************/ + +#ifdef __NR_perf_event_open +static int perf_fd, perf_error; +#if defined(__ia32__) +static const struct perf_event_mmap_page volatile *perf_page; +#else +#define perf_page NULL +#endif +static long perf_event_open(struct perf_event_attr *event_attr, pid_t pid, + int cpu, int group_fd, unsigned long flags) { + return syscall(__NR_perf_event_open, event_attr, pid, cpu, group_fd, flags); +} + +static unsigned clock_perf(timestamp_t *now) { + *now = 42; + return read(perf_fd, now, sizeof(timestamp_t)); +} + +static int perf_setup(void) { +#ifdef PR_TASK_PERF_EVENTS_ENABLE + if (prctl(PR_TASK_PERF_EVENTS_ENABLE, 1, 0, 0, 0)) + perror(MERA_PERROR_PREFIX "prctl(PR_TASK_PERF_EVENTS_ENABLE)"); +#endif /* PR_TASK_PERF_EVENTS_ENABLE */ + + struct perf_event_attr attr; + memset(&attr, 0, sizeof(struct perf_event_attr)); + attr.size = sizeof(struct perf_event_attr); + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_CPU_CYCLES; + attr.read_format = PERF_FORMAT_TOTAL_TIME_RUNNING; + attr.disabled = 1; + // attr.pinned = 1; + attr.exclude_kernel = 1; + 
attr.exclude_hv = 1; +#ifndef PERF_FLAG_FD_CLOEXEC /* Since 3.14 */ +#define PERF_FLAG_FD_CLOEXEC 0 +#endif + perf_fd = perf_event_open(&attr, 0 /* current process */, -1 /* any cpu */, + -1 /* no group */, PERF_FLAG_FD_CLOEXEC); + if (perf_fd < 0) { + perf_error = errno; + if (perf_error != EACCES /* will handle later */) + perror(MERA_PERROR_PREFIX "perf_event_open()"); + return -1; + } + +#if defined(__ia32__) + perf_page = (struct perf_event_mmap_page *)mmap( + NULL, getpagesize(), PROT_WRITE | PROT_READ, MAP_SHARED, perf_fd, 0); + if (perf_page == MAP_FAILED) { + perf_error = errno; + perror(MERA_PERROR_PREFIX "mmap(perf_event_mmap_page)"); + perf_page = NULL; + } +#endif /* __ia32__ */ + + if (ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0) /* Start counters */) { + perf_error = errno; + perror(MERA_PERROR_PREFIX "ioctl(PERF_EVENT_IOC_ENABLE)"); + close(perf_fd); + perf_fd = -1; + return -1; + } + perf_error = 0; + return 0; +} + +#if defined(__ia32__) +static unsigned perf_rdpmc_index; +unsigned perf_rdpmc_start(timestamp_t *now) { + compiler_barrier(); + uint32_t low, high; + __asm __volatile("cpuid; mov %2, %%ecx; rdpmc" + : "=a"(low), "=d"(high) + : "m"(perf_rdpmc_index) + : "%ebx", "%ecx"); + union timestamp *u = (union timestamp *)now; + u->u32.l = low; + u->u32.h = high; + return 0; +} + +unsigned perf_rdpmc_finish(timestamp_t *now) { + compiler_barrier(); + uint32_t low, high; + __asm __volatile("mov %2, %%ecx; rdpmc; mov %%eax, %0; mov %%edx, %1; cpuid" + : "=r"(low), "=r"(high) + : "m"(perf_rdpmc_index) + : "%eax", "%ebx", "%ecx", "%edx"); + union timestamp *u = (union timestamp *)now; + u->u32.l = low; + u->u32.h = high; + return 0; +} +#endif /* __ia32__ */ + +#else +#define perf_fd (-1) +#endif /* __NR_perf_event_open */ + +/*****************************************************************************/ + +bool mera_init(void) { + mera.flags = 0; + mera.cpunum = set_single_affinity(); + +#if defined(PR_SET_TSC) && defined(__ia32__) + int tsc_mode = PR_TSC_SIGSEGV; + if (prctl(PR_GET_TSC, &tsc_mode, 0, 0, 0)) + perror(MERA_PERROR_PREFIX "prctl(PR_GET_TSC)"); + else if (tsc_mode != PR_TSC_ENABLE && + prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0)) + perror(MERA_PERROR_PREFIX "prctl(PR_SET_TSC, PR_TSC_ENABLE)"); +#endif /* PR_SET_TSC */ + +#if defined(EMSCRIPTEN) + return probe(clock_emscripten, clock_emscripten, convert_1to1, 0, + "emscripten_get_now()", "ns"); +#endif + +#if defined(TIMEBASE_SZ) || defined(__OS400__) + probe(clock_os400, clock_os400, convert_os400, 0, + "read_wall_time(TIMEBASE_SZ)", "ns"); +#endif + +#if defined(__APPLE__) || defined(__MACH__) + probe(clock_mach, clock_mach, convert_mach, 0, "mach_absolute_time()", "ns"); +#endif + +#if defined(__sun__) || defined(__sun) + probe(clock_solaris, clock_solaris, convert_1to1, 0, "gethrtime()", "ns"); +#endif /* __sun__ */ + +#if defined(CLOCK_SGI_CYCLE) + if (posix_clockid == CLOCK_REALTIME) { + posix_clockid = CLOCK_SGI_CYCLE; + if (!probe(clock_posix, clock_posix, convert_1to1, 0, + "clock_gettime(CLOCK_SGI_CYCLE)", "ns")) + posix_clockid = CLOCK_REALTIME; + } +#endif /* CLOCK_SGI_CYCLE */ +#if defined(CLOCK_MONOTONIC_RAW) + if (posix_clockid == CLOCK_REALTIME) { + posix_clockid = CLOCK_MONOTONIC_RAW; + if (!probe(clock_posix, clock_posix, convert_1to1, 0, + "clock_gettime(CLOCK_MONOTONIC_RAW)", "ns")) + posix_clockid = CLOCK_REALTIME; + } +#endif /* CLOCK_MONOTONIC_RAW */ +#if defined(CLOCK_MONOTONIC) + if (posix_clockid == CLOCK_REALTIME) { + posix_clockid = CLOCK_MONOTONIC; + if (!probe(clock_posix, clock_posix, 
convert_1to1, 0, + "clock_gettime(CLOCK_MONOTONIC)", "ns")) + posix_clockid = CLOCK_REALTIME; + } +#endif /* CLOCK_MONOTONIC */ + +#if defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) + probe(clock_windows, clock_windows, convert_windows, 0, + "QueryPerformanceCounter()", "ns"); +#else + probe(clock_gettimeofday, clock_gettimeofday, convert_us2ns, 0, + "gettimeofday()", "ns"); +#endif /* Windows */ + +/***************************************************************************/ + +#ifndef __native_client__ +#if defined(__elbrus__) || defined(__e2k__) + probe(clock_elbrus, clock_elbrus, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "Elbrus_TSCP", "cycle"); +#endif /* __elbrus__ */ + +#if (defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) || \ + defined(__powerpc64)) + probe(clock_powerpc64_mfspr268, clock_powerpc64_mfspr268, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "MFSPR(268)", "cycle"); +#endif /* __powerpc64__ */ + +#if (defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) || \ + defined(__ppc)) + probe(clock_powerpc_mftb, clock_powerpc_mftb, convert_1to1, + timestamp_clock_cheap | timestamp_ticks, "MFTB", "tick"); +#endif /* __powerpc__ */ + +#if defined(__sparc__) || defined(__sparc) || defined(__sparc64__) || \ + defined(__sparc64) || defined(__sparc_v8plus__) || \ + defined(__sparc_v8plus) || defined(__sparc_v8plusa__) || \ + defined(__sparc_v8plusa) || defined(__sparc_v9__) || defined(__sparc_v9) + probe(clock_sparc, clock_sparc, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "tick_register", "cycle"); +#endif /* __sparc__ */ + +#if defined(__ia64__) || defined(__ia64) + probe(clock_ia64, clock_ia64, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "ITC", "cycle"); +#endif /* __ia64__ */ + +#if (defined(__hppa__) || defined(__hppa) || defined(__hppa64__) || \ + defined(__hppa64)) + probe(clock_hppa, clock_hppa, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "MFCTL(16)", "cycle"); +#endif /* __hppa__ */ + +#if defined(__s390__) || defined(__s390) + probe(clock_stcke, clock_stcke, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "STCKE", "cycle"); + probe(clock_stckf, clock_stckf, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "STCKF", "cycle"); + probe(clock_stck, clock_stck, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "STCK", "cycle"); +#endif /* __s390__ */ + +#if defined(__alpha__) || defined(__alpha) + probe(clock_alpha, clock_alpha, convert_1to1, + timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable, + "RPCC", "cycle"); +#endif /* __alpha__ */ + +#if (defined(__ARM_ARCH) && __ARM_ARCH > 5 && __ARM_ARCH < 8) || defined(_M_ARM) + /* Read the user mode perf monitor counter access permissions. */ + uint32_t pmuseren; +#ifdef _M_ARM + pmuseren = _MoveFromCoprocessor(15, 0, 9, 14, 0); +#else + __asm("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); +#endif + if (1 & pmuseren /* Is it allowed for user mode code? */) { + uint32_t pmcntenset; +#ifdef _M_ARM + pmcntenset = _MoveFromCoprocessor(15, 0, 9, 12, 1); +#else + __asm("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); +#endif + if (pmcntenset & 0x80000000ul /* Is it counting? 
*/) + probe(clock_pmccntr, clock_pmccntr, convert_pmccntr_x64, + timestamp_clock_stable | timestamp_cycles | timestamp_clock_cheap, + "PMCCNTR", "clk"); + else { + printf(" - suggest enable performance-counter\n"); + } + } else { + printf(" - suggest enable access to performance-counters from user-mode\n"); + } +#endif /* (__ARM_ARCH > 5 && __ARM_ARCH < 8) || _M_ARM */ + +#if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 7) || \ + defined(_M_ARM64) + /* System timer of ARMv8 runs at a different frequency than the CPU's. + * The frequency is fixed, typically in the range 1-50MHz. It can be + * read at CNTFRQ special register. We assume the OS has set up + * the virtual timer properly. */ + probe(clock_cntvct_el0, clock_cntvct_el0, convert_1to1, + timestamp_clock_stable | timestamp_ticks | timestamp_clock_cheap, + "CNTVCT_EL0", "tick"); +#endif /* __aarch64__ || __ARM_ARCH > 7 || _M_ARM64 */ + +#if defined(__mips__) || defined(__mips) + +#if (defined(_MIPS_ISA) && defined(_MIPS_ISA_MIPS2) && \ + _MIPS_ISA >= _MIPS_ISA_MIPS2) || \ + (defined(__mips) && __mips >= 2 && __mips < 16) || defined(_R4000) || \ + defined(__MIPS_ISA2) || defined(__MIPS_ISA3) || defined(__MIPS_ISA4) || \ + (defined(__mips_isa_rev) && __mips_isa_rev >= 2) + + probe(clock_mfc0_9_0, clock_mfc0_9_0, convert_1to1, + timestamp_clock_stable | timestamp_clock_cheap | timestamp_cycles, + "MFC0(9.0)", "cycle"); + + if (probe(clock_rdhwr, clock_rdhwr, convert_rdhwr, + timestamp_clock_stable | timestamp_clock_cheap | timestamp_cycles, + "RDHWR(2)", "cycle")) { + unsigned rdhwr_3; + __asm("rdhwr %0, $3" : "=r"(rdhwr_3)); + mips_rdhwr_resolution = rdhwr_3; + if (mips_rdhwr_resolution < 2) + mera.convert = convert_1to1; + } + + probe(clock_mfc0_25_1, clock_mfc0_25_1, convert_1to1, + timestamp_clock_stable | timestamp_clock_cheap | timestamp_cycles, + "MFC0(25.1)", "cycle"); +#endif /* MIPS >= 2 */ + +#if defined(PROT_READ) && defined(MAP_SHARED) + int mem_fd = open("/dev/mem", O_RDONLY | O_SYNC, 0); + + if (mem_fd < 0) + if (errno == EACCES) + printf(" - suggest run from super-user for access to /dev/mem " + "(MIPS_ZBUS_TIMER)\n"); + else + perror(MERA_PERROR_PREFIX "open(/dev/mem)"); + else { + mips_tsc_addr = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, mem_fd, + 0x10030000 /* MIPS_ZBUS_TIMER */); + if (mips_tsc_addr == MAP_FAILED) { + perror(MERA_PERROR_PREFIX "mmap(MIPS_ZBUS_TIMER)"); + close(mem_fd); + } else { + close(mem_fd); + if (!probe(clock_zbustimer, clock_zbustimer, convert_1to1, + timestamp_clock_stable | timestamp_clock_cheap | + timestamp_ticks, + "ZBUS-Timer(0x10030000)", "tick")) { + + munmap((void *)mips_tsc_addr, getpagesize()); + mips_tsc_addr = NULL; + } + } + } +#endif /* PROT_READ && MAP_SHARED */ + +#endif /* __mips__ */ + +#if defined(__ia32__) + if (ia32_cpu_features.basic.edx == 0) + ia32_fetch_cpu_features(); + if (ia32_cpu_features.basic.edx & (1 << 4)) { + probe(clock_rdpmc_start, clock_rdpmc_finish, convert_1to1, + timestamp_clock_stable | timestamp_clock_cheap | timestamp_cycles, + "RDPMC_40000001", "cycle"); + const unsigned tsc_flags = + (ia32_cpu_features.extended_80000007.edx & (1 << 8)) + /* The TSC rate is invariant, i.e. not always on CPU frequency ! */ + ?
timestamp_clock_cheap | timestamp_cycles + : timestamp_clock_cheap | timestamp_cycles | timestamp_clock_stable; + +#ifdef F_OK + if (!(tsc_flags & timestamp_clock_stable) && + !(mera.flags & timestamp_clock_stable) && + access("/sys/devices/cpu/rdpmc", F_OK) == 0) { + printf(" - suggest enable rdpmc for usermode (echo 2 | sudo tee " + "/sys/devices/cpu/rdpmc)\n"); + } +#endif /* F_OK */ + + if (ia32_cpu_features.extended_80000001.edx & (1 << 27)) + probe(clock_rdtscp_start, clock_rdtscp_finish, convert_1to1, tsc_flags, + "RDTSCP", NULL); + probe(clock_rdtsc_start, clock_rdtsc_finish, convert_1to1, tsc_flags, + "RDTSC", NULL); + } +#endif /* __ia32__ */ + +#if __NR_perf_event_open + if (perf_setup() == 0) { + bool perf_used = probe(clock_perf, clock_perf, convert_1to1, + timestamp_cycles | timestamp_clock_stable, + "PERF_COUNT_HW_CPU_CYCLES", "cycle"); +#if defined(__ia32__) + if (perf_page) { + bool perf_used_page = false; + if (perf_page->cap_bit0_is_deprecated && perf_page->cap_user_rdpmc && + perf_page->index) { + perf_rdpmc_index = perf_page->index - 1; + perf_used_page = probe( + perf_rdpmc_start, perf_rdpmc_finish, convert_1to1, + timestamp_clock_stable | timestamp_clock_cheap | timestamp_cycles, + "RDPMC_perf", "cycle"); + } + if (perf_used_page) + perf_used = true; + else { + munmap((void *)perf_page, getpagesize()); + perf_page = NULL; + } + } +#endif /* __ia32__ */ + if (!perf_used) { + close(perf_fd); + perf_fd = -1; + } + } else if (!(mera.flags & timestamp_clock_stable) && perf_error == EACCES && + access("/proc/sys/kernel/perf_event_paranoid", F_OK) == 0) { + printf(" - suggest enable perf for non-admin users (echo 2 | sudo tee " + "/proc/sys/kernel/perf_event_paranoid)\n"); + } +#endif /* __NR_perf_event_open */ + +#endif /* ! __native_client__ */ + return (mera.flags & timestamp_clock_have) ? 
true : false; +} + +static unsigned fuse_timestamp(timestamp_t *unused) { + (void)unused; + abort(); + return 0; +} + +static double fuse_convert(timestamp_t unused) { + (void)unused; + abort(); + return 0; +} + +mera_t mera = { + fuse_timestamp, fuse_timestamp, fuse_convert, "void", "none", 0, -1}; + +/*****************************************************************************/ + +mera_bci_t mera_bci; + +double mera_bench(MERA_BENCH_TARGET target, MERA_BENCH_SELF_ARGS) { + const time_t timeout_fuse = time(NULL); + unsigned target_loops = 1; + unsigned retry_count = 0, restart_count = 0; + + timestamp_t overhead_best = INT64_MAX; + timestamp_t overhead_gate = 0; + unsigned overhead_loops_max = 0; + + restart_count -= 1; +restart_top:; + timestamp_t overhead_sum = 0; + unsigned overhead_total_count = 0; + unsigned overhead_best_count = 1; + unsigned overhead_worthless_loops = 0; + unsigned overhead_accounted_loops = 0; + +restart_middle:; + timestamp_t target_best = INT64_MAX; + timestamp_t target_gate = 0; + unsigned tail_loops_max = 0; + +restart_bottom:; + timestamp_t target_brutto_sum = 0; + unsigned target_overhead_count = 0; + unsigned target_best_count = 1; + unsigned target_total_count = 0; + unsigned target_worthless_loops = 0; + unsigned target_accounted_loops = 0; + unsigned stable = 0; + restart_count += 1; + + retry_count -= 1; +retry: + retry_count += 1; + + while (true) { + /* measure the overhead of measurement */ + unsigned coreid; + { + /* wait for edge of tick */ + timestamp_t snap, start, finish; + coreid = mera.start(&snap); + do { + if (unlikely(coreid != mera.start(&start) || snap > start)) + goto retry; + } while (snap == start); + + /* first iteration */ + unsigned loops = 1; + if (unlikely(coreid != mera.finish(&finish) || start > finish)) + goto retry; + + /* loop until end of tick */ + while (start == finish) { + loops += 1; + if (unlikely(coreid != mera.start(&snap) || start > snap)) + goto retry; + if (unlikely(coreid != mera.finish(&finish) || snap > finish)) + goto retry; + } + const timestamp_t elapsed = finish - start; + if (unlikely(overhead_best > elapsed || overhead_loops_max < loops)) { + if (overhead_best > elapsed) { + overhead_gate = overhead_best + (overhead_best - elapsed + 1) / 2; + if (overhead_gate > elapsed * 129 / 128) + overhead_gate = elapsed * 129 / 128; + if (overhead_gate < elapsed * 1025 / 1024 + 1) + overhead_gate = elapsed * 1025 / 1024 + 1; + overhead_best = elapsed; + } + overhead_loops_max = + (overhead_loops_max > loops) ? 
overhead_loops_max : loops; + goto restart_top; + } else if (likely(elapsed <= overhead_gate && + loops + 1 >= overhead_loops_max)) { + if (elapsed == overhead_best && loops == overhead_loops_max) + overhead_best_count += 1; + overhead_sum += elapsed; + overhead_total_count += loops; + overhead_accounted_loops += 1; + } else { + overhead_worthless_loops += 1; + } + } + + /* measure the target */ + if (target) { + /* wait for edge of tick */ + timestamp_t snap, start, finish; + if (unlikely(coreid != mera.start(&snap))) + goto retry; + do { + if (unlikely(coreid != mera.start(&start) || snap > start)) + goto retry; + } while (snap == start); + + unsigned loops = 0; + do + target(MERA_BENCH_TARGET_ARGS); + while (++loops < target_loops); + + loops = 1; + if (unlikely(coreid != mera.finish(&finish) || snap > finish)) + goto retry; + + /* wait for next tick */ + while (true) { + if (unlikely(coreid != mera.start(&snap) || finish > snap)) + goto retry; + if (finish != snap) + break; + if (unlikely(coreid != mera.finish(&snap) || finish > snap)) + goto retry; + if (finish != snap) + break; + loops += 1; + } + + const timestamp_t elapsed = finish - start; + if (unlikely(target_best > elapsed || + (target_best == elapsed && tail_loops_max < loops))) { + if (target_best > elapsed) { + target_gate = target_best + (target_best - elapsed + 1) / 2; + if (target_gate > elapsed * 129 / 128) + target_gate = elapsed * 129 / 128; + if (target_gate < elapsed * 1025 / 1024 + 1) + target_gate = elapsed * 1025 / 1024 + 1; + target_best = elapsed; + } + tail_loops_max = loops; + goto restart_bottom; + } else if (likely(elapsed <= target_gate && + (tail_loops_max - loops /* overflow is ok */) < 2)) { + if (elapsed == target_best && loops == tail_loops_max) + target_best_count += 1; + target_total_count += target_loops; + target_brutto_sum += elapsed; + target_overhead_count += loops; + target_accounted_loops += 1; + } else { + target_worthless_loops += 1; + } + } + + /* checkpoint */ + if (unlikely((++stable & 1023) == 0)) { + if (target) { + const timestamp_t wanna = 1042 + overhead_best * overhead_loops_max; + if (target_best < wanna) { + target_loops += target_loops; + goto restart_middle; + } + if (target_loops > 1 && target_best > wanna * 4) { + target_loops >>= 1; + goto restart_middle; + } + } + + const unsigned enough4fuse_seconds = 9; + const unsigned enough4best = + (mera.flags & timestamp_clock_stable) ? 499 : 1999; + const unsigned enough4avg = + (mera.flags & timestamp_clock_stable) ? 4999 : 29999; + const unsigned enough4bailout = + (mera.flags & timestamp_clock_cheap) ? 
99999 : 59999; + + const unsigned spent_seconds = (unsigned)(time(NULL) - timeout_fuse); + + const bool enough4overhead = overhead_best_count > enough4best || + overhead_accounted_loops > enough4avg || + overhead_worthless_loops > enough4bailout || + spent_seconds > enough4fuse_seconds; + + const bool enough4target = target_best_count > enough4best || + target_accounted_loops > enough4avg || + target_worthless_loops > enough4bailout || + spent_seconds > enough4fuse_seconds; + + /* calculate results */ + if (enough4overhead && (!target || enough4target)) { + memset(&mera_bci, 0, sizeof(mera_bci)); + mera_bci.retry_count = retry_count; + mera_bci.restart_count = restart_count; + mera_bci.spent_seconds = spent_seconds + 1; + + mera_bci.overhead_best = overhead_best; + mera_bci.overhead_gate = overhead_gate; + mera_bci.overhead_loops_max = overhead_loops_max; + mera_bci.overhead_best_count = overhead_best_count; + mera_bci.overhead_accounted_loops = overhead_accounted_loops; + mera_bci.overhead_worthless_loops = overhead_worthless_loops; + + const double measured_overhead = + (overhead_best_count > 2 || overhead_total_count < enough4avg / 2) + ? mera.convert(overhead_best) / overhead_loops_max + : mera.convert(overhead_sum) / overhead_total_count; + if (!target) + return measured_overhead; + + mera_bci.target_loops = target_loops; + mera_bci.target_best = target_best; + mera_bci.target_gate = target_gate; + mera_bci.tail_loops_max = tail_loops_max; + mera_bci.target_best_count = target_best_count; + mera_bci.target_accounted_loops = target_accounted_loops; + mera_bci.target_worthless_loops = target_worthless_loops; + + const double measured_target = + (target_best_count > 2 || target_total_count < enough4avg / 2) + ? (mera.convert(target_best) - + measured_overhead * tail_loops_max) / + target_loops + : (mera.convert(target_brutto_sum) - + measured_overhead * target_overhead_count) / + target_total_count; + return measured_target; + } + } + } +} diff --git a/tests/mera.h b/tests/mera.h new file mode 100644 index 0000000..fcf3d87 --- /dev/null +++ b/tests/mera.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev , + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#pragma once + +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable : 4127) /* conditional expression is constant */ +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' 
*/ +#endif +#endif /* MSVC */ + +#include "../t1ha.h" /* for byteorder and common __ia32__ */ + +/*****************************************************************************/ + +typedef uint64_t timestamp_t; + +enum mera_flags { + timestamp_clock_have = 1u << 0, + timestamp_clock_cheap = 1u << 1, + timestamp_ticks = 1u << 2, + timestamp_cycles = 1u << 3, + timestamp_clock_stable = 1u << 4, +}; + +typedef struct { + unsigned (*start)(timestamp_t *); + unsigned (*finish)(timestamp_t *); + double (*convert)(timestamp_t); + const char *units; + const char *source; + unsigned flags; + int cpunum; +} mera_t; + +extern mera_t mera; +bool mera_init(void); + +typedef struct { + unsigned retry_count, restart_count; + unsigned overhead_loops_max, overhead_best_count, overhead_accounted_loops, + overhead_worthless_loops; + uint64_t overhead_best, overhead_gate; + + unsigned target_loops, tail_loops_max, target_best_count, + target_accounted_loops, target_worthless_loops, spent_seconds; + uint64_t target_best, target_gate; +} mera_bci_t /* bci = Bench Convergence Info */; + +extern mera_bci_t mera_bci; +typedef uint64_t (*mera_bench_target_t)(const void *data, size_t bytes, + uint64_t seed); + +#define MERA_BENCH_TARGET mera_bench_target_t +#define MERA_BENCH_SELF_ARGS const void *data, size_t bytes, uint64_t seed +#define MERA_BENCH_TARGET_ARGS data, bytes, seed +#define MERA_PERROR_PREFIX " - " + +double mera_bench(MERA_BENCH_TARGET target, MERA_BENCH_SELF_ARGS); + +/*****************************************************************************/ + +#if defined(__ia32__) +typedef struct _ia32_cpu_features { + struct { + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + } basic /* https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */, + extended_7 /* https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features */; + + struct { + uint32_t ecx; + uint32_t edx; + } extended_80000001 /* https://en.wikipedia.org/wiki/CPUID#EAX=80000001h:_Extended_Processor_Info_and_Feature_Bits */; + + struct { + uint32_t ecx; + uint32_t edx; + } extended_80000007 /* Advanced Power Management Information */; + +} ia32_cpu_features_t; + +extern ia32_cpu_features_t ia32_cpu_features; +void ia32_fetch_cpu_features(void); +#endif /* __ia32__ */ diff --git a/tests/test.c b/tests/test.c new file mode 100644 index 0000000..0c23904 --- /dev/null +++ b/tests/test.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev , + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#include "common.h" +#include + +#if defined(_MSC_VER) +#if _MSC_VER < 1900 +#define snprintf _snprintf +#pragma warning(disable : 4996) /* '_snprintf': This function or variable \ + may be unsafe */ +#endif +#endif /* MSVC */ + +/*****************************************************************************/ + +/* *INDENT-OFF* */ +/* clang-format off */ +static const uint8_t pattern[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0x7F, 0x3F, 0x1F, 0xF, 8, 16, 32, 64, 0x80, 0xFE, + 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x55, 0xAA, + 11, 17, 19, 23, 29, 37, 42, 43, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', + 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x' +}; + +const uint64_t refval_2atonce[81] = { 0, + 0x772C7311BE32FF42, 0x444753D23F207E03, 0x71F6DF5DA3B4F532, 0x555859635365F660, + 0xE98808F1CD39C626, 0x2EB18FAF2163BB09, 0x7B9DD892C8019C87, 0xE2B1431C4DA4D15A, + 0x1984E718A5477F70, 0x08DD17B266484F79, 0x4C83A05D766AD550, 0x92DCEBB131D1907D, + 0xD67BC6FC881B8549, 0xF6A9886555FBF66B, 0x6E31616D7F33E25E, 0x36E31B7426E3049D, + 0x4F8E4FAF46A13F5F, 0x03EB0CB3253F819F, 0x636A7769905770D2, 0x3ADF3781D16D1148, + 0x92D19CB1818BC9C2, 0x283E68F4D459C533, 0xFA83A8A88DECAA04, 0x8C6F00368EAC538C, + 0x7B66B0CF3797B322, 0x5131E122FDABA3FF, 0x6E59FF515C08C7A9, 0xBA2C5269B2C377B0, + 0xA9D24FD368FE8A2B, 0x22DB13D32E33E891, 0x7B97DFC804B876E5, 0xC598BDFCD0E834F9, + 0xB256163D3687F5A7, 0x66D7A73C6AEF50B3, 0x25A7201C85D9E2A3, 0x911573EDA15299AA, + 0x5C0062B669E18E4C, 0x17734ADE08D54E28, 0xFFF036E33883F43B, 0xFE0756E7777DF11E, + 0x37972472D023F129, 0x6CFCE201B55C7F57, 0xE019D1D89F02B3E1, 0xAE5CC580FA1BB7E6, + 0x295695FB7E59FC3A, 0x76B6C820A40DD35E, 0xB1680A1768462B17, 0x2FB6AF279137DADA, + 0x28FB6B4366C78535, 0xEC278E53924541B1, 0x164F8AAB8A2A28B5, 0xB6C330AEAC4578AD, + 0x7F6F371070085084, 0x94DEAD60C0F448D3, 0x99737AC232C559EF, 0x6F54A6F9CA8EDD57, + 0x979B01E926BFCE0C, 0xF7D20BC85439C5B4, 0x64EDB27CD8087C12, 0x11488DE5F79C0BE2, + 0x25541DDD1680B5A4, 0x8B633D33BE9D1973, 0x404A3113ACF7F6C6, 0xC59DBDEF8550CD56, + 0x039D23C68F4F992C, 0x5BBB48E4BDD6FD86, 0x41E312248780DF5A, 0xD34791CE75D4E94F, + 0xED523E5D04DCDCFF, 0x7A6BCE0B6182D879, 0x21FB37483CAC28D8, 0x19A1B66E8DA878AD, + 0x6F804C5295B09ABE, 0x2A4BE5014115BA81, 0xA678ECC5FC924BE0, 0x50F7A54A99A36F59, + 0x0FD7E63A39A66452, 0x5AB1B213DD29C4E4, 0xF3ED80D9DF6534C5, 0xC736B12EF90615FD +}; + +const uint64_t refval_2atonce128[81] = { 0x4EC7F6A48E33B00A, + 0xB7B7FAA5BD7D8C1E, 0x3269533F66534A76, 0x6C3EC6B687923BFC, 0xC096F5E7EFA471A9, + 0x79D8AFB550CEA471, 0xCEE0507A20FD5119, 0xFB04CFFC14A9F4BF, 0xBD4406E923807AF2, + 0x375C02FF11010491, 0xA6EA4C2A59E173FF, 0xE0A606F0002CADDF, 0xE13BEAE6EBC07897, + 0xF069C2463E48EA10, 0x75BEE1A97089B5FA, 0x378F22F8DE0B8085, 0x9C726FC4D53D0D8B, + 0x71F6130A2D08F788, 0x7A9B20433FF6CF69, 0xFF49B7CD59BF6D61, 0xCCAAEE0D1CA9C6B3, + 0xC77889D86039D2AD, 0x7B378B5BEA9B0475, 0x6520BFA79D59AD66, 0x2441490CB8A37267, + 0xA715A66B7D5CF473, 0x9AE892C88334FD67, 0xD2FFE9AEC1D2169A, 0x790B993F18B18CBB, + 0xA0D02FBCF6A7B1AD, 0xA90833E6F151D0C1, 0x1AC7AFA37BD79BE0, 0xD5383628B2881A24, + 0xE5526F9D63F9F8F1, 0xC1F165A01A6D1F4D, 0x6CCEF8FF3FCFA3F2, 0x2030F18325E6DF48, + 0x289207230E3FB17A, 0x077B66F713A3C4B9, 0x9F39843CAF871754, 0x512FDA0F808ACCF3, + 0xF4D9801CD0CD1F14, 0x28A0C749ED323638, 0x94844CAFA671F01C, 0xD0E261876B8ACA51, + 0x8FC2A648A4792EA2, 0x8EF87282136AF5FE, 0x5FE6A54A9FBA6B40, 0xA3CC5B8FE6223D54, + 0xA8C3C0DD651BB01C, 0x625E9FDD534716F3, 0x1AB2604083C33AC5, 0xDE098853F8692F12, + 
0x4B0813891BD87624, 0x4AB89C4553D182AD, 0x92C15AA2A3C27ADA, 0xFF2918D68191F5D9, + 0x06363174F641C325, 0x667112ADA74A2059, 0x4BD605D6B5E53D7D, 0xF2512C53663A14C8, + 0x21857BCB1852667C, 0xAFBEBD0369AEE228, 0x7049340E48FBFD6B, 0x50710E1924F46954, + 0x869A75E04A976A3F, 0x5A41ABBDD6373889, 0xA781778389B4B188, 0x21A3AFCED6C925B6, + 0x107226192EC10B42, 0x62A862E84EC2F9B1, 0x2B15E91659606DD7, 0x613934D1F9EC5A42, + 0x4DC3A96DC5361BAF, 0xC80BBA4CB5F12903, 0x3E3EDAE99A7D6987, 0x8F97B2D55941DCB0, + 0x4C9787364C3E4EC1, 0xEF0A2D07BEA90CA7, 0x5FABF32C70AEEAFB, 0x3356A5CFA8F23BF4 +}; + +const uint64_t refval_2stream[81] = { 0x3C8426E33CB41606, + 0xFD74BE70EE73E617, 0xF43DE3CDD8A20486, 0x882FBCB37E8EA3BB, 0x1AA2CDD34CAA3D4B, + 0xEE755B2BFAE07ED5, 0xD4E225250D92E213, 0xA09B49083205965B, 0xD47B21724EF9EC9E, + 0xAC888FC3858CEE11, 0x94F820D85736F244, 0x1707951CCA920932, 0x8E0E45603F7877F0, + 0x9FD2592C0E3A7212, 0x9A66370F3AE3D427, 0xD33382D2161DE2B7, 0x9A35BE079DA7115F, + 0x73457C7FF58B4EC3, 0xBE8610BD53D7CE98, 0x65506DFE5CCD5371, 0x286A321AF9D5D9FA, + 0xB81EF9A7EF3C536D, 0x2CFDB5E6825C6E86, 0xB2A58CBFDFDD303A, 0xD26094A42B950635, + 0xA34D666A5F02AD9A, 0x0151E013EBCC72E5, 0x9254A6EA7FCB6BB5, 0x10C9361B3869DC2B, + 0xD7EC55A060606276, 0xA2FF7F8BF8976FFD, 0xB5181BB6852DCC88, 0x0EE394BB6178BAFF, + 0x3A8B4B400D21B89C, 0xEC270461970960FD, 0x615967FAB053877E, 0xFA51BF1CFEB4714C, + 0x29FDA8383070F375, 0xC3B663061BC52EDA, 0x192BBAF1F1A57923, 0x6D193B52F93C53AF, + 0x7F6F5639FE87CA1E, 0x69F7F9140B32EDC8, 0xD0F2416FB24325B6, 0x62C0E37FEDD49FF3, + 0x57866A4B809D373D, 0x9848D24BD935E137, 0xDFC905B66734D50A, 0x9A938DD194A68529, + 0x8276C44DF0625228, 0xA4B35D00AD67C0AB, 0x3D9CB359842DB452, 0x4241BFA8C23B267F, + 0x650FA517BEF15952, 0x782DE2ABD8C7B1E1, 0x4EAE456166CA3E15, 0x40CDF3A02614E337, + 0xAD84092C46102172, 0x0C68479B03F9A167, 0x7E1BA046749E181C, 0x3F3AB41A697382C1, + 0xC5E5DD6586EBFDC4, 0xFF926CD4EB02555C, 0x035CFE67F89E709B, 0x89F06AB6464A1B9D, + 0x8EFF58F3F7DEA758, 0x8B54AC657902089F, 0xC6C4F1F9F8DA4D64, 0xBDB729048AAAC93A, + 0xEA76BA628F5E5CD6, 0x742159B728B8A979, 0x6D151CD3C720E53D, 0xE97FFF9368FCDC42, + 0xCA5B38314914FBDA, 0xDD92C91D8B858EAE, 0x66E5F07CF647CBF2, 0xD4CF9B42F4985AFB, + 0x72AE17AC7D92F6B7, 0xB8206B22AB0472E1, 0x385876B5CFD42479, 0x03294A249EBE6B26 +}; + +const uint64_t refval_2stream128[81] = { 0xCD2801D3B92237D6, + 0x10E4D47BD821546D, 0x9100704B9D65CD06, 0xD6951CB4016313EF, 0x24DB636F96F474DA, + 0x3F4AF7DF3C49E422, 0xBFF25B8AF143459B, 0xA157EC13538BE549, 0xD3F5F52C47DBD419, + 0x0EF3D7D735AF1575, 0x46B7B892823F7B1B, 0xEE22EA4655213289, 0x56AD76F02FE929BC, + 0x9CF6CD1AC886546E, 0xAF45CE47AEA0B933, 0x535F9DC09F3996B7, 0x1F0C3C01694AE128, + 0x18495069BE0766F7, 0x37E5FFB3D72A4CB1, 0x6D6C2E9299F30709, 0x4F39E693F50B41E3, + 0xB11FC4EF0658E116, 0x48BFAACB78E5079B, 0xE1B4C89C781B3AD0, 0x81D2F34888D333A1, + 0xF6D02270D2EA449C, 0xC884C3C2C3CE1503, 0x711AE16BA157A9B9, 0x1E6140C642558C9D, + 0x35AB3D238F5DC55B, 0x33F07B6AEF051177, 0xE57336776EEFA71C, 0x6D445F8318BA3752, + 0xD4F5F6631934C988, 0xD5E260085727C4A2, 0x5B54B41EC180B4FA, 0x7F5D75769C15A898, + 0xAE5A6DB850CA33C6, 0x038CCB8044663403, 0xDA16310133DC92B8, 0x6A2FFB7AB2B7CE2B, + 0xDC1832D9229BAE20, 0x8C62C479F5ABC9E4, 0x5EB7B617857C9CCB, 0xB79CF7D749A1E80D, + 0xDE7FAC3798324FD3, 0x8178911813685D06, 0x6A726CBD394D4410, 0x6CBE6B3280DA1113, + 0x6829BA4410CF1148, 0xFA7E417EB26C5BC6, 0x22ED87884D6E3A49, 0x15F1472D5115669D, + 0x2EA0B4C8BF69D318, 0xDFE87070AA545503, 0x6B4C14B5F7144AB9, 0xC1ED49C06126551A, + 0x351919FC425C3899, 0x7B569C0FA6F1BD3E, 0x713AC2350844CFFD, 
0xE9367F9A638C2FF3, + 0x97F17D325AEA0786, 0xBCB907CC6CF75F91, 0x0CB7517DAF247719, 0xBE16093CC45BE8A9, + 0x786EEE97359AD6AB, 0xB7AFA4F326B97E78, 0x2694B67FE23E502E, 0x4CB492826E98E0B4, + 0x838D119F74A416C7, 0x70D6A91E4E5677FD, 0xF3E4027AD30000E6, 0x9BDF692795807F77, + 0x6A371F966E034A54, 0x8789CF41AE4D67EF, 0x02688755484D60AE, 0xD5834B3A4BF5CE42, + 0x9405FC61440DE25D, 0x35EB280A157979B6, 0x48D40D6A525297AC, 0x6A87DC185054BADA +}; + +const uint64_t refval_64le[81] = { 0, + 0x6A580668D6048674, 0xA2FE904AFF0D0879, 0xE3AB9C06FAF4D023, 0x6AF1C60874C95442, + 0xB3557E561A6C5D82, 0x0AE73C696F3D37C0, 0x5EF25F7062324941, 0x9B784F3B4CE6AF33, + 0x6993BB206A74F070, 0xF1E95DF109076C4C, 0x4E1EB70C58E48540, 0x5FDD7649D8EC44E4, + 0x559122C706343421, 0x380133D58665E93D, 0x9CE74296C8C55AE4, 0x3556F9A5757AB6D0, + 0xF62751F7F25C469E, 0x851EEC67F6516D94, 0xED463EE3848A8695, 0xDC8791FEFF8ED3AC, + 0x2569C744E1A282CF, 0xF90EB7C1D70A80B9, 0x68DFA6A1B8050A4C, 0x94CCA5E8210D2134, + 0xF5CC0BEABC259F52, 0x40DBC1F51618FDA7, 0x0807945BF0FB52C6, 0xE5EF7E09DE70848D, + 0x63E1DF35FEBE994A, 0x2025E73769720D5A, 0xAD6120B2B8A152E1, 0x2A71D9F13959F2B7, + 0x8A20849A27C32548, 0x0BCBC9FE3B57884E, 0x0E028D255667AEAD, 0xBE66DAD3043AB694, + 0xB00E4C1238F9E2D4, 0x5C54BDE5AE280E82, 0x0E22B86754BC3BC4, 0x016707EBF858B84D, + 0x990015FBC9E095EE, 0x8B9AF0A3E71F042F, 0x6AA56E88BD380564, 0xAACE57113E681A0F, + 0x19F81514AFA9A22D, 0x80DABA3D62BEAC79, 0x715210412CABBF46, 0xD8FA0B9E9D6AA93F, + 0x6C2FC5A4109FD3A2, 0x5B3E60EEB51DDCD8, 0x0A7C717017756FE7, 0xA73773805CA31934, + 0x4DBD6BB7A31E85FD, 0x24F619D3D5BC2DB4, 0x3E4AF35A1678D636, 0x84A1A8DF8D609239, + 0x359C862CD3BE4FCD, 0xCF3A39F5C27DC125, 0xC0FF62F8FD5F4C77, 0x5E9F2493DDAA166C, + 0x17424152BE1CA266, 0xA78AFA5AB4BBE0CD, 0x7BFB2E2CEF118346, 0x647C3E0FF3E3D241, + 0x0352E4055C13242E, 0x6F42FC70EB660E38, 0x0BEBAD4FABF523BA, 0x9269F4214414D61D, + 0x1CA8760277E6006C, 0x7BAD25A859D87B5D, 0xAD645ADCF7414F1D, 0xB07F517E88D7AFB3, + 0xB321C06FB5FFAB5C, 0xD50F162A1EFDD844, 0x1DFD3D1924FBE319, 0xDFAEAB2F09EF7E78, + 0xA7603B5AF07A0B1E, 0x41CD044C0E5A4EE3, 0xF64D2F86E813BF33, 0xFF9FDB99305EB06A +}; + +const uint64_t refval_64be[81] = { 0, + 0x6A580668D6048674, 0xDECC975A0E3B8177, 0xE3AB9C06FAF4D023, 0xE401FA8F1B6AF969, + 0x67DB1DAE56FB94E3, 0x1106266A09B7A073, 0x550339B1EF2C7BBB, 0x290A2BAF590045BB, + 0xA182C1258C09F54A, 0x137D53C34BE7143A, 0xF6D2B69C6F42BEDC, 0x39643EAF2CA2E4B4, + 0x22A81F139A2C9559, 0x5B3D6AEF0AF33807, 0x56E3F80A68643C08, 0x9E423BE502378780, + 0xCDB0986F9A5B2FD5, 0xD5B3C84E7933293F, 0xE5FB8C90399E9742, 0x5D393C1F77B2CF3D, + 0xC8C82F5B2FF09266, 0xACA0230CA6F7B593, 0xCB5805E2960D1655, 0x7E2AD5B704D77C95, + 0xC5E903CDB8B9EB5D, 0x4CC7D0D21CC03511, 0x8385DF382CFB3E93, 0xF17699D0564D348A, + 0xF77EE7F8274A4C8D, 0xB9D8CEE48903BABE, 0xFE0EBD2A82B9CFE9, 0xB49FB6397270F565, + 0x173735C8C342108E, 0xA37C7FBBEEC0A2EA, 0xC13F66F462BB0B6E, 0x0C04F3C2B551467E, + 0x76A9CB156810C96E, 0x2038850919B0B151, 0xCEA19F2B6EED647B, 0x6746656D2FA109A4, + 0xF05137F221007F37, 0x892FA9E13A3B4948, 0x4D57B70D37548A32, 0x1A7CFB3D566580E6, + 0x7CB30272A45E3FAC, 0x137CCFFD9D51423F, 0xB87D96F3B82DF266, 0x33349AEE7472ED37, + 0x5CC0D3C99555BC07, 0x4A8F4FA196D964EF, 0xE82A0D64F281FBFA, 0x38A1BAC2C36823E1, + 0x77D197C239FD737E, 0xFB07746B4E07DF26, 0xC8A2198E967672BD, 0x5F1A146D143FA05A, + 0x26B877A1201AB7AC, 0x74E5B145214723F8, 0xE9CE10E3C70254BC, 0x299393A0C05B79E8, + 0xFD2D2B9822A5E7E2, 0x85424FEA50C8E50A, 0xE6839E714B1FFFE5, 0x27971CCB46F9112A, + 0xC98695A2E0715AA9, 0x338E1CBB4F858226, 0xFC6B5C5CF7A8D806, 0x8973CAADDE8DA50C, + 
0x9C6D47AE32EBAE72, 0x1EBF1F9F21D26D78, 0x80A9704B8E153859, 0x6AFD20A939F141FB, + 0xC35F6C2B3B553EEF, 0x59529E8B0DC94C1A, 0x1569DF036EBC4FA1, 0xDA32B88593C118F9, + 0xF01E4155FF5A5660, 0x765A2522DCE2B185, 0xCEE95554128073EF, 0x60F072A5CA51DE2F +}; + +const uint64_t refval_32le[81] = { 0, + 0xC92229C10FAEA50E, 0x3DF1354B0DFDC443, 0x968F016D60417BB3, 0x85AAFB50C6DA770F, + 0x66CCE3BB6842C7D6, 0xDDAA39C11537C226, 0x35958D281F0C9C8C, 0x8C5D64B091DE608E, + 0x4094DF680D39786B, 0x1014F4AA2A2EDF4D, 0x39D21891615AA310, 0x7EF51F67C398C7C4, + 0x06163990DDBF319D, 0xE229CAA00C8D6F3F, 0xD2240B4B0D54E0F5, 0xEA2E7E905DDEAF94, + 0x8D4F8A887183A5CE, 0x44337F9A63C5820C, 0x94938D1E86A9B797, 0x96E9CABA5CA210CC, + 0x6EFBB9CC9E8F7708, 0x3D12EA0282FB8BBC, 0x5DA781EE205A2C48, 0xFA4A51A12677FE12, + 0x81D5F04E20660B28, 0x57258D043BCD3841, 0x5C9BEB62059C1ED2, 0x57A02162F9034B33, + 0xBA2A13E457CE19B8, 0xE593263BF9451F3A, 0x0BC1175539606BC5, 0xA3E2929E9C5F289F, + 0x86BDBD06835E35F7, 0xA180950AB48BAADC, 0x7812C994D9924028, 0x308366011415F46B, + 0x77FE9A9991C5F959, 0x925C340B70B0B1E3, 0xCD9C5BA4C41E2E10, 0x7CC4E7758B94CD93, + 0x898B235962EA4625, 0xD7E3E5BF22893286, 0x396F4CDD33056C64, 0x740AB2E32F17CD9F, + 0x60D12FF9CD15B321, 0xBEE3A6C9903A81D8, 0xB47040913B33C35E, 0x19EE8C2ACC013CFF, + 0x5DEC94C5783B55C4, 0x78DC122D562C5F1D, 0x6520F008DA1C181E, 0x77CAF155A36EBF7C, + 0x0A09E02BDB883CA6, 0xFD5D9ADA7E3FB895, 0xC6F5FDD9EEAB83B5, 0x84589BB29F52A92A, + 0x9B2517F13F8E9814, 0x6F752AF6A52E31EC, 0x8E717799E324CE8A, 0x84D90AEF39262D58, + 0x79C27B13FC28944D, 0xE6D6DF6438E0044A, 0x51B603E400D79CA4, 0x6A902B28C588B390, + 0x8D7F8DE9E6CB1D83, 0xCF1A4DC11CA7F044, 0xEF02E43C366786F1, 0x89915BCDBCFBE30F, + 0x5928B306F1A9CC7F, 0xA8B59092996851C5, 0x22050A20427E8B25, 0x6E6D64018941E7EE, + 0x9798C898B81AE846, 0x80EF218CDC30124A, 0xFCE45E60D55B0284, 0x4010E735D3147C35, + 0xEB647D999FD8DC7E, 0xD3544DCAB14FE907, 0xB588B27D8438700C, 0xA49EBFC43E057A4C +}; + +const uint64_t refval_32be[81] = { 0, + 0xC92229C10FAEA50E, 0x0FE212630DD87E0F, 0x968F016D60417BB3, 0xE6B12B2C889913AB, + 0xAA3787887A9DA368, 0x06EE7202D53CEF39, 0x6149AFB2C296664B, 0x86C893210F9A5805, + 0x8379E5DA988AA04C, 0x24763AA7CE411A60, 0x9CF9C64B395A4CF8, 0xFFC192C338DDE904, + 0x094575BAB319E5F5, 0xBBBACFE7728C6511, 0x36B8C3CEBE4EF409, 0xAA0BA8A3397BA4D0, + 0xF9F85CF7124EE653, 0x3ADF4F7DF2A887AE, 0xAA2A0F5964AA9A7A, 0xF18B563F42D36EB8, + 0x034366CEF8334F5C, 0xAE2E85180E330E5F, 0xA5CE9FBFDF5C65B8, 0x5E509F25A9CA9B0B, + 0xE30D1358C2013BD2, 0xBB3A04D5EB8111FE, 0xB04234E82A15A28D, 0x87426A56D0EA0E2F, + 0x095086668E07F9F8, 0xF4CD3A43B6A6AEA5, 0x73F9B9B674D472A6, 0x558344229A1E4DCF, + 0x0AD4C95B2279181A, 0x5E3D19D80821CA6B, 0x652492D25BEBA258, 0xEFA84B02EAB849B1, + 0x81AD2D253059AC2C, 0x1400CCB0DFB2F457, 0x5688DC72A839860E, 0x67CC130E0FD1B0A7, + 0x0A851E3A94E21E69, 0x2EA0000B6A073907, 0xAE9776FF9BF1D02E, 0xC0A96B66B160631C, + 0xA93341DE4ED7C8F0, 0x6FBADD8F5B85E141, 0xB7D295F1C21E0CBA, 0x6D6114591B8E434F, + 0xF5B6939B63D97BE7, 0x3C80D5053F0E5DB4, 0xAC520ACC6B73F62D, 0xD1051F5841CF3966, + 0x62245AEA644AE760, 0x0CD56BE15497C62D, 0x5BB93435C4988FB6, 0x5FADB88EB18DB512, + 0xC897CAE2242475CC, 0xF1A094EF846DC9BB, 0x2B1D8B24924F79B6, 0xC6DF0C0E8456EB53, + 0xE6A40128303A9B9C, 0x64D37AF5EFFA7BD9, 0x90FEB70A5AE2A598, 0xEC3BA5F126D9FF4B, + 0x3121C8EC3AC51B29, 0x3B41C4D422166EC1, 0xB4878DDCBF48ED76, 0x5CB850D77CB762E4, + 0x9A27A43CC1DD171F, 0x2FDFFC6F99CB424A, 0xF54A57E09FDEA7BB, 0x5F78E5EE2CAB7039, + 0xB8BA95883DB31CBA, 0x131C61EB84AF86C3, 0x84B1F64E9C613DA7, 0xE94C1888C0C37C02, + 0xEA08F8BFB2039CDE, 
0xCCC6D04D243EC753, 0x8977D105298B0629, 0x7AAA976494A5905E +}; + +#ifdef T1HA0_AESNI_AVAILABLE +const uint64_t refval_ia32aes_a[81] = { 0, + 0x772C7311BE32FF42, 0xB231AC660E5B23B5, 0x71F6DF5DA3B4F532, 0x555859635365F660, + 0xE98808F1CD39C626, 0x2EB18FAF2163BB09, 0x7B9DD892C8019C87, 0xE2B1431C4DA4D15A, + 0x1984E718A5477F70, 0x08DD17B266484F79, 0x4C83A05D766AD550, 0x92DCEBB131D1907D, + 0xD67BC6FC881B8549, 0xF6A9886555FBF66B, 0x6E31616D7F33E25E, 0x36E31B7426E3049D, + 0x4F8E4FAF46A13F5F, 0x03EB0CB3253F819F, 0x636A7769905770D2, 0x3ADF3781D16D1148, + 0x92D19CB1818BC9C2, 0x283E68F4D459C533, 0xFA83A8A88DECAA04, 0x8C6F00368EAC538C, + 0x7B66B0CF3797B322, 0x5131E122FDABA3FF, 0x6E59FF515C08C7A9, 0xBA2C5269B2C377B0, + 0xA9D24FD368FE8A2B, 0x22DB13D32E33E891, 0x7B97DFC804B876E5, 0xC598BDFCD0E834F9, + 0xB256163D3687F5A7, 0x66D7A73C6AEF50B3, 0xBB34C6A4396695D2, 0x7F46E1981C3256AD, + 0x4B25A9B217A6C5B4, 0x7A0A6BCDD2321DA9, 0x0A1F55E690A7B44E, 0x8F451A91D7F05244, + 0x624D5D3C9B9800A7, 0x09DDC2B6409DDC25, 0x3E155765865622B6, 0x96519FAC9511B381, + 0x512E58482FE4FBF0, 0x1AB260EA7D54AE1C, 0x67976F12CC28BBBD, 0x0607B5B2E6250156, + 0x7E700BEA717AD36E, 0x06A058D9D61CABB3, 0x57DA5324A824972F, 0x1193BA74DBEBF7E7, + 0xC18DC3140E7002D4, 0x9F7CCC11DFA0EF17, 0xC487D6C20666A13A, 0xB67190E4B50EF0C8, + 0xA53DAA608DF0B9A5, 0x7E13101DE87F9ED3, 0x7F8955AE2F05088B, 0x2DF7E5A097AD383F, + 0xF027683A21EA14B5, 0x9BB8AEC3E3360942, 0x92BE39B54967E7FE, 0x978C6D332E7AFD27, + 0xED512FE96A4FAE81, 0x9E1099B8140D7BA3, 0xDFD5A5BE1E6FE9A6, 0x1D82600E23B66DD4, + 0x3FA3C3B7EE7B52CE, 0xEE84F7D2A655EF4C, 0x2A4361EC769E3BEB, 0x22E4B38916636702, + 0x0063096F5D39A115, 0x6C51B24DAAFA5434, 0xBAFB1DB1B411E344, 0xFF529F161AE0C4B0, + 0x1290EAE3AC0A686F, 0xA7B0D4585447D1BE, 0xAED3D18CB6CCAD53, 0xFC73D46F8B41BEC6 +}; + +const uint64_t refval_ia32aes_b[81] = { 0, + 0x772C7311BE32FF42, 0x4398F62A8CB6F72A, 0x71F6DF5DA3B4F532, 0x555859635365F660, + 0xE98808F1CD39C626, 0x2EB18FAF2163BB09, 0x7B9DD892C8019C87, 0xE2B1431C4DA4D15A, + 0x1984E718A5477F70, 0x08DD17B266484F79, 0x4C83A05D766AD550, 0x92DCEBB131D1907D, + 0xD67BC6FC881B8549, 0xF6A9886555FBF66B, 0x6E31616D7F33E25E, 0x36E31B7426E3049D, + 0x4F8E4FAF46A13F5F, 0x03EB0CB3253F819F, 0x636A7769905770D2, 0x3ADF3781D16D1148, + 0x92D19CB1818BC9C2, 0x283E68F4D459C533, 0xFA83A8A88DECAA04, 0x8C6F00368EAC538C, + 0x7B66B0CF3797B322, 0x5131E122FDABA3FF, 0x6E59FF515C08C7A9, 0xBA2C5269B2C377B0, + 0xA9D24FD368FE8A2B, 0x22DB13D32E33E891, 0x7B97DFC804B876E5, 0xC598BDFCD0E834F9, + 0xB256163D3687F5A7, 0x66D7A73C6AEF50B3, 0xE810F88E85CEA11A, 0x4814F8F3B83E4394, + 0x9CABA22D10A2F690, 0x0D10032511F58111, 0xE9A36EF5EEA3CD58, 0xC79242DE194D9D7C, + 0xC3871AA0435EE5C8, 0x52890BED43CCF4CD, 0x07A1D0861ACCD373, 0x227B816FF0FEE9ED, + 0x59FFBF73AACFC0C4, 0x09AB564F2BEDAD0C, 0xC05F744F2EE38318, 0x7B50B621D547C661, + 0x0C1F71CB4E68E5D1, 0x0E33A47881D4DBAA, 0xF5C3BF198E9A7C2E, 0x16328FD8C0F68A91, + 0xA3E399C9AB3E9A59, 0x163AE71CBCBB18B8, 0x18F17E4A8C79F7AB, 0x9250E2EA37014B45, + 0x7BBBB111D60B03E4, 0x3DAA4A3071A0BD88, 0xA28828D790A2D6DC, 0xBC70FC88F64BE3F1, + 0xA3E48008BA4333C7, 0x739E435ACAFC79F7, 0x42BBB360BE007CC6, 0x4FFB6FD2AF74EC92, + 0x2A799A2994673146, 0xBE0A045B69D48E9F, 0x549432F54FC6A278, 0x371D3C60369FC702, + 0xDB4557D415B08CA7, 0xE8692F0A83850B37, 0x022E46AEB36E9AAB, 0x117AC9B814E4652D, + 0xA361041267AE9048, 0x277CB51C961C3DDA, 0xAFFC96F377CB8A8D, 0x83CC79FA01DD1BA7, + 0xA494842ACF4B802C, 0xFC6D9CDDE2C34A3F, 0x4ED6863CE455F7A7, 0x630914D0DB7AAE98 +}; +#endif /* T1HA0_AESNI_AVAILABLE */ + +/* *INDENT-ON* */ +/* clang-format on */ + 
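/*
 * How the tables above are consumed: each refval_* array pairs one t1ha
 * flavour with the probe()/verify() driver defined below. verify() walks a
 * fixed sequence of 81 probes per table: 2 empty inputs, 1 full 64-byte
 * pattern, 63 length/seed combinations, 7 misaligned tails and 8 long
 * inputs, which is why every table holds exactly 81 values. A hypothetical
 * driver (for illustration only; the real calls are not shown in this hunk,
 * and the t1ha2_atonce / t1ha1_le prototypes are assumed from t1ha.h) could
 * look like:
 *
 *   bool failed = false;
 *   failed |= verify("t1ha2_atonce", t1ha2_atonce, refval_2atonce);
 *   failed |= verify("t1ha2_atonce128", thunk_t1ha2_atonce128, refval_2atonce128);
 *   failed |= verify("t1ha2_stream", thunk_t1ha2_stream, refval_2stream);
 *   failed |= verify("t1ha1_64le", t1ha1_le, refval_64le);
 */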
+/*****************************************************************************/ + +uint64_t thunk_t1ha2_atonce128(const void *data, size_t len, uint64_t seed) { + uint64_t unused; + return t1ha2_atonce128(&unused, data, len, seed); +} + +uint64_t thunk_t1ha2_stream(const void *data, size_t len, uint64_t seed) { + t1ha_context_t ctx; + t1ha2_init(&ctx, seed, seed); + t1ha2_update(&ctx, data, len); + return t1ha2_final(&ctx, NULL); +} + +uint64_t thunk_t1ha2_stream128(const void *data, size_t len, uint64_t seed) { + t1ha_context_t ctx; + t1ha2_init(&ctx, seed, seed); + t1ha2_update(&ctx, data, len); + uint64_t unused; + return t1ha2_final(&ctx, &unused); +} + +static bool probe(uint64_t (*hash)(const void *, size_t, uint64_t), + const char *caption, const uint64_t check, const void *data, + unsigned len, uint64_t seed) { + uint64_t value = hash(data, len, seed); + if (is_option_set(test_verbose) || (value != check)) + printf("Pattern '%s', reference value %08X%08X: ", caption, + (uint32_t)(check >> 32), (uint32_t)check); + if (check == value) { + if (is_option_set(test_verbose)) + printf("Passed\n"); + return false; + } + if (!is_option_set(test_quiet)) + printf("Failed! Got %08X%08X\n", (uint32_t)(value >> 32), (uint32_t)value); + return true; +} + +bool verify(const char *title, uint64_t (*hash)(const void *, size_t, uint64_t), + const uint64_t *reference_values) { + if (!is_option_set(test_quiet)) + printf("Testing %s...%s", title, is_option_set(test_verbose) ? "\n" : ""); + + const uint64_t zero = 0; + bool failed = false; + failed |= probe(hash, "empty-zero", *reference_values++, NULL, 0, zero); + failed |= probe(hash, "empty-all1", *reference_values++, NULL, 0, ~zero); + failed |= probe(hash, "bin64-zero", *reference_values++, pattern, 64, zero); + + char caption[32]; + uint64_t seed = 1; + for (int i = 1; i < 64; i++) { + snprintf(caption, sizeof(caption), "bin%02i-1p%02u", i, i & 63); + failed |= probe(hash, caption, *reference_values++, pattern, i, seed); + seed <<= 1; + } + + seed = ~zero; + for (int i = 1; i <= 7; i++) { + seed <<= 1; + snprintf(caption, sizeof(caption), "align%i_F%u", i, 64 - i); + failed |= + probe(hash, caption, *reference_values++, pattern + i, 64 - i, seed); + } + + uint8_t pattern_long[512]; + for (size_t i = 0; i < sizeof(pattern_long); ++i) + pattern_long[i] = (uint8_t)i; + for (int i = 0; i <= 7; i++) { + snprintf(caption, sizeof(caption), "long-%05u", 128 + i * 17); + failed |= probe(hash, caption, *reference_values++, pattern_long + i, + 128 + i * 17, seed); + } + + if (!is_option_set(test_quiet)) + printf(" %s\n", (!is_option_set(test_verbose) && !failed) ? "Ok" : ""); + return failed; +} diff --git a/tests/xxhash/xxhash.c b/tests/xxhash/xxhash.c new file mode 100644 index 0000000..b4f9032 --- /dev/null +++ b/tests/xxhash/xxhash.c @@ -0,0 +1,1111 @@ +/* +* xxHash - Fast Hash algorithm +* Copyright (C) 2012-2016, Yann Collet +* +* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following disclaimer +* in the documentation and/or other materials provided with the +* distribution. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +* You can contact the author at : +* - xxHash homepage: http://www.xxhash.com +* - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * The below switch allow to select different access method for improved + * performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not + * portable). + * This method is safe if your compiler supports it, and *generally* + * as fast or faster than `memcpy`. + * Method 2 : direct access. This method doesn't depend on compiler but violate + * C standard. + * It can generate buggy code on targets which do not support + * unaligned memory accesses. + * But in some circumstances, it's the only known way to get the most + * performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line \ + for example */ +#if defined(__GNUC__) && \ + (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)) +#define XXH_FORCE_MEMORY_ACCESS 2 +#elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && \ + (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7S__))) +#define XXH_FORCE_MEMORY_ACCESS 1 +#endif +#endif + +/*!XXH_ACCEPT_NULL_INPUT_POINTER : + * If input pointer is NULL, xxHash default behavior is to dereference it, + * triggering a segfault. + * When this macro is enabled, xxHash actively checks input for null pointer. + * It it is, result for null input pointers is the same as a null-length input. + */ +#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +#define XXH_ACCEPT_NULL_INPUT_POINTER 0 +#endif + +/*!XXH_FORCE_NATIVE_FORMAT : + * By default, xxHash library provides endian-independent Hash values, based on + * little-endian convention. + * Results are therefore identical for little-endian and big-endian CPU. + * This comes at a performance cost for big-endian CPU, since some swapping is + * required to emulate little-endian format. 
+ * Should endian-independence be of no importance for your application, you may + * set the #define below to 1, + * to improve speed for Big-endian CPU. + * This option has no impact on Little_Endian CPU. + */ +#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ +#define XXH_FORCE_NATIVE_FORMAT 0 +#endif + +/*!XXH_FORCE_ALIGN_CHECK : + * This is a minor performance trick, only useful with lots of very small keys. + * It means : check for aligned/unaligned input. + * The check costs one initial branch per hash; + * set it to 0 when the input is guaranteed to be aligned, + * or when alignment doesn't matter for performance. + */ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || \ + defined(_M_X64) +#define XXH_FORCE_ALIGN_CHECK 0 +#else +#define XXH_FORCE_ALIGN_CHECK 1 +#endif +#endif + +#ifdef _MSC_VER +/* LY: just disable warnings */ +#pragma warning(push, 1) +#pragma warning(disable : 4715) /* not all control paths return a value */ +#endif /* MSVC */ + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/*! Modify the local functions below should you wish to use some other memory +* routines +* for malloc(), free() */ +#include <stdlib.h> +static void *XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free(void *p) { free(p); } +/*! and for memcpy() */ +#include <string.h> +static void *XXH_memcpy(void *dest, const void *src, size_t size) { + return memcpy(dest, src, size); +} + +#include <assert.h> /* assert */ + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash.h" + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio */ +#pragma warning( \ + disable : 4127) /* disable: C4127: conditional expression is constant */ +#define FORCE_INLINE static __forceinline +#else +#if defined(__cplusplus) || \ + defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +#ifdef __GNUC__ +#define FORCE_INLINE static inline __attribute__((always_inline)) +#else +#define FORCE_INLINE static inline +#endif +#else +#define FORCE_INLINE static +#endif /* __STDC_VERSION__ */ +#endif + +/* ************************************* +* Basic Types +***************************************/ +#ifndef MEM_MODULE +#if !defined(__VMS) && \ + (defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) +#include <stdint.h> +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +#else +typedef unsigned char BYTE; +typedef unsigned short U16; +typedef unsigned int U32; +#endif +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware */ +static U32 XXH_read32(const void *memPtr) { return *(const U32 *)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1)) + +/* __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; } __attribute__((packed)) unalign; +static U32 XXH_read32(const void *ptr) { return ((const unalign *)ptr)->u32; } + +#else + +/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947 + */ +static U32 XXH_read32(const void *memPtr) { + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +/* Note : although _rotl exists for minGW (GCC under windows), performance seems + * poor */ +#if defined(_MSC_VER) +#define XXH_rotl32(x, r) _rotl(x, r) +#define XXH_rotl64(x, r) _rotl64(x, r) +#else +#define XXH_rotl32(x, r) ((x << r) | (x >> (32 - r))) +#define XXH_rotl64(x, r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +#define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +#define XXH_swap32 __builtin_bswap32 +#else +static U32 XXH_swap32(U32 x) { + return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | + ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); +} +#endif + +/* ************************************* +* Architecture Macros +***************************************/ +typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler + * command line */ +#ifndef XXH_CPU_LITTLE_ENDIAN +static int XXH_isLittleEndian(void) { + const union { + U32 u; + BYTE c[4]; + } one = {1}; /* don't use static : performance detrimental */ + return one.c[0]; +} +#define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +#endif + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const void *ptr, XXH_endianess endian, + XXH_alignment align) { + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read32(ptr) + : XXH_swap32(XXH_read32(ptr)); + else + return endian == XXH_littleEndian ? *(const U32 *)ptr + : XXH_swap32(*(const U32 *)ptr); +} + +FORCE_INLINE U32 XXH_readLE32(const void *ptr, XXH_endianess endian) { + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +static U32 XXH_readBE32(const void *ptr) { + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} + +/* ************************************* +* Macros +***************************************/ +#define XXH_STATIC_ASSERT(c) \ + { \ + enum { XXH_sa = 1 / (int)(!!(c)) }; \ + } /* use after variable declarations */ +XXH_PUBLIC_API unsigned XXH_versionNumber(void) { return XXH_VERSION_NUMBER; } + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +static const U32 PRIME32_1 = 2654435761U; +static const U32 PRIME32_2 = 2246822519U; +static const U32 PRIME32_3 = 3266489917U; +static const U32 PRIME32_4 = 668265263U; +static const U32 PRIME32_5 = 374761393U; + +static U32 XXH32_round(U32 seed, U32 input) { + seed += input * PRIME32_2; + seed = XXH_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +/* mix all bits */ +static U32 XXH32_avalanche(U32 h32) { + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return (h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + +static U32 XXH32_finalize(U32 h32, const void *ptr, size_t len, + XXH_endianess endian, XXH_alignment align) + +{ + const BYTE *p = (const BYTE *)ptr; +#define PROCESS1 \ + h32 += (*p) * PRIME32_5; \ + p++; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; + +#define PROCESS4 \ + h32 += XXH_get32bits(p) * PRIME32_3; \ + p += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; + + switch (len & 15) /* or switch(bEnd - p) */ + { + case 12: + PROCESS4; + /* fallthrough */ + case 8: + PROCESS4; + /* fallthrough */ + case 4: + PROCESS4; + return XXH32_avalanche(h32); + + case 13: + PROCESS4; + /* fallthrough */ + case 9: + PROCESS4; + /* fallthrough */ + case 5: + PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: + PROCESS4; + /* fallthrough */ + case 10: + PROCESS4; + /* fallthrough */ + case 6: + PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: + PROCESS4; + /* fallthrough */ + case 11: + PROCESS4; + /* fallthrough */ + case 7: + PROCESS4; + /* fallthrough */ + case 3: + PROCESS1; + /* fallthrough */ + case 2: + PROCESS1; + /* fallthrough */ + case 1: + PROCESS1; + /* fallthrough */ + case 0: + return XXH32_avalanche(h32); + } + assert(0); + return h32; /* reaching this point is deemed impossible */ +} + +FORCE_INLINE U32 XXH32_endian_align(const void *input, size_t len, U32 seed, + XXH_endianess endian, XXH_alignment align) { + const BYTE *p = (const BYTE *)input; + const BYTE *bEnd = p + len; + U32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE *)(size_t)16; + } +#endif + + if (len >= 16) { + const BYTE *const limit = bEnd - 15; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(p)); + p += 4; + v2 = XXH32_round(v2, XXH_get32bits(p)); + p += 4; + v3 = XXH32_round(v3, XXH_get32bits(p)); + p += 4; + v4 = XXH32_round(v4, XXH_get32bits(p)); + p += 4; + } while (p < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (U32)len; + + return XXH32_finalize(h32, p, len & 15, endian, align); +} + +XXH_PUBLIC_API unsigned int XXH32(const void *input, size_t len, + unsigned int seed) { +#if 0 + /* Simple version, good for code maintenance, but 
unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, input, len); + return XXH32_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == + 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, + XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, + XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash streaming ======*/ + +XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) { + return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) { + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *dstState, + const XXH32_state_t *srcState) { + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, + unsigned int seed) { + XXH32_state_t state; /* using a local state to memcpy() in order to avoid + strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE +XXH_errorcode XXH32_update_endian(XXH32_state_t *state, const void *input, + size_t len, XXH_endianess endian) { + const BYTE *p = (const BYTE *)input; + const BYTE *const bEnd = p + len; + + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + state->total_len_32 += (unsigned)len; + state->large_len |= (len >= 16) | (state->total_len_32 >= 16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE *)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE *)(state->mem32) + state->memsize, input, + 16 - state->memsize); + { + const U32 *p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); + p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); + p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); + p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); + } + p += 16 - state->memsize; + state->memsize = 0; + } + + if (p <= bEnd - 16) { + const BYTE *const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); + p += 4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); + p += 4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); + p += 4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); + p += 4; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, 
p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state_in, + const void *input, size_t len) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U32 XXH32_digest_endian(const XXH32_state_t *state, + XXH_endianess endian) { + U32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned); +} + +XXH_PUBLIC_API unsigned int XXH32_digest(const XXH32_state_t *state_in) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_digest_endian(state_in, XXH_littleEndian); + else + return XXH32_digest_endian(state_in, XXH_bigEndian); +} + +/*====== Canonical representation ======*/ + +/*! Default XXH result types are basic unsigned 32 and 64 bits. +* The canonical representation follows human-readable write convention, aka +* big-endian (large digits first). +* These functions allow transformation of hash result into and from its +* canonical format. +* This way, hash values can be written into a file or buffer, remaining +* comparable across different systems. +*/ + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst, + XXH32_hash_t hash) { + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) + hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t +XXH32_hashFromCanonical(const XXH32_canonical_t *src) { + return XXH_readBE32(src); +} + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/*====== Memory access ======*/ + +#ifndef MEM_MODULE +#define MEM_MODULE +#if !defined(__VMS) && \ + (defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) +#include +typedef uint64_t U64; +#else +/* if compiler doesn't support unsigned long long, replace by another 64-bit + * type */ +typedef unsigned long long U64; +#endif +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware */ +static U64 XXH_read64(const void *memPtr) { return *(const U64 *)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1)) + +/* __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { + U32 u32; + U64 u64; +} __attribute__((packed)) unalign64; +static U64 XXH_read64(const void *ptr) { return ((const unalign64 *)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. 
+ * see : http://stackoverflow.com/a/32095106/646947 + */ + +static U64 XXH_read64(const void *memPtr) { + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +#define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +#define XXH_swap64 __builtin_bswap64 +#else +static U64 XXH_swap64(U64 x) { + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +FORCE_INLINE U64 XXH_readLE64_align(const void *ptr, XXH_endianess endian, + XXH_alignment align) { + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read64(ptr) + : XXH_swap64(XXH_read64(ptr)); + else + return endian == XXH_littleEndian ? *(const U64 *)ptr + : XXH_swap64(*(const U64 *)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void *ptr, XXH_endianess endian) { + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void *ptr) { + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + +/*====== xxh64 ======*/ + +static const U64 PRIME64_1 = 11400714785074694791ULL; +static const U64 PRIME64_2 = 14029467366897019727ULL; +static const U64 PRIME64_3 = 1609587929392839161ULL; +static const U64 PRIME64_4 = 9650029242287828579ULL; +static const U64 PRIME64_5 = 2870177450012600261ULL; + +static U64 XXH64_round(U64 acc, U64 input) { + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) { + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static U64 XXH64_avalanche(U64 h64) { + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 XXH64_finalize(U64 h64, const void *ptr, size_t len, + XXH_endianess endian, XXH_alignment align) { + const BYTE *p = (const BYTE *)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p) * PRIME64_5; \ + p++; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 \ + { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; \ + } + + switch (len & 31) { + case 24: + PROCESS8_64; + /* fall through */ + case 16: + PROCESS8_64; + /* fall through */ + case 8: + PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: + PROCESS8_64; + /* fall through */ + case 20: + PROCESS8_64; + /* fall through */ + case 12: + PROCESS8_64; + /* fall through */ + case 4: + PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: + PROCESS8_64; + /* fall through */ + case 17: + PROCESS8_64; + /* fall through */ + case 9: + PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: + PROCESS8_64; + /* fall through */ + case 21: + PROCESS8_64; + /* fall through */ + case 13: + PROCESS8_64; + /* fall through */ + case 5: + PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: + PROCESS8_64; + /* fall through */ + case 18: + PROCESS8_64; + /* fall through */ + 
case 10: + PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: + PROCESS8_64; + /* fall through */ + case 22: + PROCESS8_64; + /* fall through */ + case 14: + PROCESS8_64; + /* fall through */ + case 6: + PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: + PROCESS8_64; + /* fall through */ + case 19: + PROCESS8_64; + /* fall through */ + case 11: + PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: + PROCESS8_64; + /* fall through */ + case 23: + PROCESS8_64; + /* fall through */ + case 15: + PROCESS8_64; + /* fall through */ + case 7: + PROCESS4_64; + /* fall through */ + case 3: + PROCESS1_64; + /* fall through */ + case 2: + PROCESS1_64; + /* fall through */ + case 1: + PROCESS1_64; + /* fall through */ + case 0: + return XXH64_avalanche(h64); + } + /* impossible to reach */ + assert(0); +} + +FORCE_INLINE U64 XXH64_endian_align(const void *input, size_t len, U64 seed, + XXH_endianess endian, XXH_alignment align) { + const BYTE *p = (const BYTE *)input; + const BYTE *bEnd = p + len; + U64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE *)(size_t)32; + } +#endif + + if (len >= 32) { + const BYTE *const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); + p += 8; + v2 = XXH64_round(v2, XXH_get64bits(p)); + p += 8; + v3 = XXH64_round(v3, XXH_get64bits(p)); + p += 8; + v4 = XXH64_round(v4, XXH_get64bits(p)); + p += 8; + } while (p <= limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (U64)len; + + return XXH64_finalize(h64, p, len, endian, align); +} + +XXH_PUBLIC_API unsigned long long XXH64(const void *input, size_t len, + unsigned long long seed) { +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7) == + 0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, + XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, + XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash Streaming ======*/ + +XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) { + return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) { + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *dstState, + const XXH64_state_t *srcState) { + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API 
XXH_errorcode XXH64_reset(XXH64_state_t *statePtr, + unsigned long long seed) { + XXH64_state_t state; /* using a local state to memcpy() in order to avoid + strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE +XXH_errorcode XXH64_update_endian(XXH64_state_t *state, const void *input, + size_t len, XXH_endianess endian) { + const BYTE *p = (const BYTE *)input; + const BYTE *const bEnd = p + len; + + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE *)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE *)state->mem64) + state->memsize, input, + 32 - state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0, endian)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1, endian)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2, endian)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3, endian)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= bEnd) { + const BYTE *const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); + p += 8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); + p += 8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); + p += 8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); + p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state_in, + const void *input, size_t len) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U64 XXH64_digest_endian(const XXH64_state_t *state, + XXH_endianess endian) { + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64)state->total_len; + + return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, + XXH_aligned); +} + +XXH_PUBLIC_API unsigned long long XXH64_digest(const XXH64_state_t *state_in) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return 
XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + +/*====== Canonical representation ======*/ + +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, + XXH64_hash_t hash) { + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) + hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH64_hashFromCanonical(const XXH64_canonical_t *src) { + return XXH_readBE64(src); +} + +#endif /* XXH_NO_LONG_LONG */ diff --git a/tests/xxhash/xxhash.h b/tests/xxhash/xxhash.h new file mode 100644 index 0000000..9135cfe --- /dev/null +++ b/tests/xxhash/xxhash.h @@ -0,0 +1,326 @@ +/* + xxHash - Extremely Fast Hash algorithm + Header File + Copyright (C) 2012-2016, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo +@3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. 
+
+Name            Speed on 64 bits    Speed on 32 bits
+XXH64           13.8 GB/s            1.9 GB/s
+XXH32            6.8 GB/s            6.0 GB/s
+*/
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h> /* size_t */
+typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode;
+
+/* ****************************
+*  API modifier
+******************************/
+/** XXH_PRIVATE_API
+*   This is useful to include xxhash functions in `static` mode
+*   in order to inline them, and remove their symbol from the public list.
+*   Methodology:
+*     #define XXH_PRIVATE_API
+*     #include "xxhash.h"
+*   `xxhash.c` is automatically included.
+*   It's not useful to compile and link it as a separate module.
+*/
+#ifdef XXH_PRIVATE_API
+#ifndef XXH_STATIC_LINKING_ONLY
+#define XXH_STATIC_LINKING_ONLY
+#endif
+#if defined(__GNUC__)
+#define XXH_PUBLIC_API static __inline __attribute__((unused))
+#elif defined(__cplusplus) ||                                                  \
+    (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#define XXH_PUBLIC_API static inline
+#elif defined(_MSC_VER)
+#define XXH_PUBLIC_API static __inline
+#else
+/* this version may generate warnings for unused static functions */
+#define XXH_PUBLIC_API static
+#endif
+#else
+#define XXH_PUBLIC_API /* do nothing */
+#endif /* XXH_PRIVATE_API */
+
+/*!XXH_NAMESPACE, aka Namespace Emulation :
+
+If you want to include _and expose_ xxHash functions from within your own
+library, but also want to avoid symbol collisions with other libraries which
+may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
+any public symbol from the xxhash library with the value of XXH_NAMESPACE
+(therefore, avoid NULL and numeric values).
+
+Note that no change is required within the calling program as long as it
+includes `xxhash.h`: regular symbol names will be automatically translated
+by this header.
+*/
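For illustration, a minimal sketch of the namespace emulation just described, assuming a hypothetical prefix mylib_; both xxhash.c and the consumer must be built with the same XXH_NAMESPACE value, while call sites keep using the plain names.

#define XXH_NAMESPACE mylib_ /* hypothetical prefix; must match the build of xxhash.c */
#include "xxhash.h"

/* The call below still reads XXH64(), but resolves to the exported
   symbol mylib_XXH64 thanks to the #define translation in xxhash.h. */
unsigned long long hash_blob(const void *ptr, size_t len) {
  return XXH64(ptr, len, 0);
}

The XXH_PRIVATE_API route works the other way around: defining it before the include pulls xxhash.c in and turns every function into a static (inlinable) one, so no xxHash symbol is exported at all.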
+#ifdef XXH_NAMESPACE
+#define XXH_CAT(A, B) A##B
+#define XXH_NAME2(A, B) XXH_CAT(A, B)
+#define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#define XXH32_canonicalFromHash                                                \
+  XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#define XXH32_hashFromCanonical                                                \
+  XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#define XXH64_canonicalFromHash                                                \
+  XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#define XXH64_hashFromCanonical                                                \
+  XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 6
+#define XXH_VERSION_RELEASE 4
+#define XXH_VERSION_NUMBER                                                     \
+  (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 +                   \
+   XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber(void);
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+typedef unsigned int XXH32_hash_t;
+
+/*! XXH32() :
+    Calculate the 32-bit hash of a sequence of "length" bytes stored at memory
+    address "input".
+    The memory between input & input+length must be valid (allocated and
+    read-accessible).
+    "seed" can be used to alter the result predictably.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
+                                  unsigned int seed);
+
+/*====== Streaming ======*/
+typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *dst_state,
+                                    const XXH32_state_t *src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
+                                         unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr,
+                                          const void *input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr);
+
+/*
+These functions generate the xxHash of an input provided in multiple segments.
+Note that, for small input, they are slower than the single-call functions,
+due to state management. For small input, prefer `XXH32()` and `XXH64()`.
+
+XXH state must first be allocated, using XXH*_createState().
+
+Start a new hash by initializing the state with a seed, using XXH*_reset().
+
+Then, feed the hash state by calling XXH*_update() as many times as necessary.
+Obviously, the input must be allocated and read-accessible.
+The function returns an error code, with 0 meaning OK, and any other value
+meaning there is an error.
+
+Finally, a hash value can be produced at any time, by using XXH*_digest().
+This function returns the nn-bit hash as an int or long long.
+
+It is still possible to continue inserting input into the hash state after a
+digest, and to generate new hashes later on, by calling XXH*_digest() again.
+
+When done, free the XXH state space if it was allocated dynamically.
+*/
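For illustration, a minimal sketch of the flow just described (allocate, reset, update in a loop, digest, free), applied to hashing a file in chunks; hash_file and the 4096-byte buffer are illustrative choices, not part of the xxHash API.

#include <stdio.h>
#include "xxhash.h"

/* Hash a file without loading it whole into memory. Returns 0 on success. */
static int hash_file(FILE *f, unsigned long long *result) {
  char buf[4096];
  size_t got;
  XXH64_state_t *state = XXH64_createState();
  if (state == NULL)
    return -1;
  if (XXH64_reset(state, 0) != XXH_OK) {
    XXH64_freeState(state);
    return -1;
  }
  while ((got = fread(buf, 1, sizeof(buf), f)) != 0) {
    if (XXH64_update(state, buf, got) != XXH_OK) {
      XXH64_freeState(state);
      return -1;
    }
  }
  *result = XXH64_digest(state); /* may be called at any point; the state stays valid */
  XXH64_freeState(state);
  return ferror(f) ? -1 : 0;
}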
+
+/*====== Canonical representation ======*/
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
+                                            XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t
+XXH32_hashFromCanonical(const XXH32_canonical_t *src);
+
+/* The default result type for XXH functions is a primitive unsigned 32-bit
+*  or 64-bit integer.
+*  The canonical representation uses the human-readable write convention,
+*  aka big-endian (large digits first).
+*  These functions allow transformation of a hash result into and from its
+*  canonical format.
+*  This way, hash values can be written into a file or memory, and remain
+*  comparable across different systems and programs.
+*/
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+typedef unsigned long long XXH64_hash_t;
+
+/*! XXH64() :
+    Calculate the 64-bit hash of a sequence of "length" bytes stored at memory
+    address "input".
+    "seed" can be used to alter the result predictably.
+    This function runs faster on 64-bit systems, but slower on 32-bit systems
+    (see benchmark).
+*/
+XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length,
+                                  unsigned long long seed);
+
+/*====== Streaming ======*/
+typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
+XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *dst_state,
+                                    const XXH64_state_t *src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
+                                         unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *statePtr,
+                                          const void *input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *statePtr);
+
+/*====== Canonical representation ======*/
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
+                                            XXH64_hash_t hash);
+XXH_PUBLIC_API XXH64_hash_t
+XXH64_hashFromCanonical(const XXH64_canonical_t *src);
+#endif /* XXH_NO_LONG_LONG */
+
+#ifdef XXH_STATIC_LINKING_ONLY
+
+/* ================================================================================================
+   This section contains declarations which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different
+   version of the library.
+   These declarations should only be used with static linking.
+   Never use them in association with dynamic linking!
+ ================================================================================================ */
+
+/* These definitions are only meant to make possible
+   static allocation of XXH state, on stack or in a struct for example.
+   Never use members directly. */
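For illustration, a minimal sketch of the static-allocation use case this section enables: the state lives on the stack instead of coming from XXH64_createState(), but is still driven only through the public functions; hash_message is an illustrative name.

#define XXH_STATIC_LINKING_ONLY /* exposes the state layout below */
#include "xxhash.h"

unsigned long long hash_message(const void *msg, size_t len) {
  XXH64_state_t state;         /* allocated on the stack, no create/free needed */
  XXH64_reset(&state, 0);      /* always initialize through the API ...         */
  XXH64_update(&state, msg, len);
  return XXH64_digest(&state); /* ... and never touch the members directly      */
}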
+
+struct XXH32_state_s {
+  unsigned total_len_32;
+  unsigned large_len;
+  unsigned v1;
+  unsigned v2;
+  unsigned v3;
+  unsigned v4;
+  unsigned mem32[4]; /* buffer defined as U32 for alignment */
+  unsigned memsize;
+  unsigned
+      reserved; /* never read nor write, will be removed in a future version */
+}; /* typedef'd to XXH32_state_t */
+
+#ifndef XXH_NO_LONG_LONG /* remove 64-bit support */
+struct XXH64_state_s {
+  unsigned long long total_len;
+  unsigned long long v1;
+  unsigned long long v2;
+  unsigned long long v3;
+  unsigned long long v4;
+  unsigned long long mem64[4]; /* buffer defined as U64 for alignment */
+  unsigned memsize;
+  unsigned reserved[2]; /* never read nor write, will be removed in a future
+                           version */
+}; /* typedef'd to XXH64_state_t */
+#endif
+
+#ifdef XXH_PRIVATE_API
+#include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */
+#endif
+
+#endif /* XXH_STATIC_LINKING_ONLY */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* XXHASH_H_5627135585666179 */
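For illustration, a minimal sketch of the canonical representation declared above: storing a hash in big-endian canonical form keeps values comparable across machines of different endianness; write_hash and read_hash are illustrative names.

#include <stdio.h>
#include "xxhash.h"

/* Serialize a hash in canonical (big-endian) form. Returns 0 on success. */
static int write_hash(FILE *f, XXH64_hash_t h) {
  XXH64_canonical_t c;
  XXH64_canonicalFromHash(&c, h); /* byte-swaps on little-endian machines */
  return fwrite(&c, sizeof(c), 1, f) == 1 ? 0 : -1;
}

/* Read a canonical hash back into the native unsigned long long form. */
static int read_hash(FILE *f, XXH64_hash_t *h) {
  XXH64_canonical_t c;
  if (fread(&c, sizeof(c), 1, f) != 1)
    return -1;
  *h = XXH64_hashFromCanonical(&c);
  return 0;
}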