From af0430cb3835526c0e5be968b17384ddfadddc85 Mon Sep 17 00:00:00 2001 From: Amine Khaldi Date: Tue, 13 Jun 2023 16:07:01 +0100 Subject: [PATCH 01/27] Update Catch2 to v3.3.2. (#346) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a01abc8f0..5e653b42c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,7 +151,7 @@ ENDIF() FetchContent_Declare( Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG v3.2.1 + GIT_TAG v3.3.2 ) FetchContent_MakeAvailable(Catch2) From 6c7e82e41bfd33bbf7479a53968c3cdd12c3e810 Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Tue, 13 Jun 2023 14:59:24 -0400 Subject: [PATCH 02/27] correct wheel matrix arch -> arm (#365) --- .github/workflows/build-wheels.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 8fbdfee8e..62825849e 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -44,31 +44,31 @@ jobs: - major-dot-minor: '3.7' cibw-build: 'cp37-*' manylinux: - arch: manylinux2014 + arm: manylinux2014 intel: manylinux2010 matrix: '3.7' - major-dot-minor: '3.8' cibw-build: 'cp38-*' manylinux: - arch: manylinux2014 + arm: manylinux2014 intel: manylinux2010 matrix: '3.8' - major-dot-minor: '3.9' cibw-build: 'cp39-*' manylinux: - arch: manylinux2014 + arm: manylinux2014 intel: manylinux2010 matrix: '3.9' - major-dot-minor: '3.10' cibw-build: 'cp310-*' manylinux: - arch: manylinux2014 + arm: manylinux2014 intel: manylinux2010 matrix: '3.10' - major-dot-minor: '3.11' cibw-build: 'cp311-*' manylinux: - arch: manylinux2014 + arm: manylinux2014 intel: manylinux2014 matrix: '3.11' arch: From 2309cbddca193014c84b9f46af870ed586542940 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Fri, 4 Aug 2023 17:29:43 -0700 Subject: [PATCH 03/27] Update cibuildwheel to 2.11.2 (#378) Consolidate cibuildwheel version across repos to 2.11.2 This is the last version that uses bash in windows, which is needed for some scripts. 
The scripts could be edited to work in powershell, but this is left for another day --- .github/workflows/build-wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 62825849e..04b523677 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -149,7 +149,7 @@ jobs: CIBW_TEST_REQUIRES: pytest CIBW_TEST_COMMAND: py.test -v {project}/tests run: - pipx run --spec='cibuildwheel==2.9.0' cibuildwheel --output-dir dist 2>&1 + pipx run --spec='cibuildwheel==2.11.2' cibuildwheel --output-dir dist 2>&1 - name: Upload artifacts uses: actions/upload-artifact@v3 From d963785d874f28c46a3c3890649da0257da57339 Mon Sep 17 00:00:00 2001 From: William Allen Date: Fri, 4 Aug 2023 21:17:15 -0500 Subject: [PATCH 04/27] Adding compressed harvesting (#370) Signed-off-by: wallentx Co-authored-by: Author: Florin Chirica Co-authored-by: Harold Brenes Co-authored-by: Amine Khaldi Co-authored-by: Kyle Altendorf Co-authored-by: Chris Marslender Co-authored-by: arvidn Co-authored-by: Phill Walker <72089507+PhillWalker@users.noreply.github.com> --- .github/actions/fetch_bladebit_harvester.sh | 93 ++++ .github/workflows/build-wheels.yml | 87 ++- .gitignore | 6 + CMakeLists.txt | 109 +++- python-bindings/chiapos.cpp | 5 + setup.py | 17 + src/cli.cpp | 34 +- src/prover_disk.hpp | 561 +++++++++++++++++++- 8 files changed, 820 insertions(+), 92 deletions(-) create mode 100755 .github/actions/fetch_bladebit_harvester.sh diff --git a/.github/actions/fetch_bladebit_harvester.sh b/.github/actions/fetch_bladebit_harvester.sh new file mode 100755 index 000000000..a757941b0 --- /dev/null +++ b/.github/actions/fetch_bladebit_harvester.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +set -eo pipefail +_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +cd "$_dir/../.." + +## +# Usage: fetch_bladebit_harvester.sh +# +# Use gitbash or similar under Windows. 
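# Example invocations (these mirror how the wheel workflows in this patch set
# call the script; the OS/arch pair is the only input it takes):
#   .github/actions/fetch_bladebit_harvester.sh linux x86-64
#   .github/actions/fetch_bladebit_harvester.sh macos arm64
#   .github/actions/fetch_bladebit_harvester.sh windows x86-64
# The downloaded archive is unpacked into libs/green_reaper/, which is where
# setup.py and CMakeLists.txt look for the harvester headers and libraries when
# CP_USE_GREEN_REAPER / CP_LINK_BLADEBIT_HARVESTER are enabled.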
+## +host_os=$1 +host_arch=$2 + +if [[ "${host_os}" != "linux" ]] && [[ "${host_os}" != "macos" ]] && [[ "${host_os}" != "windows" ]]; then + echo >&2 "Unkonwn OS '${host_os}'" + exit 1 +fi + +if [[ "${host_arch}" != "arm64" ]] && [[ "${host_arch}" != "x86-64" ]]; then + echo >&2 "Unkonwn Architecture '${host_arch}'" + exit 1 +fi + +## Change this if including a new bladebit release +artifact_ver="v3.0.0" +artifact_base_url="https://github.com/Chia-Network/bladebit/releases/download/v3.0.0" + +linux_arm_sha256="5a53d82c2cc22172bfa2267ea9fb53126a527eba7bafd03ddf5503913a61f70c" +linux_x86_sha256="3cdbcf127126d7c61f6da715b25ef73a8420778dd34d56e82ed1865d7a1ebfeb" +macos_arm_sha256="325150951e83be4ee8690be996e6fde0776ff4cca89e39111c97f0aae3f93bf3" +macos_x86_sha256="718ab50a19ea3a8f064f2d09df38720cbb7d89667b599f57f2bca3cdf41c18e9" +windows_sha256="f3e14d02daafaa8e3666ab4666220d3b2859b1c10254bddb38a69da83cc899c5" +## End changes + +artifact_ext="tar.gz" +sha_bin="sha256sum" +expected_sha256= + +if [[ "$OSTYPE" == "darwin"* ]]; then + sha_bin="shasum -a 256" +fi + +case "${host_os}" in +linux) + if [[ "${host_arch}" == "arm64" ]]; then + expected_sha256=$linux_arm_sha256 + else + expected_sha256=$linux_x86_sha256 + fi + ;; +macos) + if [[ "${host_arch}" == "arm64" ]]; then + expected_sha256=$macos_arm_sha256 + else + expected_sha256=$macos_x86_sha256 + fi + ;; +windows) + expected_sha256=$windows_sha256 + artifact_ext="zip" + ;; +*) + echo >&2 "Unexpected OS '${host_os}'" + exit 1 + ;; +esac + +# Download artifact +artifact_name="green_reaper.${artifact_ext}" +curl -L "${artifact_base_url}/green_reaper-${artifact_ver}-${host_os}-${host_arch}.${artifact_ext}" >"${artifact_name}" + +# Validate sha256, if one was given +if [ -n "${expected_sha256}" ]; then + gr_sha256="$(${sha_bin} ${artifact_name} | cut -d' ' -f1)" + + if [[ "${gr_sha256}" != "${expected_sha256}" ]]; then + echo >&2 "GreenReaper SHA256 mismatch!" 
+ echo >&2 " Got : '${gr_sha256}'" + echo >&2 " Expected: '${expected_sha256}'" + exit 1 + fi +fi + +# Unpack artifact +dst_dir="libs/green_reaper" +mkdir -p "${dst_dir}" +if [[ "${artifact_ext}" == "zip" ]]; then + unzip -d "${dst_dir}" "${artifact_name}" +else + pushd "${dst_dir}" + tar -xzvf "../../${artifact_name}" + popd +fi diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 04b523677..2694aa89b 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -4,8 +4,8 @@ on: push: branches: - main - tags: - - '**' + release: + types: [published] pull_request: branches: - '**' @@ -19,6 +19,7 @@ jobs: build-wheels: name: Wheel - ${{ matrix.os.name }} ${{ matrix.python.major-dot-minor }} ${{ matrix.arch.name }} runs-on: ${{ matrix.os.runs-on[matrix.arch.matrix] }} + continue-on-error: true strategy: fail-fast: false matrix: @@ -41,29 +42,23 @@ jobs: runs-on: intel: [windows-latest] python: - - major-dot-minor: '3.7' - cibw-build: 'cp37-*' - manylinux: - arm: manylinux2014 - intel: manylinux2010 - matrix: '3.7' - major-dot-minor: '3.8' cibw-build: 'cp38-*' manylinux: arm: manylinux2014 - intel: manylinux2010 + intel: manylinux2014 matrix: '3.8' - major-dot-minor: '3.9' cibw-build: 'cp39-*' manylinux: arm: manylinux2014 - intel: manylinux2010 + intel: manylinux2014 matrix: '3.9' - major-dot-minor: '3.10' cibw-build: 'cp310-*' manylinux: arm: manylinux2014 - intel: manylinux2010 + intel: manylinux2014 matrix: '3.10' - major-dot-minor: '3.11' cibw-build: 'cp311-*' @@ -96,9 +91,9 @@ jobs: arm: [macOS, ARM64] intel: [macos-latest] python: - major-dot-minor: '3.7' - cibw-build: 'cp37-*' - matrix: '3.7' + major-dot-minor: '3.8' + cibw-build: 'cp38-*' + matrix: '3.8' arch: name: ARM matrix: arm @@ -112,6 +107,12 @@ jobs: with: fetch-depth: 0 + - name: Set Env + if: env.RUNNER_ARCH != 'ARM64' + uses: Chia-Network/actions/setjobenv@main + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: Chia-Network/actions/setup-python@main with: python-version: ${{ matrix.python.major-dot-minor }} @@ -120,32 +121,55 @@ jobs: run: | pip install pipx + - name: Get Windows Bladebit Harvester Artifact + if: runner.os == 'Windows' + shell: bash + run: | + set -eo pipefail + set -x + .github/actions/fetch_bladebit_harvester.sh windows x86-64 + - name: Build and test env: CIBW_PRERELEASE_PYTHONS: True - CIBW_BUILD_VERBOSITY_MACOS: 0 - CIBW_BUILD_VERBOSITY_LINUX: 0 - CIBW_BUILD_VERBOSITY_WINDOWS: 0 + CIBW_BUILD_VERBOSITY_MACOS: 1 + CIBW_BUILD_VERBOSITY_LINUX: 1 + CIBW_BUILD_VERBOSITY_WINDOWS: 1 CIBW_BUILD: ${{ matrix.python.cibw-build }} CIBW_SKIP: '*-manylinux_i686 *-win32 *-musllinux_*' CIBW_MANYLINUX_AARCH64_IMAGE: ${{ matrix.python.manylinux['arm'] }} CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.python.manylinux['intel'] }} - CIBW_ENVIRONMENT_LINUX: "PATH=/project/cmake-3.17.3-Linux-`uname -m`/bin:$PATH" - CIBW_BEFORE_ALL_LINUX: > - curl -L https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3-Linux-`uname -m`.sh > cmake.sh - && yes | sh cmake.sh | cat - && rm -f /usr/bin/cmake - && which cmake - && cmake --version - && uname -a + CIBW_ENVIRONMENT_WINDOWS: "CP_USE_GREEN_REAPER=1" + CIBW_ENVIRONMENT_LINUX: CP_USE_GREEN_REAPER="1" + CIBW_BEFORE_ALL_LINUX: | + set -eo pipefail + set -x + # Get bladebit harvester + set -eo pipefail + ARCH=$(uname -m) + if [[ $ARCH == x86_64 ]]; then + .github/actions/fetch_bladebit_harvester.sh linux x86-64 + else + .github/actions/fetch_bladebit_harvester.sh linux arm64 + fi + CIBW_BEFORE_BUILD_LINUX: 
> python -m pip install --upgrade pip CIBW_ARCHS_MACOS: ${{ matrix.os.cibw-archs-macos[matrix.arch.matrix] }} - CIBW_BEFORE_ALL_MACOS: > + CIBW_BEFORE_ALL_MACOS: | brew install gmp boost cmake + # Get bladebit harvester + set -eo pipefail + ARCH=$(uname -m) + if [[ $ARCH == x86_64 ]]; then + .github/actions/fetch_bladebit_harvester.sh macos x86-64 + else + .github/actions/fetch_bladebit_harvester.sh macos arm64 + fi + CIBW_BEFORE_BUILD_MACOS: > python -m pip install --upgrade pip - CIBW_ENVIRONMENT_MACOS: "MACOSX_DEPLOYMENT_TARGET=10.14" + CIBW_ENVIRONMENT_MACOS: "MACOSX_DEPLOYMENT_TARGET=10.14 CP_USE_GREEN_REAPER=1" CIBW_TEST_REQUIRES: pytest CIBW_TEST_COMMAND: py.test -v {project}/tests run: @@ -274,6 +298,11 @@ jobs: with: fetch-depth: 0 + - name: Set Env + uses: Chia-Network/actions/setjobenv@main + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: Chia-Network/actions/setup-python@main with: python-version: ${{ matrix.python.major-dot-minor }} @@ -298,7 +327,7 @@ jobs: run: pip install twine - name: Publish distribution to PyPI - if: startsWith(github.event.ref, 'refs/tags') && steps.check_secrets.outputs.HAS_SECRET + if: env.RELEASE == 'true' && steps.check_secrets.outputs.HAS_SECRET env: TWINE_USERNAME: __token__ TWINE_NON_INTERACTIVE: 1 @@ -306,7 +335,7 @@ jobs: run: twine upload --non-interactive --skip-existing --verbose 'dist/*' - name: Publish distribution to Test PyPI - if: steps.check_secrets.outputs.HAS_SECRET + if: env.PRE_RELEASE == 'true' && steps.check_secrets.outputs.HAS_SECRET env: TWINE_REPOSITORY_URL: https://test.pypi.org/legacy/ TWINE_USERNAME: __token__ diff --git a/.gitignore b/.gitignore index 8ba5e60ea..698734681 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,9 @@ build .mypy_cache *.whl venv +build-tsan +build-* +cmake-build* +*.zip +*.tar.gz +libs/ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e653b42c..207aacc8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,26 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(cxxopts) + +option(CP_LINK_BLADEBIT_HARVESTER "Links libbladebit_harvester at build time instead of dynamically loading it." 
OFF) +option(CP_BUILD_BLADEBIT_HARVESTER "Pulls bladebit harvester target from git and builds it as a dependency.") + +if (${CP_BUILD_BLADEBIT_HARVESTER} AND NOT ${CP_LINK_BLADEBIT_HARVESTER}) + set(CP_LINK_BLADEBIT_HARVESTER ON) +endif() + +if (${CP_BUILD_BLADEBIT_HARVESTER}) + FetchContent_Declare( + bladebit + GIT_REPOSITORY https://github.com/Chia-Network/bladebit.git + GIT_TAG cuda-compression + ) + + set(BB_HARVESTER_ONLY ON) + set(BB_HARVESTER_STATIC ON) + FetchContent_MakeAvailable(bladebit) +endif() + FetchContent_Declare( gulrak GIT_REPOSITORY https://github.com/gulrak/filesystem.git @@ -57,7 +77,11 @@ include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../lib/FiniteStateEntropy/lib ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/test - ) +) + +IF (${CP_LINK_BLADEBIT_HARVESTER}) + message ("Bladebit Harvesting Enabled") +ENDIF () add_library(fse ${FSE_FILES}) @@ -161,13 +185,6 @@ add_executable(RunTests ${BLAKE3_SRC} ) -target_link_libraries(RunTests - PRIVATE - fse - Threads::Threads - Catch2::Catch2 -) - find_package(Threads REQUIRED) add_library(uint128 STATIC uint128_t/uint128_t.cpp) @@ -175,26 +192,60 @@ target_include_directories(uint128 PUBLIC uint128_t) target_compile_features(fse PUBLIC cxx_std_17) target_compile_features(chiapos PUBLIC cxx_std_17) -target_compile_features(RunTests PUBLIC cxx_std_17) - -if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") - target_link_libraries(chiapos PRIVATE fse Threads::Threads) - target_link_libraries(ProofOfSpace fse Threads::Threads) -elseif (${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD") - target_link_libraries(chiapos PRIVATE fse Threads::Threads) - target_link_libraries(ProofOfSpace fse Threads::Threads) -elseif (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") - target_link_libraries(chiapos PRIVATE fse Threads::Threads) - target_link_libraries(ProofOfSpace fse Threads::Threads) -elseif (MSVC) - target_link_libraries(chiapos PRIVATE fse Threads::Threads uint128) - target_link_libraries(ProofOfSpace fse Threads::Threads uint128) - target_link_libraries(RunTests PRIVATE uint128) -else() - target_link_libraries(chiapos PRIVATE fse stdc++fs Threads::Threads) - target_link_libraries(ProofOfSpace fse stdc++fs Threads::Threads) - target_link_libraries(RunTests PRIVATE stdc++fs) +# target_compile_features(RunTests PUBLIC cxx_std_17) + +target_link_libraries(chiapos PRIVATE fse Threads::Threads + $<$:uint128> + $<$>:stdc++fs> +) +target_link_libraries(ProofOfSpace PRIVATE fse Threads::Threads + $<$:uint128> + $<$>:stdc++fs> +) +target_link_libraries(RunTests PRIVATE fse Threads::Threads Catch2::Catch2WithMain + $<$:uint128> + $<$>:stdc++fs> +) + +if (${CP_LINK_BLADEBIT_HARVESTER}) + + set(bb_defs + USE_GREEN_REAPER=1 + BLADEBIT_HARVESTER_LINKED=1 + $<$:BLADEBIT_IS_PROJECT_DEPENDENCY=1> + ) + set(bb_libs + bladebit_harvester + $<$>:dl> + ) + + include_directories( + ${INCLUDE_DIRECTORIES} + ${CMAKE_CURRENT_SOURCE_DIR}/libs/green_reaper/include + ) + + link_directories( + ${LINK_DIRECTORIES} + ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib + ) + + target_compile_definitions(chiapos PUBLIC ${bb_defs}) + target_compile_definitions(ProofOfSpace PUBLIC ${bb_defs}) + target_compile_definitions(RunTests PUBLIC ${bb_defs}) + + target_link_libraries(chiapos PUBLIC ${bb_libs}) + target_link_libraries(ProofOfSpace PUBLIC ${bb_libs}) + target_link_libraries(RunTests PUBLIC ${bb_libs}) + + target_link_directories(chiapos PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) + target_link_directories(ProofOfSpace PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) + 
target_link_directories(RunTests PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) + + set_property(TARGET chiapos APPEND PROPERTY BUILD_RPATH "$ORIGIN") + set_property(TARGET ProofOfSpace APPEND PROPERTY BUILD_RPATH "$ORIGIN") + set_property(TARGET RunTests APPEND PROPERTY BUILD_RPATH "$ORIGIN") endif() -enable_testing() -add_test(NAME RunTests COMMAND RunTests) + +#enable_testing() +#add_test(NAME RunTests COMMAND RunTests) diff --git a/python-bindings/chiapos.cpp b/python-bindings/chiapos.cpp index 42431aa54..3a1a95a21 100644 --- a/python-bindings/chiapos.cpp +++ b/python-bindings/chiapos.cpp @@ -105,6 +105,7 @@ PYBIND11_MODULE(chiapos, m) return py::bytes(reinterpret_cast(id.data()), id.size()); }) .def("get_size", [](DiskProver &dp) { return dp.GetSize(); }) + .def("get_compression_level", [](DiskProver &dp) { return dp.GetCompressionLevel(); }) .def("get_filename", [](DiskProver &dp) { return dp.GetFilename(); }) .def( "get_qualities_for_challenge", @@ -179,6 +180,10 @@ PYBIND11_MODULE(chiapos, m) delete[] quality_buf; return stdx::optional(quality_py); }); + + py::class_(m, "ContextQueue") + .def("init", &ContextQueue::init); + m.attr("decompressor_context_queue") = &decompressor_context_queue; } #endif // PYTHON_BINDINGS_PYTHON_BINDINGS_HPP_ diff --git a/setup.py b/setup.py index 44d097dff..244e2ae56 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 import os import re +import shutil import sys import platform import subprocess @@ -44,6 +45,9 @@ def build_extension(self, ext): "-DPYTHON_EXECUTABLE=" + sys.executable, ] + if os.getenv("CP_USE_GREEN_REAPER") == "1": + cmake_args.append("-DCP_LINK_BLADEBIT_HARVESTER=ON") + cfg = "Debug" if self.debug else "Release" build_args = ["--config", cfg] @@ -177,11 +181,24 @@ def build_extensions(self): opts.append("-fvisibility=hidden") elif ct == "msvc": opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) + + # Link bladebit_harvester + if os.getenv("CP_USE_GREEN_REAPER") == "1": + opts.append("/DUSE_GREEN_REAPER=1") + opts.append("/DBLADEBIT_HARVESTER_LINKED=1") + opts.append("/Ilibs/green_reaper/include") + link_opts.append("libs/green_reaper/lib/bladebit_harvester.lib") + for ext in self.extensions: ext.extra_compile_args = opts ext.extra_link_args = link_opts build_ext.build_extensions(self) + # Copy bladebit_harvester.dll on windows to the target build directory + # in order to package it into the root directory of the wheel + if os.getenv("CP_USE_GREEN_REAPER") == "1" and sys.platform == "win32": + shutil.copy2("libs/green_reaper/lib/bladebit_harvester.dll", self.build_lib + "/bladebit_harvester.dll") + if platform.system() == "Windows": setup( diff --git a/src/cli.cpp b/src/cli.cpp index 6156dce4a..526a72140 100644 --- a/src/cli.cpp +++ b/src/cli.cpp @@ -63,6 +63,17 @@ void HelpAndQuit(cxxopts::Options options) exit(0); } +// Not thread safe +inline void InitDecompressorQueueDefault(bool no_cuda = false) +{ + static bool initialized = false; + if (initialized) { + return; + } + decompressor_context_queue.init(1, (uint32_t)std::thread::hardware_concurrency(), false, 9, !no_cuda, 0, false); + initialized = true; +} + int main(int argc, char *argv[]) try { cxxopts::Options options( "ProofOfSpace", "Utility for plotting, generating and verifying proofs of space."); @@ -161,6 +172,8 @@ int main(int argc, char *argv[]) try { num_threads, phases_flags); } else if (operation == "prove") { + InitDecompressorQueueDefault(); + if (argc < 3) { HelpAndQuit(options); } @@ -238,6 +251,8 @@ int main(int 
argc, char *argv[]) try { } delete[] proof_bytes; } else if (operation == "check") { + InitDecompressorQueueDefault(); + uint32_t iterations = 1000; if (argc == 3) { iterations = std::stoi(argv[2]); @@ -247,6 +262,8 @@ int main(int argc, char *argv[]) try { Verifier verifier = Verifier(); uint32_t success = 0; + uint32_t failures = 0; + uint32_t exceptions = 0; std::vector id_bytes = prover.GetId(); k = prover.GetSize(); @@ -257,10 +274,10 @@ int main(int argc, char *argv[]) try { vector hash(picosha2::k_digest_size); picosha2::hash256(hash_input.begin(), hash_input.end(), hash.begin(), hash.end()); - try { - vector qualities = prover.GetQualitiesForChallenge(hash.data()); + vector qualities = prover.GetQualitiesForChallenge(hash.data()); - for (uint32_t i = 0; i < qualities.size(); i++) { + for (uint32_t i = 0; i < qualities.size(); i++) { + try { LargeBits proof = prover.GetFullProof(hash.data(), i, parallel_read); uint8_t *proof_data = new uint8_t[proof.GetSize() / 8]; proof.ToBytes(proof_data); @@ -275,19 +292,22 @@ int main(int argc, char *argv[]) try { success++; } else { cout << "Proof verification failed." << endl; + failures += 1; } delete[] proof_data; + } catch (const std::exception& error) { + cout << "Threw: " << error.what() << endl; + exceptions += 1; } - } catch (const std::exception& error) { - cout << "Threw: " << error.what() << endl; - continue; } } std::cout << "Total success: " << success << "/" << iterations << ", " << (success * 100 / static_cast(iterations)) << "%." << std::endl; + std::cout << "Total failures: " << failures << std::endl; + std::cout << "Exceptions: " << exceptions << std::endl; if (show_progress) { progress(4, 1, 1); } } else { - cout << "Invalid operation. Use create/prove/verify/check" << endl; + cout << "Invalid operation '" << operation << "'. 
Use create/prove/verify/check" << endl; } return 0; } catch (const cxxopts::OptionException &e) { diff --git a/src/prover_disk.hpp b/src/prover_disk.hpp index 68808abc2..93a52b697 100644 --- a/src/prover_disk.hpp +++ b/src/prover_disk.hpp @@ -24,10 +24,11 @@ #include #include #include -#include #include #include #include +#include +#include #include "../lib/include/picosha2.hpp" #include "calculate_bucket.hpp" @@ -36,6 +37,14 @@ #include "serialize.hpp" #include "util.hpp" +#if USE_GREEN_REAPER + #include "GreenReaperPortable.h" +#endif + +#define CHIA_PLOT_V2_MAGIC 0x544F4C50ul // "PLOT" +#define CHIA_PLOT_VERSION_2_0_0 2 + + struct plot_header { uint8_t magic[19]; uint8_t id[32]; @@ -44,6 +53,229 @@ struct plot_header { uint8_t fmt_desc[50]; }; +#if USE_GREEN_REAPER +static GRApi _grApi{}; +static bool _dcompressor_queue_initialized = false; +class ContextQueue { +public: + ContextQueue() {} + + bool init( + uint32_t context_count, + uint32_t thread_count, + bool no_cpu_affinity, + const uint32_t max_compression_level, + bool use_gpu_harvesting, + uint32_t gpu_index, + bool enforce_gpu_index + ) { + assert(!_dcompressor_queue_initialized); + _dcompressor_queue_initialized = true; + + // Populate the API + #if _WIN32 + #define GR_LIB_PREFIX "" + #define GR_LIB_EXT ".dll" + #else + #define GR_LIB_PREFIX "lib" + + #if __APPLE__ + #define GR_LIB_EXT ".dylib" + #else + #define GR_LIB_EXT ".so" + #endif + #endif + + // void* lib = grLoadModule(GR_LIB_PREFIX "bladebit_harvester" GR_LIB_EXT); + + // if (lib == nullptr) { + // int code; + // #if _WIN32 + // code = (int)::GetLastError(); + // #else + // code = (int)dlerror(); + // #endif + + // std::stringstream err; err << "Failed to load bladebit_harvester with error: '" << code << "'"; + // throw std::runtime_error(err.str()); + // } + // #undef GR_LIB_PREFIX + // #undef GR_LIB_EXT + + // // Init GR API + // { + // const auto r = grPopulateApiFromModule(lib, &_grApi, sizeof(GRApi), GR_API_VERSION); + // if (r != GRResult_OK) { + // std::stringstream err; err << "Failed to initialize GR API with error " << r; + // throw std::runtime_error(err.str()); + // } + // } + + GreenReaperConfig cfg = {}; + cfg.apiVersion = GR_API_VERSION; + cfg.threadCount = thread_count; + cfg.disableCpuAffinity = no_cpu_affinity; + if (!use_gpu_harvesting) { + cfg.gpuRequest = GRGpuRequestKind_None; + } else { + if (enforce_gpu_index) { + cfg.gpuRequest = GRGpuRequestKind_ExactDevice; + } else { + cfg.gpuRequest = GRGpuRequestKind_FirstAvailable; + } + } + cfg.gpuDeviceIndex = gpu_index; + + for (uint32_t i = 0; i < context_count; i++) { + + cfg.cpuOffset = i * thread_count; + GreenReaperContext* gr = nullptr; + auto result = grCreateContext(&gr, &cfg, sizeof(cfg)); + + std::string error_msg; + + if (result == GRResult_OK) { + assert(gr); + queue.push(gr); + + // Preallocate memory required fot the maximum compression level we are supporting initially + result = grPreallocateForCompressionLevel(gr, 32, max_compression_level); + if (result != GRResult_OK) { + std::stringstream err; err << "Failed to preallocate memory for contexts with result " << result; + error_msg = err.str(); + } + } + if (result != GRResult_OK) { + // Destroy contexts that were already created + while (!queue.empty()) { + grDestroyContext( queue.front() ); + queue.pop(); + } + if (error_msg.length() < 1) { + std::stringstream err; err << "Failed to create GRContext with result " << result; + error_msg = err.str(); + } + throw std::runtime_error(error_msg); + } + + if (i == 0 && use_gpu_harvesting) 
{ + if (grHasGpuDecompressor(gr) == GR_TRUE) { + return true; + } else { + // default to CPU + cfg.gpuRequest = GRGpuRequestKind_None; + } + } + } + return false; + } + + void push(GreenReaperContext* gr) { + std::unique_lock lock(mutex); + queue.push(gr); + lock.unlock(); + condition.notify_one(); + } + + GreenReaperContext* pop() { + std::unique_lock lock(mutex); + while (queue.empty()) { + condition.wait(lock); + } + GreenReaperContext* gr = queue.front(); + queue.pop(); + return gr; + } + +private: + std::queue queue; + std::mutex mutex; + std::condition_variable condition; +}; + +class ProofCache { + static constexpr uint32_t MAX_ENTRIES = 64; + + struct Entry { + alignas(16) uint8_t challenge[32]; + uint32_t index; + }; + + uint32_t cache_entry_proof_position = 0; + std::vector challenges; + std::vector full_proofs; + mutable std::mutex lock; + +public: + inline ProofCache() = default; + inline ProofCache(ProofCache&& other) + : cache_entry_proof_position(other.cache_entry_proof_position) + , challenges(std::move(other.challenges)) + , full_proofs(std::move(other.full_proofs)) + { + other.cache_entry_proof_position = 0; + } + + inline ProofCache(ProofCache const& other) = delete; + + inline bool FoundCachedProof(const uint32_t index, const uint8_t* challenge, LargeBits& out_full_proof) { + std::lock_guard l(lock); + + Entry entry; + memcpy(entry.challenge, challenge, sizeof(entry.challenge)); + entry.index = index; + + for (uint32_t i = 0; i < challenges.size(); i++) { + if (memcmp(&challenges[i], &entry, sizeof(Entry)) == 0) { + out_full_proof = full_proofs[i]; + return true; + } + } + return false; + } + + inline void CacheProof(const uint32_t index, const uint8_t* challenge, const LargeBits& full_proof) { + std::lock_guard l(lock); + + Entry entry; + memcpy(entry.challenge, challenge, sizeof(entry.challenge)); + entry.index = index; + + if (challenges.size() < MAX_ENTRIES) { + challenges.emplace_back(entry); + full_proofs.emplace_back(full_proof); + } else { + challenges[cache_entry_proof_position] = entry; + full_proofs[cache_entry_proof_position] = full_proof; + } + cache_entry_proof_position = (cache_entry_proof_position + 1) % MAX_ENTRIES; + } + + static_assert(alignof(ProofCache::Entry) == 16); +}; +#else +// Dummy one for python +class ContextQueue { +public: + inline ContextQueue() {} + + inline bool init( + uint32_t context_count, + uint32_t thread_count, + bool no_cpu_affinity, + const uint32_t max_compression_level, + bool use_gpu_harvesting, + uint32_t gpu_index, + bool enforce_gpu_index + ) + { + return false; + } +}; +#endif // USE_GREEN_REAPER + +ContextQueue decompressor_context_queue; + // The DiskProver, given a correctly formatted plot file, can efficiently generate valid proofs // of space, for a given challenge. @@ -55,6 +287,7 @@ class DiskProver { explicit DiskProver(const std::string& filename) : id(kIdLen) { struct plot_header header{}; + this->compression_level = 0; this->filename = filename; std::ifstream disk_file(filename, std::ios::in | std::ios::binary); @@ -70,27 +303,76 @@ class DiskProver { // 2 bytes - memo length // x bytes - memo - SafeRead(disk_file, (uint8_t*)&header, sizeof(header)); - if (memcmp(header.magic, "Proof of Space Plot", sizeof(header.magic)) != 0) - throw std::invalid_argument("Invalid plot header magic"); + // Check for V2 Magic. 
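        // Header layout, as parsed below:
        //   V2: 4-byte magic "PLOT" (CHIA_PLOT_V2_MAGIC), 4-byte version, 32-byte id,
        //       1-byte k, 2-byte memo length, memo, 4-byte flags and, when bit 0 of
        //       the flags is set, a 1-byte compression level.
        //   V1: 19-byte magic "Proof of Space Plot", 32-byte id, 1-byte k, 2-byte
        //       format-description length, format description, 2-byte memo length, memo.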
+ uint8_t magic_2_bytes[4]; + SafeRead(disk_file, magic_2_bytes, 4); + uint32_t magic_2_result; + memcpy(&magic_2_result, magic_2_bytes, sizeof(magic_2_result)); + if (magic_2_result == CHIA_PLOT_V2_MAGIC) { + uint8_t version_bytes[4]; + SafeRead(disk_file, version_bytes, 4); + uint32_t version_result; + memcpy(&version_result, version_bytes, sizeof(version_result)); + if (version_result == CHIA_PLOT_VERSION_2_0_0) { + version = 2; + } + else { + throw std::invalid_argument("Unsupported version."); + } + } else { + // V1 + version = 1; + memcpy(header.magic, magic_2_bytes, sizeof(magic_2_bytes)); + uint8_t tmp_magic_buff[15]; + SafeRead(disk_file, tmp_magic_buff, sizeof(header.magic) - 4); + memcpy(header.magic + 4, tmp_magic_buff, sizeof(tmp_magic_buff)); + if (memcmp(header.magic, "Proof of Space Plot", sizeof(header.magic)) != 0) { + throw std::invalid_argument("Invalid plot header magic: " + Util::HexStr(header.magic, 19)); + } + } - uint16_t fmt_desc_len = Util::TwoBytesToInt(header.fmt_desc_len); + SafeRead(disk_file, (uint8_t*)&header.id, sizeof(header.id)); + SafeRead(disk_file, (uint8_t*)&header.k, sizeof(header.k)); - if (fmt_desc_len == kFormatDescription.size() && - !memcmp(header.fmt_desc, kFormatDescription.c_str(), fmt_desc_len)) { - // OK - } else { - throw std::invalid_argument("Invalid plot file format"); + if (version == 1) { + SafeRead(disk_file, (uint8_t*)&header.fmt_desc_len, sizeof(header.fmt_desc_len)); + SafeRead(disk_file, (uint8_t*)&header.fmt_desc, sizeof(header.fmt_desc)); + + uint16_t fmt_desc_len = Util::TwoBytesToInt(header.fmt_desc_len); + + if (fmt_desc_len == kFormatDescription.size() && + !memcmp(header.fmt_desc, kFormatDescription.c_str(), fmt_desc_len)) { + // OK + } else { + throw std::invalid_argument("Invalid plot file format"); + } + SafeSeek(disk_file, offsetof(struct plot_header, fmt_desc) + fmt_desc_len); } + memcpy(id.data(), header.id, sizeof(header.id)); this->k = header.k; - SafeSeek(disk_file, offsetof(struct plot_header, fmt_desc) + fmt_desc_len); uint8_t size_buf[2]; SafeRead(disk_file, size_buf, 2); memo.resize(Util::TwoBytesToInt(size_buf)); SafeRead(disk_file, memo.data(), memo.size()); + if (version == 2) { + uint8_t flags_bytes[4]; + SafeRead(disk_file, flags_bytes, sizeof(flags_bytes)); + uint32_t flags; + memcpy(&flags, flags_bytes, sizeof(flags)); + if (flags & 1) { + uint8_t compression_level; + SafeRead(disk_file, &compression_level, sizeof(compression_level)); + this->compression_level = compression_level; + } + } + #if !defined( USE_GREEN_REAPER ) + if (this->compression_level > 0) + throw std::logic_error("Harvester does not support compressed plots."); + #endif + this->table_begin_pointers = std::vector(11, 0); this->C2 = std::vector(); @@ -134,7 +416,7 @@ class DiskProver { { Deserializer deserializer(vecBytes); deserializer >> version; - if (version != VERSION) { + if (version != 1 && version != 2) { // TODO: Migrate to new version if we change something related to the data structure throw std::invalid_argument("DiskProver: Invalid version."); } @@ -144,16 +426,30 @@ class DiskProver { deserializer >> k; deserializer >> table_begin_pointers; deserializer >> C2; + if (version == 2) { + deserializer >> compression_level; + } else { + compression_level = 0; + } + + #if !defined( USE_GREEN_REAPER ) + if (compression_level > 0) + throw std::runtime_error("Harvester does not support compressed plots."); + #endif } DiskProver(DiskProver const&) = delete; DiskProver(DiskProver&& other) noexcept + #if USE_GREEN_REAPER + : 
cached_proofs(std::move(other.cached_proofs)) + #endif { filename = std::move(other.filename); memo = std::move(other.memo); id = std::move(other.id); k = other.k; + compression_level = other.compression_level; table_begin_pointers = std::move(other.table_begin_pointers); C2 = std::move(other.C2); version = std::move(other.version); @@ -180,6 +476,56 @@ class DiskProver { uint8_t GetSize() const noexcept { return k; } + uint8_t GetCompressionLevel() const noexcept { return compression_level; } + + bool CompareProofBits(const LargeBits& left, const LargeBits& right, uint8_t k) + { + uint16_t size = left.GetSize() / k; + assert(left.GetSize() == right.GetSize()); + for (int16_t i = size - 1; i >= 0; i--) { + LargeBits left_val = left.Slice(k * i, k * (i + 1)); + LargeBits right_val = right.Slice(k * i, k * (i + 1)); + if (left_val < right_val) { + return true; + } + if (left_val > right_val) { + return false; + } + } + return false; + } + + LargeBits GetQualityStringFromProof( + LargeBits proof, + const uint8_t* challenge) + { + Bits challenge_bits = Bits(challenge, 256 / 8, 256); + uint16_t quality_index = challenge_bits.Slice(256 - 5).GetValue() << 1; + + // Converts the proof from proof ordering to plot ordering + for (uint8_t table_index = 1; table_index < 7; table_index++) { + LargeBits new_proof; + uint16_t size = k * (1 << (table_index - 1)); + for (int j = 0; j < (1 << (7 - table_index)); j += 2) { + LargeBits L = proof.Slice(j * size, (j + 1) * size); + LargeBits R = proof.Slice((j + 1) * size, (j + 2) * size); + if (CompareProofBits(L, R, k)) { + new_proof += (L + R); + } else { + new_proof += (R + L); + } + } + proof = new_proof; + } + // Hashes two of the x values, based on the quality index + std::vector hash_input(32 + Util::ByteAlign(2 * k) / 8, 0); + memcpy(hash_input.data(), challenge, 32); + proof.Slice(k * quality_index, k * (quality_index + 2)).ToBytes(hash_input.data() + 32); + std::vector hash(picosha2::k_digest_size); + picosha2::hash256(hash_input.begin(), hash_input.end(), hash.begin(), hash.end()); + return LargeBits(hash.data(), 32, 256); + } + // Given a challenge, returns a quality string, which is sha256(challenge + 2 adjecent x // values), from the 64 value proof. Note that this is more efficient than fetching all 64 x // values, which are in different parts of the disk. @@ -187,9 +533,10 @@ class DiskProver { { std::vector qualities; - std::lock_guard l(_mtx); + uint32_t p7_entries_size = 0; { + std::lock_guard l(_mtx); std::ifstream disk_file(filename, std::ios::in | std::ios::binary); if (!disk_file.is_open()) { @@ -203,15 +550,22 @@ class DiskProver { if (p7_entries.empty()) { return std::vector(); } + p7_entries_size = p7_entries.size(); // The last 5 bits of the challenge determine which route we take to get to // our two x values in the leaves. uint8_t last_5_bits = challenge[31] & 0x1f; for (uint64_t position : p7_entries) { + #if USE_GREEN_REAPER + if (compression_level >= 9) { + break; + } + #endif // This inner loop goes from table 6 to table 1, getting the two backpointers, // and following one of them. 
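                // Note: for compressed plots this walk stops at GetEndTable() (table 2 for
                // compression levels 1-8, table 3 for level 9 and up) instead of table 1,
                // since the remaining lookups are handled by the bladebit (GreenReaper)
                // decompressor. The sibling back-pointer is kept in alt_position because
                // levels >= 6 need both line points of the final pair to fetch qualities
                // (see the GRCompressedQualitiesRequest setup below).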
- for (uint8_t table_index = 6; table_index > 1; table_index--) { + uint64_t alt_position; + for (uint8_t table_index = 6; table_index > GetEndTable(); table_index--) { uint128_t line_point = ReadLinePoint(disk_file, table_index, position); auto xy = Encoding::LinePointToSquare(line_point); @@ -219,13 +573,47 @@ class DiskProver { if (((last_5_bits >> (table_index - 2)) & 1) == 0) { position = xy.second; + alt_position = xy.first; } else { position = xy.first; + alt_position = xy.second; } } - uint128_t new_line_point = ReadLinePoint(disk_file, 1, position); - auto x1x2 = Encoding::LinePointToSquare(new_line_point); - + uint128_t new_line_point = ReadLinePoint(disk_file, GetEndTable(), position); + std::pair x1x2; + + #if USE_GREEN_REAPER + if (compression_level > 0) { + GRCompressedQualitiesRequest req; + req.compressionLevel = compression_level; + req.plotId = id.data(); + req.challenge = challenge; + req.xLinePoints[0].hi = (uint64_t)(new_line_point >> 64); + req.xLinePoints[0].lo = (uint64_t)new_line_point; + if (compression_level >= 6) { + uint128_t alt_line_point = ReadLinePoint(disk_file, GetEndTable(), alt_position); + req.xLinePoints[1].hi = (uint64_t)(alt_line_point >> 64); + req.xLinePoints[1].lo = (uint64_t)alt_line_point; + } + + GreenReaperContext* gr = decompressor_context_queue.pop(); + assert(gr); + + auto res = grGetFetchQualitiesXPair(gr, &req); + decompressor_context_queue.push(gr); + + if (res != GRResult_OK) { + // Expect this will result in failure in a later step. + x1x2.first = x1x2.second = 0; + } else { + x1x2.first = req.x1; + x1x2.second = req.x2; + } + } else + #endif // #if USE_GREEN_REAPER + { + x1x2 = Encoding::LinePointToSquare(new_line_point); + } // The final two x values (which are stored in the same location) are hashed std::vector hash_input(32 + Util::ByteAlign(2 * k) / 8, 0); memcpy(hash_input.data(), challenge, 32); @@ -236,6 +624,23 @@ class DiskProver { qualities.emplace_back(hash.data(), 32, 256); } } // Scope for disk_file + + #if USE_GREEN_REAPER + if (compression_level >= 9) { + uint8_t failure_bytes[32]; + for (int i = 0; i < 32; i++) { + failure_bytes[i] = 255; + } + for (uint32_t i = 0; i < p7_entries_size; i++) { + try { + auto proof = GetFullProof(challenge, i); + qualities.push_back(GetQualityStringFromProof(proof, challenge)); + } catch (const std::exception& error) { + qualities.emplace_back(failure_bytes, 32, 256); + } + } + } + #endif return qualities; } @@ -245,6 +650,12 @@ class DiskProver { LargeBits GetFullProof(const uint8_t* challenge, uint32_t index, bool parallel_read = true) { LargeBits full_proof; + + #if USE_GREEN_REAPER + if (compression_level >= 9 && cached_proofs.FoundCachedProof(index, challenge, full_proof)) { + return full_proof; + } + #endif std::lock_guard l(_mtx); { @@ -262,11 +673,55 @@ class DiskProver { // Gets the 64 leaf x values, concatenated together into a k*64 bit string. std::vector xs; if (parallel_read) { - xs = GetInputs(p7_entries[index], 6); + xs = GetInputs(p7_entries[index], 6, nullptr); } else { xs = GetInputs(p7_entries[index], 6, &disk_file); // Passing in a disk_file disabled the parallel reads } + #if USE_GREEN_REAPER + if (compression_level > 0) { + auto gr = decompressor_context_queue.pop(); + + GRCompressedProofRequest req{}; + req.compressionLevel = compression_level; + req.plotId = id.data(); + + uint8_t compressed_proof_size = (compression_level <= 8 ? 
GR_POST_PROOF_CMP_X_COUNT : (GR_POST_PROOF_CMP_X_COUNT / 2)); + for (int i = 0; i < compressed_proof_size; i++) { + req.compressedProof[i] = xs[i].GetValue(); + } + + GRResult res = grFetchProofForChallenge(gr, &req); + decompressor_context_queue.push(gr); + + if (res != GRResult_OK) { + if (res == GRResult_NoProof) { + throw std::runtime_error("GRResult_NoProof received"); + } + if (res == GRResult_Failed) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_Failed"); + } + if (res == GRResult_OutOfMemory) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_OutOfMemory"); + } + if (res == GRResult_WrongVersion) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_WrongVersion"); + } + if (res == GRResult_InvalidGPU) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_InvalidGPU"); + } + if (res == GRResult_InvalidArg) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_InvalidArg"); + } + } + std::vector uncompressed_xs; + for (int i = 0; i < GR_POST_PROOF_X_COUNT; i++) { + uncompressed_xs.push_back(Bits(req.fullProof[i], k)); + } + xs = uncompressed_xs; + } + #endif + // Sorts them according to proof ordering, where // f1(x0) m= f1(x1), f2(x0, x1) m= f2(x2, x3), etc. On disk, they are not stored in // proof ordering, they're stored in plot ordering, due to the sorting in the Compress @@ -276,6 +731,12 @@ class DiskProver { full_proof += x; } } // Scope for disk_file + + #if USE_GREEN_REAPER + if (compression_level >= 9) { + cached_proofs.CacheProof(index, challenge, full_proof); + } + #endif return full_proof; } @@ -283,6 +744,9 @@ class DiskProver { { Serializer serializer; serializer << version << filename << memo << id << k << table_begin_pointers << C2; + if (version == 2) { + serializer << compression_level; + } return serializer.Data(); } @@ -293,8 +757,12 @@ class DiskProver { std::vector memo; std::vector id; // Unique plot id uint8_t k; + uint8_t compression_level; std::vector table_begin_pointers; std::vector C2; + #if USE_GREEN_REAPER + ProofCache cached_proofs; + #endif // Using this method instead of simply seeking will prevent segfaults that would arise when // continuing the process of looking up qualities. @@ -328,14 +796,53 @@ class DiskProver { } } + uint8_t GetEndTable() { + if (compression_level == 0) { + return 1; + } + if (compression_level <= 8) { + return 2; + } + return 3; + } + // Reads exactly one line point (pair of two k bit back-pointers) from the given table. // The entry at index "position" is read. First, the park index is calculated, then // the park is read, and finally, entry deltas are added up to the position that we // are looking for. 
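    // For the final table of a compressed plot (compression_level > 0 and
    // table_index == GetEndTable()), the park geometry (park size, stub width
    // and ANS R value) is obtained from grGetCompressionInfo() rather than from
    // EntrySizes/kRValues; the delta decoding and stub summation below are
    // otherwise unchanged.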
- uint128_t ReadLinePoint(std::ifstream& disk_file, uint8_t table_index, uint64_t position) - { + uint128_t ReadLinePoint( + std::ifstream& disk_file, + uint8_t table_index, + uint64_t position + ) { + size_t compressed_park_size = 0; + uint32_t compressed_stub_size_bits = 0; + double compressed_ans_r_value = 0; + + const bool is_compressed = compression_level > 0 && table_index == GetEndTable(); + (void)is_compressed; + + #if USE_GREEN_REAPER + if (is_compressed) { + GRCompressionInfo info{}; + const auto r = grGetCompressionInfo(&info, sizeof(info), k, compression_level); + if (r != GRResult_OK) { + std::stringstream err; err << "Failed to obtain compression info with error " << r; + throw std::runtime_error(err.str()); + } + + compressed_park_size = info.tableParkSize;; + compressed_stub_size_bits = info.subtSizeBits; + compressed_ans_r_value = info.ansRValue; + } + #else + (void)compressed_stub_size_bits; + (void)compressed_ans_r_value; + (void)compressed_park_size; + #endif + uint64_t park_index = position / kEntriesPerPark; - uint32_t park_size_bits = EntrySizes::CalculateParkSize(k, table_index) * 8; + uint32_t park_size_bits = (is_compressed ? compressed_park_size : EntrySizes::CalculateParkSize(k, table_index)) * 8; SafeSeek(disk_file, table_begin_pointers[table_index] + (park_size_bits / 8) * park_index); @@ -346,12 +853,12 @@ class DiskProver { uint128_t line_point = Util::SliceInt128FromBytes(line_point_bin, 0, k * 2); // Reads EPP stubs - uint32_t stubs_size_bits = EntrySizes::CalculateStubsSize(k) * 8; + uint32_t stubs_size_bits = (is_compressed ? (Util::ByteAlign((kEntriesPerPark - 1) * compressed_stub_size_bits) / 8) : EntrySizes::CalculateStubsSize(k)) * 8; auto* stubs_bin = new uint8_t[stubs_size_bits / 8 + 7]; SafeRead(disk_file, stubs_bin, stubs_size_bits / 8); - // Reads EPP deltas - uint32_t max_deltas_size_bits = EntrySizes::CalculateMaxDeltasSize(k, table_index) * 8; + // Reads EPP deltas + uint32_t max_deltas_size_bits = (is_compressed ? compressed_park_size - (line_point_size + stubs_size_bits) : EntrySizes::CalculateMaxDeltasSize(k, table_index)) * 8; auto* deltas_bin = new uint8_t[max_deltas_size_bits / 8]; // Reads the size of the encoded deltas object @@ -374,13 +881,13 @@ class DiskProver { SafeRead(disk_file, deltas_bin, encoded_deltas_size); // Decodes the deltas - double R = kRValues[table_index - 1]; + double R = (is_compressed ? compressed_ans_r_value : kRValues[table_index - 1]); deltas = Encoding::ANSDecodeDeltas(deltas_bin, encoded_deltas_size, kEntriesPerPark - 1, R); } uint32_t start_bit = 0; - uint8_t stub_size = k - kStubMinusBits; + uint8_t stub_size = (uint8_t)(is_compressed ? compressed_stub_size_bits : k - kStubMinusBits); uint64_t sum_deltas = 0; uint64_t sum_stubs = 0; for (uint32_t i = 0; @@ -724,7 +1231,7 @@ class DiskProver { } std::pair xy = Encoding::LinePointToSquare(line_point); - if (depth == 1) { + if (depth == GetEndTable()) { // For table P1, the line point represents two concatenated x values. std::vector ret; ret.emplace_back(xy.second, k); // y From f7d63f63a24af642f965ed190ce6fe452d624717 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Wed, 9 Aug 2023 08:18:06 -0700 Subject: [PATCH 05/27] Move cibw config to pyproject.toml (#381) Puts most `cibuildwheel` configuration into `pyproject.toml` instead of using envvars in the github actions. 
The rationale is that this makes it a little bit easier to build the wheel locally for debugging and testing as there is less configuration specific to GH actions Note some of the envvars refer to GH action matrix items, so those remain in the GH workflow. --- .github/workflows/build-wheels.yml | 36 ----------------------------- pyproject.toml | 37 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 2694aa89b..83f75ad4c 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -132,46 +132,10 @@ jobs: - name: Build and test env: CIBW_PRERELEASE_PYTHONS: True - CIBW_BUILD_VERBOSITY_MACOS: 1 - CIBW_BUILD_VERBOSITY_LINUX: 1 - CIBW_BUILD_VERBOSITY_WINDOWS: 1 CIBW_BUILD: ${{ matrix.python.cibw-build }} - CIBW_SKIP: '*-manylinux_i686 *-win32 *-musllinux_*' CIBW_MANYLINUX_AARCH64_IMAGE: ${{ matrix.python.manylinux['arm'] }} CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.python.manylinux['intel'] }} - CIBW_ENVIRONMENT_WINDOWS: "CP_USE_GREEN_REAPER=1" - CIBW_ENVIRONMENT_LINUX: CP_USE_GREEN_REAPER="1" - CIBW_BEFORE_ALL_LINUX: | - set -eo pipefail - set -x - # Get bladebit harvester - set -eo pipefail - ARCH=$(uname -m) - if [[ $ARCH == x86_64 ]]; then - .github/actions/fetch_bladebit_harvester.sh linux x86-64 - else - .github/actions/fetch_bladebit_harvester.sh linux arm64 - fi - - CIBW_BEFORE_BUILD_LINUX: > - python -m pip install --upgrade pip CIBW_ARCHS_MACOS: ${{ matrix.os.cibw-archs-macos[matrix.arch.matrix] }} - CIBW_BEFORE_ALL_MACOS: | - brew install gmp boost cmake - # Get bladebit harvester - set -eo pipefail - ARCH=$(uname -m) - if [[ $ARCH == x86_64 ]]; then - .github/actions/fetch_bladebit_harvester.sh macos x86-64 - else - .github/actions/fetch_bladebit_harvester.sh macos arm64 - fi - - CIBW_BEFORE_BUILD_MACOS: > - python -m pip install --upgrade pip - CIBW_ENVIRONMENT_MACOS: "MACOSX_DEPLOYMENT_TARGET=10.14 CP_USE_GREEN_REAPER=1" - CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: py.test -v {project}/tests run: pipx run --spec='cibuildwheel==2.11.2' cibuildwheel --output-dir dist 2>&1 diff --git a/pyproject.toml b/pyproject.toml index 7c1146b6b..471d873c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,3 +4,40 @@ build-backend = "setuptools.build_meta" [tool.setuptools_scm] local_scheme = "no-local-version" + +[tool.cibuildwheel] +test-requires = "pytest" +test-command = "pytest -v {project}/tests" +skip = "*-manylinux_i686 *-win32 *-musllinux_*" + +[tool.cibuildwheel.linux] +build-verbosity = 1 +environment = {CP_USE_GREEN_REAPER=1} +before-all = ''' +set -eo pipefail && set -x && set -eo pipefail && ARCH=$(uname -m) +if [[ $ARCH == x86_64 ]]; then +.github/actions/fetch_bladebit_harvester.sh linux x86-64 +else +.github/actions/fetch_bladebit_harvester.sh linux arm64 +fi +''' +before-build = "python -m pip install --upgrade pip" + +[tool.cibuildwheel.macos] +build-verbosity = 1 +before-all = ''' +brew install gmp boost cmake +set -eo pipefail +ARCH=$(uname -m) +if [[ $ARCH == x86_64 ]]; then +.github/actions/fetch_bladebit_harvester.sh macos x86-64 +else +.github/actions/fetch_bladebit_harvester.sh macos arm64 +fi +''' +before-build = "python -m pip install --upgrade pip" +environment = {MACOSX_DEPLOYMENT_TARGET="11", SYSTEM_VERSION_COMPAT=0, CP_USE_GREEN_REAPER=1} + +[tool.cibuildwheel.windows] +build-verbosity = 1 +environment = "CP_USE_GREEN_REAPER=1" \ No newline at end of file From dd5bcd76fc05bcc0b356c0711e6f7dfad307635c Mon Sep 
17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Wed, 16 Aug 2023 09:02:12 -0700 Subject: [PATCH 06/27] Use ubuntu 22 runners (#386) Use ubuntu-22 runners for the valgrind, ASAN, and TSAN tests - to avoid some strange new issue with ubuntu-20. Generally it doesn't matter much where we run these memory fuzzers. --- .github/workflows/build-test-cplusplus.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-test-cplusplus.yml b/.github/workflows/build-test-cplusplus.yml index 6794730fc..8d16ad3e5 100644 --- a/.github/workflows/build-test-cplusplus.yml +++ b/.github/workflows/build-test-cplusplus.yml @@ -9,7 +9,7 @@ concurrency: jobs: valgrind: name: valgrind ubuntu - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout code uses: actions/checkout@v3 @@ -27,7 +27,7 @@ jobs: asan: name: ASAN ubuntu - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout code uses: actions/checkout@v3 @@ -48,7 +48,7 @@ jobs: tsan: name: TSAN ubuntu - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout code uses: actions/checkout@v3 From 08b7b954733e72ea5098eff3e539412481b28d7e Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Wed, 16 Aug 2023 15:04:51 -0400 Subject: [PATCH 07/27] build wheels for macOS 10.14 for now (#383) https://github.com/Chia-Network/chia-blockchain/pull/16047#pullrequestreview-1579705035 Co-authored-by: Earle Lowe --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 471d873c5..9dfa11382 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,8 +36,8 @@ else fi ''' before-build = "python -m pip install --upgrade pip" -environment = {MACOSX_DEPLOYMENT_TARGET="11", SYSTEM_VERSION_COMPAT=0, CP_USE_GREEN_REAPER=1} +environment = {MACOSX_DEPLOYMENT_TARGET="10.14", SYSTEM_VERSION_COMPAT=0, CP_USE_GREEN_REAPER=1} [tool.cibuildwheel.windows] build-verbosity = 1 -environment = "CP_USE_GREEN_REAPER=1" \ No newline at end of file +environment = "CP_USE_GREEN_REAPER=1" From b68e114554ad5ca2e8860264eb125515f4e85cca Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Wed, 16 Aug 2023 16:29:27 -0400 Subject: [PATCH 08/27] tidy pyproject.toml (#384) --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9dfa11382..1f5b6faef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,9 +16,9 @@ environment = {CP_USE_GREEN_REAPER=1} before-all = ''' set -eo pipefail && set -x && set -eo pipefail && ARCH=$(uname -m) if [[ $ARCH == x86_64 ]]; then -.github/actions/fetch_bladebit_harvester.sh linux x86-64 + .github/actions/fetch_bladebit_harvester.sh linux x86-64 else -.github/actions/fetch_bladebit_harvester.sh linux arm64 + .github/actions/fetch_bladebit_harvester.sh linux arm64 fi ''' before-build = "python -m pip install --upgrade pip" @@ -30,9 +30,9 @@ brew install gmp boost cmake set -eo pipefail ARCH=$(uname -m) if [[ $ARCH == x86_64 ]]; then -.github/actions/fetch_bladebit_harvester.sh macos x86-64 + .github/actions/fetch_bladebit_harvester.sh macos x86-64 else -.github/actions/fetch_bladebit_harvester.sh macos arm64 + .github/actions/fetch_bladebit_harvester.sh macos arm64 fi ''' before-build = "python -m pip install --upgrade pip" From 94f6a5946dcd2bf7e0785c670596868e4d3b7559 Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Wed, 16 Aug 2023 16:30:39 -0400 Subject: [PATCH 09/27] use consistent workflow triggers (#385) --- 
.github/workflows/build-test-cplusplus.yml | 10 +++++++++- .github/workflows/plot-k27-no-bitfield.yaml | 10 +++++++++- .github/workflows/plot-k27.yaml | 10 +++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-test-cplusplus.yml b/.github/workflows/build-test-cplusplus.yml index 8d16ad3e5..fce3e528b 100644 --- a/.github/workflows/build-test-cplusplus.yml +++ b/.github/workflows/build-test-cplusplus.yml @@ -1,6 +1,14 @@ name: Build and Test C++ -on: [push, pull_request] +on: + push: + branches: + - main + release: + types: [published] + pull_request: + branches: + - '**' concurrency: group: ${{ github.ref }}-${{ github.workflow }}-${{ github.event_name }}--${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/heads/long_lived/')) && github.sha || '' }} diff --git a/.github/workflows/plot-k27-no-bitfield.yaml b/.github/workflows/plot-k27-no-bitfield.yaml index 133b97c8b..943da131c 100644 --- a/.github/workflows/plot-k27-no-bitfield.yaml +++ b/.github/workflows/plot-k27-no-bitfield.yaml @@ -1,6 +1,14 @@ name: Plot k=27 without bitfield -on: [push, pull_request] +on: + push: + branches: + - main + release: + types: [published] + pull_request: + branches: + - '**' concurrency: group: ${{ github.ref }}-${{ github.workflow }}-${{ github.event_name }}--${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/heads/long_lived/')) && github.sha || '' }} diff --git a/.github/workflows/plot-k27.yaml b/.github/workflows/plot-k27.yaml index 5b81906a8..bdaffdae8 100644 --- a/.github/workflows/plot-k27.yaml +++ b/.github/workflows/plot-k27.yaml @@ -1,6 +1,14 @@ name: Plot k=27 -on: [push, pull_request] +on: + push: + branches: + - main + release: + types: [published] + pull_request: + branches: + - '**' concurrency: group: ${{ github.ref }}-${{ github.workflow }}-${{ github.event_name }}--${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/heads/long_lived/')) && github.sha || '' }} From 10c85837e52c4a4905df00332a15c0d50e61a8f3 Mon Sep 17 00:00:00 2001 From: Florin Chirica Date: Wed, 16 Aug 2023 23:32:59 +0300 Subject: [PATCH 10/27] Add timeout for context queue pop. 
(#382) Co-authored-by: Earle Lowe --- src/cli.cpp | 2 +- src/prover_disk.hpp | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/cli.cpp b/src/cli.cpp index 526a72140..8f96d7364 100644 --- a/src/cli.cpp +++ b/src/cli.cpp @@ -70,7 +70,7 @@ inline void InitDecompressorQueueDefault(bool no_cuda = false) if (initialized) { return; } - decompressor_context_queue.init(1, (uint32_t)std::thread::hardware_concurrency(), false, 9, !no_cuda, 0, false); + decompressor_context_queue.init(1, (uint32_t)std::thread::hardware_concurrency(), false, 9, !no_cuda, 0, false, 30); initialized = true; } diff --git a/src/prover_disk.hpp b/src/prover_disk.hpp index 93a52b697..abc3adae6 100644 --- a/src/prover_disk.hpp +++ b/src/prover_disk.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include "../lib/include/picosha2.hpp" #include "calculate_bucket.hpp" @@ -67,7 +68,8 @@ class ContextQueue { const uint32_t max_compression_level, bool use_gpu_harvesting, uint32_t gpu_index, - bool enforce_gpu_index + bool enforce_gpu_index, + uint16_t context_queue_timeout ) { assert(!_dcompressor_queue_initialized); _dcompressor_queue_initialized = true; @@ -167,6 +169,7 @@ class ContextQueue { } } } + this->context_queue_timeout = context_queue_timeout; return false; } @@ -179,9 +182,24 @@ class ContextQueue { GreenReaperContext* pop() { std::unique_lock lock(mutex); - while (queue.empty()) { - condition.wait(lock); + + std::chrono::duration wait_time = std::chrono::seconds(context_queue_timeout); + + while (queue.empty() && wait_time.count() > 0) { + auto before_wait = std::chrono::steady_clock::now(); + + if (condition.wait_for(lock, wait_time) == std::cv_status::timeout) { + break; + } + + auto elapsed = std::chrono::duration_cast>(std::chrono::steady_clock::now() - before_wait); + wait_time -= elapsed; + } + + if (queue.empty()) { + throw std::runtime_error("Timeout waiting for context queue."); } + GreenReaperContext* gr = queue.front(); queue.pop(); return gr; @@ -191,6 +209,7 @@ class ContextQueue { std::queue queue; std::mutex mutex; std::condition_variable condition; + uint16_t context_queue_timeout; }; class ProofCache { @@ -266,7 +285,8 @@ class ContextQueue { const uint32_t max_compression_level, bool use_gpu_harvesting, uint32_t gpu_index, - bool enforce_gpu_index + bool enforce_gpu_index, + uint16_t context_queue_timeout ) { return false; From 37358df5431d2c4ab5148ba8e19ad214d7ada791 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Thu, 17 Aug 2023 13:21:45 -0700 Subject: [PATCH 11/27] Fix bug where timeout wasn't getting set for GPU harvesting (#389) The context timeout was being set after the function was already returning for GPU harvesting. 
GPU harvesting was short-circuiting and returning before this was being set Make sure to configure the timeout earlier --- src/prover_disk.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/prover_disk.hpp b/src/prover_disk.hpp index abc3adae6..1e1ef2944 100644 --- a/src/prover_disk.hpp +++ b/src/prover_disk.hpp @@ -127,6 +127,7 @@ class ContextQueue { } } cfg.gpuDeviceIndex = gpu_index; + this->context_queue_timeout = context_queue_timeout; for (uint32_t i = 0; i < context_count; i++) { @@ -169,7 +170,7 @@ class ContextQueue { } } } - this->context_queue_timeout = context_queue_timeout; + return false; } From 24851ebfea527e2ce5546794e7e6042fe0ca9571 Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 18 Aug 2023 18:02:49 -0500 Subject: [PATCH 12/27] Add pypi trusted publishing (#391) --- .github/workflows/build-wheels.yml | 41 +++++++++++------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 83f75ad4c..c4166491b 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -15,6 +15,10 @@ concurrency: group: ${{ github.ref }}-${{ github.workflow }}-${{ github.event_name }}-${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/heads/long_lived/')) && github.sha || '' }} cancel-in-progress: true +permissions: + contents: read + id-token: write + jobs: build-wheels: name: Wheel - ${{ matrix.os.name }} ${{ matrix.python.major-dot-minor }} ${{ matrix.arch.name }} @@ -277,32 +281,17 @@ jobs: name: packages path: ./dist - - name: Test for secrets access - id: check_secrets - shell: bash - run: | - unset HAS_SECRET - if [ -n "$SECRET" ]; then HAS_SECRET='true' ; fi - echo "HAS_SECRET=${HAS_SECRET}" >>$GITHUB_OUTPUT - env: - SECRET: "${{ secrets.test_pypi_password }}" - - - name: Install twine - run: pip install twine - - name: Publish distribution to PyPI - if: env.RELEASE == 'true' && steps.check_secrets.outputs.HAS_SECRET - env: - TWINE_USERNAME: __token__ - TWINE_NON_INTERACTIVE: 1 - TWINE_PASSWORD: ${{ secrets.pypi_password }} - run: twine upload --non-interactive --skip-existing --verbose 'dist/*' + if: env.RELEASE == 'true' + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ + skip-existing: true - name: Publish distribution to Test PyPI - if: env.PRE_RELEASE == 'true' && steps.check_secrets.outputs.HAS_SECRET - env: - TWINE_REPOSITORY_URL: https://test.pypi.org/legacy/ - TWINE_USERNAME: __token__ - TWINE_NON_INTERACTIVE: 1 - TWINE_PASSWORD: ${{ secrets.test_pypi_password }} - run: twine upload --non-interactive --skip-existing --verbose 'dist/*' + if: env.PRE_RELEASE == 'true' + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + packages-dir: dist/ + skip-existing: true From a383a6354b4d9bf0805deb2a4c2b91351bab33e5 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Thu, 24 Aug 2023 14:38:22 -0700 Subject: [PATCH 13/27] Set macOS target to 11 (#392) Set the minimum version for macOS to 11 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1f5b6faef..966ac339d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ else fi ''' before-build = "python -m pip install --upgrade pip" -environment = {MACOSX_DEPLOYMENT_TARGET="10.14", SYSTEM_VERSION_COMPAT=0, 
CP_USE_GREEN_REAPER=1} +environment = {MACOSX_DEPLOYMENT_TARGET="11", SYSTEM_VERSION_COMPAT=0, CP_USE_GREEN_REAPER=1} [tool.cibuildwheel.windows] build-verbosity = 1 From 5bb9e10f0c8a4d0475bbd7fc0819a3061463335a Mon Sep 17 00:00:00 2001 From: Gene Hoffman <30377676+hoffmang9@users.noreply.github.com> Date: Fri, 1 Sep 2023 17:34:02 -0700 Subject: [PATCH 14/27] Update to pybind v2.11.1 Cmake is deprecating the way pybind finds the python interpreter. See https://cmake.org/cmake/help/latest/policy/CMP0148.html and pybind/pybind11#4719 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 207aacc8e..08b38ce7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ else() FetchContent_Declare( pybind11-src GIT_REPOSITORY https://github.com/pybind/pybind11.git - GIT_TAG v2.10.0 + GIT_TAG v2.11.1 ) FetchContent_MakeAvailable(pybind11-src) endif() From 7f1f011788742c5e59d3b18fc007265bd8787dea Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Thu, 7 Sep 2023 09:55:15 -0700 Subject: [PATCH 15/27] adjust use of setuptools to support version 68.2 (#397) --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 244e2ae56..4531a6957 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ import platform import subprocess -from setuptools import setup, setuptools, Extension +from setuptools import setup, errors, Extension from setuptools.command.build_ext import build_ext from distutils.version import LooseVersion @@ -134,7 +134,7 @@ def has_flag(compiler, flagname): f.write("int main (int argc, char **argv) { return 0; }") try: compiler.compile([f.name], extra_postargs=[flagname]) - except setuptools.distutils.errors.CompileError: + except errors.CompileError: return False return True From f569a9913da11c5c127585a5c1a4f381d186a430 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:46:26 -0700 Subject: [PATCH 16/27] Add riscv64 info and simplify macos (#394) Simplify macos checks by just using CMAKE_SYSTME_PROCESSOR and add in riscv64 --------- Co-authored-by: Chris Marslender --- .github/workflows/build-test-riscv64.yml | 59 ++++++++++++++++++++++++ CMakeLists.txt | 11 +---- 2 files changed, 60 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/build-test-riscv64.yml diff --git a/.github/workflows/build-test-riscv64.yml b/.github/workflows/build-test-riscv64.yml new file mode 100644 index 000000000..2a010b7fd --- /dev/null +++ b/.github/workflows/build-test-riscv64.yml @@ -0,0 +1,59 @@ +name: Build and test riscv64 on ubuntu-latest + +on: + push: + branches: + - main + - dev + tags: + - '**' + pull_request: + branches: + - '**' + +jobs: + build: + timeout-minutes: 60 + name: QEMU riscv64 via Debian on ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest ] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: Set up QEMU on x86_64 + if: startsWith(matrix.os, 'ubuntu-latest') + id: qemu + uses: docker/setup-qemu-action@v2 + with: + platforms: riscv64 +# image: tonistiigi/binfmt:latest + + - name: Build and Test + run: | + docker run --rm --platform linux/riscv64 \ + -v ${{ github.workspace }}:/ws --workdir=/ws \ + riscv64/debian:rc-buggy \ + bash -exc '\ + apt-get update && \ + apt-get install -y cmake build-essential git python3-full python3-pip python3-dev && \ + cmake --version 
&& \ + uname -a && \ + export CP_USE_GREEN_REAPER=0 && \ + pip wheel -w dist . && \ + python3 -m venv venv && \ + ./venv/bin/python -m pip install dist/*.whl && \ + ./venv/bin/python -m pip install pytest && \ + ./venv/bin/python -m pytest -k "not k_21" -v tests/ + ' + + - name: Upload artifacts + uses: actions/upload-artifact@v3 + with: + name: packages + path: ./dist diff --git a/CMakeLists.txt b/CMakeLists.txt index 207aacc8e..16f37ec77 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,15 +123,6 @@ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -fno-omit-frame-pointer -fsanitize=thre set (CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=thread") ENDIF() -IF (APPLE) -# on macOS "uname -m" returns the architecture (x86_64 or arm64) -execute_process( - COMMAND uname -m - RESULT_VARIABLE result - OUTPUT_VARIABLE OSX_NATIVE_ARCHITECTURE - OUTPUT_STRIP_TRAILING_WHITESPACE) -ENDIF() - IF (WIN32) set(BLAKE3_SRC src/b3/blake3.c @@ -141,7 +132,7 @@ set(BLAKE3_SRC src/b3/blake3_avx512.c src/b3/blake3_sse41.c ) -ELSEIF(OSX_NATIVE_ARCHITECTURE STREQUAL "arm64") +ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64|arm64") set(BLAKE3_SRC src/b3/blake3.c src/b3/blake3_portable.c From 2ff03d5eeb5db46d8740ae3568662be44535a780 Mon Sep 17 00:00:00 2001 From: William Allen Date: Mon, 25 Sep 2023 13:32:28 -0500 Subject: [PATCH 17/27] Use green_reaper v3.1.0-rc2 (#398) Co-authored-by: Kyle Altendorf --- .github/actions/fetch_bladebit_harvester.sh | 14 +++++++------- src/prover_disk.hpp | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/actions/fetch_bladebit_harvester.sh b/.github/actions/fetch_bladebit_harvester.sh index a757941b0..040ca1514 100755 --- a/.github/actions/fetch_bladebit_harvester.sh +++ b/.github/actions/fetch_bladebit_harvester.sh @@ -22,14 +22,14 @@ if [[ "${host_arch}" != "arm64" ]] && [[ "${host_arch}" != "x86-64" ]]; then fi ## Change this if including a new bladebit release -artifact_ver="v3.0.0" -artifact_base_url="https://github.com/Chia-Network/bladebit/releases/download/v3.0.0" +artifact_ver="v3.1.0-rc2" +artifact_base_url="https://github.com/Chia-Network/bladebit/releases/download/v3.1.0-rc2" -linux_arm_sha256="5a53d82c2cc22172bfa2267ea9fb53126a527eba7bafd03ddf5503913a61f70c" -linux_x86_sha256="3cdbcf127126d7c61f6da715b25ef73a8420778dd34d56e82ed1865d7a1ebfeb" -macos_arm_sha256="325150951e83be4ee8690be996e6fde0776ff4cca89e39111c97f0aae3f93bf3" -macos_x86_sha256="718ab50a19ea3a8f064f2d09df38720cbb7d89667b599f57f2bca3cdf41c18e9" -windows_sha256="f3e14d02daafaa8e3666ab4666220d3b2859b1c10254bddb38a69da83cc899c5" +linux_arm_sha256="9377126d1bf3e820dd3a4de3a28b3ed1309854b1e808c450ec0414b9b1193990" +linux_x86_sha256="e3fce120863ef8a01e664723d7675770058db0be1d1a389965cc856d2c18277f" +macos_arm_sha256="59eefcbe88c29c83363fb1f16167c921b512aa3ddac8f7eff32d28da3ff3cba1" +macos_x86_sha256="8b41cbbbb30c9bb3e5e718fd578291a244eb3582d82a496e0c8049896c2599d8" +windows_sha256="e591e54993b35a371bd46c93c51c08e5d290494e30dd724e65fffd31d98d32ed" ## End changes artifact_ext="tar.gz" diff --git a/src/prover_disk.hpp b/src/prover_disk.hpp index 1e1ef2944..6326e3480 100644 --- a/src/prover_disk.hpp +++ b/src/prover_disk.hpp @@ -853,7 +853,7 @@ class DiskProver { } compressed_park_size = info.tableParkSize;; - compressed_stub_size_bits = info.subtSizeBits; + compressed_stub_size_bits = info.stubSizeBits; compressed_ans_r_value = info.ansRValue; } #else From a12e2bd72d79c01d806e903dff3b2a0106fdcd69 Mon Sep 17 00:00:00 2001 From: Harold Brenes Date: Tue, 3 Oct 
2023 18:41:12 -0400 Subject: [PATCH 18/27] Track GreenReaper v3.1.0 --- .github/actions/fetch_bladebit_harvester.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/actions/fetch_bladebit_harvester.sh b/.github/actions/fetch_bladebit_harvester.sh index 040ca1514..d55d35a09 100755 --- a/.github/actions/fetch_bladebit_harvester.sh +++ b/.github/actions/fetch_bladebit_harvester.sh @@ -22,14 +22,14 @@ if [[ "${host_arch}" != "arm64" ]] && [[ "${host_arch}" != "x86-64" ]]; then fi ## Change this if including a new bladebit release -artifact_ver="v3.1.0-rc2" -artifact_base_url="https://github.com/Chia-Network/bladebit/releases/download/v3.1.0-rc2" +artifact_ver="v3.1.0" +artifact_base_url="https://github.com/Chia-Network/bladebit/releases/download/${artifact_ver}" -linux_arm_sha256="9377126d1bf3e820dd3a4de3a28b3ed1309854b1e808c450ec0414b9b1193990" -linux_x86_sha256="e3fce120863ef8a01e664723d7675770058db0be1d1a389965cc856d2c18277f" -macos_arm_sha256="59eefcbe88c29c83363fb1f16167c921b512aa3ddac8f7eff32d28da3ff3cba1" -macos_x86_sha256="8b41cbbbb30c9bb3e5e718fd578291a244eb3582d82a496e0c8049896c2599d8" -windows_sha256="e591e54993b35a371bd46c93c51c08e5d290494e30dd724e65fffd31d98d32ed" +linux_arm_sha256="e8fc09bb5862ce3d029b78144ea46791afe14a2d640390605b6df28fb420e782" +linux_x86_sha256="e31e5226d1e4a399f4181bb2cca243d46218305a8b54912ef29c791022ac079d" +macos_arm_sha256="03958b94ad9d01de074b5a9a9d86a51bd2246c0eab5529c5886bb4bbc4168e0b" +macos_x86_sha256="14975aabfb3d906e22e04cd973be4265f9c5c61e67a92f890c8e51cf9edf0c87" +windows_sha256="ccf115ebec18413c3134f9ca37945f30f4f02d6766242c7e84a6df0d1d989a69" ## End changes artifact_ext="tar.gz" From 2de242041774cf59a561b6949a40519353090c79 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Fri, 20 Oct 2023 17:50:46 -0700 Subject: [PATCH 19/27] Use CNI riscv docker image (#403) Use CNI riscv docker image based on Ubuntu --- .github/workflows/build-test-riscv64.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/build-test-riscv64.yml b/.github/workflows/build-test-riscv64.yml index 2a010b7fd..58e424218 100644 --- a/.github/workflows/build-test-riscv64.yml +++ b/.github/workflows/build-test-riscv64.yml @@ -38,10 +38,8 @@ jobs: run: | docker run --rm --platform linux/riscv64 \ -v ${{ github.workspace }}:/ws --workdir=/ws \ - riscv64/debian:rc-buggy \ + chianetwork/ubuntu-22.04-risc-builder:latest \ bash -exc '\ - apt-get update && \ - apt-get install -y cmake build-essential git python3-full python3-pip python3-dev && \ cmake --version && \ uname -a && \ export CP_USE_GREEN_REAPER=0 && \ From df636fe75e34520272dae6754ba62f09ca1a3445 Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Sat, 21 Oct 2023 10:58:46 -0500 Subject: [PATCH 20/27] Swap blake3 to FetchContent + general cleanup (#402) --- .github/workflows/build-test-cplusplus.yml | 23 +- .github/workflows/build-wheels.yml | 2 +- .gitignore | 1 + CMakeLists.txt | 78 +- setup.py | 175 +- src/b3/blake3.c | 598 ----- src/b3/blake3.h | 56 - src/b3/blake3_avx2.c | 325 --- src/b3/blake3_avx2_x86-64_unix.S | 1802 -------------- src/b3/blake3_avx512.c | 1204 --------- src/b3/blake3_avx512_x86-64_unix.S | 2572 -------------------- src/b3/blake3_dispatch.c | 245 -- src/b3/blake3_impl.h | 235 -- src/b3/blake3_portable.c | 168 -- src/b3/blake3_sse41.c | 559 ----- src/b3/blake3_sse41_x86-64_unix.S | 2014 --------------- src/calculate_bucket.hpp | 2 +- tests/test.cpp | 4 +- 18 files changed, 85 
insertions(+), 9978 deletions(-) delete mode 100644 src/b3/blake3.c delete mode 100644 src/b3/blake3.h delete mode 100644 src/b3/blake3_avx2.c delete mode 100644 src/b3/blake3_avx2_x86-64_unix.S delete mode 100644 src/b3/blake3_avx512.c delete mode 100644 src/b3/blake3_avx512_x86-64_unix.S delete mode 100644 src/b3/blake3_dispatch.c delete mode 100644 src/b3/blake3_impl.h delete mode 100644 src/b3/blake3_portable.c delete mode 100644 src/b3/blake3_sse41.c delete mode 100644 src/b3/blake3_sse41_x86-64_unix.S diff --git a/.github/workflows/build-test-cplusplus.yml b/.github/workflows/build-test-cplusplus.yml index fce3e528b..c0b87a2f3 100644 --- a/.github/workflows/build-test-cplusplus.yml +++ b/.github/workflows/build-test-cplusplus.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests, and valgrind on ubuntu-20.04 run: | @@ -38,7 +38,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with address- and undefined sanitizer on Ubuntu run: | @@ -59,7 +59,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with thread sanitizer on Ubuntu run: | @@ -69,12 +69,27 @@ jobs: cmake --build . -- -j 6 TSAN_OPTIONS="memory_limit_mb=6000" ./RunTests + mac: + name: MacOS + runs-on: macos-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: cmake, RunTests on Mac + run: | + mkdir build + cd build + cmake .. + cmake --build . --config Release -j 6 + ./RunTests + windows: name: Windows Latest runs-on: windows-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with Windows run: | diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index c4166491b..43cc4f246 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -141,7 +141,7 @@ jobs: CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.python.manylinux['intel'] }} CIBW_ARCHS_MACOS: ${{ matrix.os.cibw-archs-macos[matrix.arch.matrix] }} run: - pipx run --spec='cibuildwheel==2.11.2' cibuildwheel --output-dir dist 2>&1 + pipx run --spec='cibuildwheel==2.16.2' cibuildwheel --output-dir dist 2>&1 - name: Upload artifacts uses: actions/upload-artifact@v3 diff --git a/.gitignore b/.gitignore index 698734681..956d66bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.idea/ ProofOfSpace RunTests HellmanAttacks diff --git a/CMakeLists.txt b/CMakeLists.txt index 16f37ec77..23e288e70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,6 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(cxxopts) - option(CP_LINK_BLADEBIT_HARVESTER "Links libbladebit_harvester at build time instead of dynamically loading it." 
OFF) option(CP_BUILD_BLADEBIT_HARVESTER "Pulls bladebit harvester target from git and builds it as a dependency.") @@ -123,38 +122,35 @@ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -fno-omit-frame-pointer -fsanitize=thre set (CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=thread") ENDIF() -IF (WIN32) -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c - src/b3/blake3_avx2.c - src/b3/blake3_avx512.c - src/b3/blake3_sse41.c -) -ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64|arm64") -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c +pybind11_add_module(chiapos ${CMAKE_CURRENT_SOURCE_DIR}/python-bindings/chiapos.cpp src/chacha8.c) +add_executable(ProofOfSpace + src/cli.cpp + src/chacha8.c ) -ELSE() -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c - src/b3/blake3_avx2_x86-64_unix.S - src/b3/blake3_avx512_x86-64_unix.S - src/b3/blake3_sse41_x86-64_unix.S + +FetchContent_Declare( + blake3 + GIT_REPOSITORY https://github.com/BLAKE3-team/BLAKE3.git + GIT_TAG 1.5.0 ) -ENDIF() -pybind11_add_module(chiapos ${CMAKE_CURRENT_SOURCE_DIR}/python-bindings/chiapos.cpp src/chacha8.c ${BLAKE3_SRC}) +FetchContent_GetProperties(blake3) +if(NOT blake3_POPULATED) + FetchContent_Populate(blake3) -add_executable(ProofOfSpace - src/cli.cpp - src/chacha8.c - ${BLAKE3_SRC} + # Set BLAKE3 to build as a static library + set(BUILD_SHARED_LIBS FALSE CACHE BOOL "Build static libraries" FORCE) + + add_subdirectory(${blake3_SOURCE_DIR}/c ${blake3_BINARY_DIR}) +endif() + +set(BLAKE3_SRC ${blake3_SOURCE_DIR}/c) +set(BLAKE3_INCLUDE_DIR ${blake3_SOURCE_DIR}/c) +target_link_libraries(chiapos PRIVATE blake3) +target_link_libraries(ProofOfSpace PRIVATE blake3) +include_directories( + ${INCLUDE_DIRECTORIES} + ${BLAKE3_INCLUDE_DIR} ) option(BUILD_PROOF_OF_SPACE_STATICALLY "Build ProofOfSpace target statically" OFF) @@ -173,7 +169,14 @@ FetchContent_MakeAvailable(Catch2) add_executable(RunTests tests/test.cpp src/chacha8.c - ${BLAKE3_SRC} +) + +target_link_libraries(RunTests + PRIVATE + fse + Threads::Threads + Catch2::Catch2 + blake3 ) find_package(Threads REQUIRED) @@ -231,12 +234,21 @@ if (${CP_LINK_BLADEBIT_HARVESTER}) target_link_directories(chiapos PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) target_link_directories(ProofOfSpace PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) target_link_directories(RunTests PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) - + set_property(TARGET chiapos APPEND PROPERTY BUILD_RPATH "$ORIGIN") set_property(TARGET ProofOfSpace APPEND PROPERTY BUILD_RPATH "$ORIGIN") set_property(TARGET RunTests APPEND PROPERTY BUILD_RPATH "$ORIGIN") + + if (WIN32) + add_custom_command(TARGET chiapos POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${CMAKE_SOURCE_DIR}/libs/green_reaper/lib/bladebit_harvester.dll" + "$/bladebit_harvester.dll" + ) + message("The bladebit dll was copied to: $/bladebit_harvester.dll") + endif() endif() -#enable_testing() -#add_test(NAME RunTests COMMAND RunTests) +enable_testing() +add_test(NAME RunTests COMMAND RunTests) diff --git a/setup.py b/setup.py index 4531a6957..5f73900c8 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,11 @@ #!/usr/bin/python3 import os import re -import shutil import sys import platform import subprocess -from setuptools import setup, errors, Extension +from setuptools import setup, Extension from setuptools.command.build_ext import build_ext from distutils.version import LooseVersion @@ -76,160 +75,18 
@@ def build_extension(self, ext): ) -class get_pybind_include(object): - """Helper class to determine the pybind11 include path - - The purpose of this class is to postpone importing pybind11 - until it is actually installed, so that the ``get_include()`` - method can be invoked.""" - - def __init__(self, user=False): - self.user = user - - def __str__(self): - import pybind11 - - return pybind11.get_include(self.user) - - -ext_modules = [ - Extension( - "chiapos", - [ - "lib/FiniteStateEntropy/lib/entropy_common.c", - "lib/FiniteStateEntropy/lib/fse_compress.c", - "lib/FiniteStateEntropy/lib/fse_decompress.c", - "lib/FiniteStateEntropy/lib/hist.c", - "python-bindings/chiapos.cpp", - "uint128_t/uint128_t.cpp", - "src/b3/blake3.c", - "src/b3/blake3_portable.c", - "src/b3/blake3_dispatch.c", - "src/b3/blake3_avx2.c", - "src/b3/blake3_avx512.c", - "src/b3/blake3_sse41.c", - "src/chacha8.c", - ], - include_dirs=[ - # Path to pybind11 headers - get_pybind_include(), - get_pybind_include(user=True), - "src", - "uint128_t", - ".", - ], - ), -] - - -# As of Python 3.6, CCompiler has a `has_flag` method. -# cf http://bugs.python.org/issue26689 -def has_flag(compiler, flagname): - """Return a boolean indicating whether a flag name is supported on - the specified compiler. - """ - import tempfile - - with tempfile.NamedTemporaryFile("w", suffix=".cpp") as f: - f.write("int main (int argc, char **argv) { return 0; }") - try: - compiler.compile([f.name], extra_postargs=[flagname]) - except errors.CompileError: - return False - return True - - -def cpp_flag(compiler): - """Return the -std=c++[11/14/17] compiler flag. - - The newer version is prefered over c++11 (when it is available). - """ - flags = ["-std=c++17", "-std=c++14", "-std=c++11"] - - for flag in flags: - if has_flag(compiler, flag): - return flag - - raise RuntimeError("Unsupported compiler -- at least C++11 support " "is needed!") - - -class BuildExt(build_ext): - """A custom build extension for adding compiler-specific options.""" - - c_opts = { - "msvc": ["/EHsc", "/std:c++17", "/O2"], - "unix": [""], - } - l_opts = { - "msvc": [], - "unix": [""], - } - - if sys.platform == "darwin": - darwin_opts = ["-stdlib=libc++", "-mmacosx-version-min=10.14"] - c_opts["unix"] += darwin_opts - l_opts["unix"] += darwin_opts # type: ignore - - def build_extensions(self): - ct = self.compiler.compiler_type - opts = self.c_opts.get(ct, []) - link_opts = self.l_opts.get(ct, []) - if ct == "unix": - opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) - opts.append(cpp_flag(self.compiler)) - if has_flag(self.compiler, "-fvisibility=hidden"): - opts.append("-fvisibility=hidden") - elif ct == "msvc": - opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - - # Link bladebit_harvester - if os.getenv("CP_USE_GREEN_REAPER") == "1": - opts.append("/DUSE_GREEN_REAPER=1") - opts.append("/DBLADEBIT_HARVESTER_LINKED=1") - opts.append("/Ilibs/green_reaper/include") - link_opts.append("libs/green_reaper/lib/bladebit_harvester.lib") - - for ext in self.extensions: - ext.extra_compile_args = opts - ext.extra_link_args = link_opts - build_ext.build_extensions(self) - - # Copy bladebit_harvester.dll on windows to the target build directory - # in order to package it into the root directory of the wheel - if os.getenv("CP_USE_GREEN_REAPER") == "1" and sys.platform == "win32": - shutil.copy2("libs/green_reaper/lib/bladebit_harvester.dll", self.build_lib + "/bladebit_harvester.dll") - - -if platform.system() == "Windows": - setup( - 
name="chiapos", - author="Mariano Sorgente", - author_email="mariano@chia.net", - description="Chia proof of space plotting, proving, and verifying (wraps C++)", - license="Apache License", - python_requires=">=3.7", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/Chia-Network/chiapos", - setup_requires=["pybind11>=2.10.0"], - tests_require=["pytest"], - ext_modules=ext_modules, - cmdclass={"build_ext": BuildExt}, - zip_safe=False, - ) -else: - setup( - name="chiapos", - author="Mariano Sorgente", - author_email="mariano@chia.net", - description="Chia proof of space plotting, proving, and verifying (wraps C++)", - license="Apache License", - python_requires=">=3.7", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/Chia-Network/chiapos", - tests_require=["pytest"], - ext_modules=[CMakeExtension("chiapos", ".")], - cmdclass=dict(build_ext=CMakeBuild), - zip_safe=False, - ) +setup( + name="chiapos", + author="Mariano Sorgente", + author_email="mariano@chia.net", + description="Chia proof of space plotting, proving, and verifying (wraps C++)", + license="Apache License", + python_requires=">=3.7", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/Chia-Network/chiapos", + tests_require=["pytest"], + ext_modules=[CMakeExtension("chiapos", ".")], + cmdclass=dict(build_ext=CMakeBuild), + zip_safe=False, +) diff --git a/src/b3/blake3.c b/src/b3/blake3.c deleted file mode 100644 index 0acefbade..000000000 --- a/src/b3/blake3.c +++ /dev/null @@ -1,598 +0,0 @@ -#include -#include -#include - -#include "blake3.h" -#include "blake3_impl.h" - -INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; - self->blocks_compressed = 0; - self->flags = flags; -} - -INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], - uint64_t chunk_counter) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = chunk_counter; - self->blocks_compressed = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; -} - -INLINE size_t chunk_state_len(const blake3_chunk_state *self) { - return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + - ((size_t)self->buf_len); -} - -INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, - const uint8_t *input, size_t input_len) { - size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); - if (take > input_len) { - take = input_len; - } - uint8_t *dest = self->buf + ((size_t)self->buf_len); - memcpy(dest, input, take); - self->buf_len += (uint8_t)take; - return take; -} - -INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { - if (self->blocks_compressed == 0) { - return CHUNK_START; - } else { - return 0; - } -} - -typedef struct { - uint32_t input_cv[8]; - uint64_t counter; - uint8_t block[BLAKE3_BLOCK_LEN]; - uint8_t block_len; - uint8_t flags; -} output_t; - -INLINE output_t make_output(const uint32_t input_cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - output_t ret; - memcpy(ret.input_cv, input_cv, 32); - memcpy(ret.block, block, BLAKE3_BLOCK_LEN); - ret.block_len = block_len; - ret.counter = counter; - ret.flags = flags; - return ret; -} - -// Chaining values within 
a given chunk (specifically the compress_in_place -// interface) are represented as words. This avoids unnecessary bytes<->words -// conversion overhead in the portable implementation. However, the hash_many -// interface handles both user input and parent node blocks, so it accepts -// bytes. For that reason, chaining values in the CV stack are represented as -// bytes. -INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { - uint32_t cv_words[8]; - memcpy(cv_words, self->input_cv, 32); - blake3_compress_in_place(cv_words, self->block, self->block_len, - self->counter, self->flags); - memcpy(cv, cv_words, 32); -} - -INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, - size_t out_len) { - uint64_t output_block_counter = seek / 64; - size_t offset_within_block = seek % 64; - uint8_t wide_buf[64]; - while (out_len > 0) { - blake3_compress_xof(self->input_cv, self->block, self->block_len, - output_block_counter, self->flags | ROOT, wide_buf); - size_t available_bytes = 64 - offset_within_block; - size_t memcpy_len; - if (out_len > available_bytes) { - memcpy_len = available_bytes; - } else { - memcpy_len = out_len; - } - memcpy(out, wide_buf + offset_within_block, memcpy_len); - out += memcpy_len; - out_len -= memcpy_len; - output_block_counter += 1; - offset_within_block = 0; - } -} - -INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, - size_t input_len) { - if (self->buf_len > 0) { - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; - if (input_len > 0) { - blake3_compress_in_place( - self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - self->buf_len = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - } - } - - while (input_len > BLAKE3_BLOCK_LEN) { - blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, - self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - input += BLAKE3_BLOCK_LEN; - input_len -= BLAKE3_BLOCK_LEN; - } - - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; -} - -INLINE output_t chunk_state_output(const blake3_chunk_state *self) { - uint8_t block_flags = - self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; - return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, - block_flags); -} - -INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], - const uint32_t key[8], uint8_t flags) { - return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -INLINE size_t left_len(size_t content_len) { - // Subtract 1 to reserve at least one byte for the right side. content_len - // should always be greater than BLAKE3_CHUNK_LEN. - size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; - return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time -// on a single thread. Write out the chunk chaining values and return the -// number of chunks hashed. These chunks are never the root and never empty; -// those cases use a different codepath. 
-INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, - uint8_t *out) { -#if defined(BLAKE3_TESTING) - assert(0 < input_len); - assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); -#endif - - const uint8_t *chunks_array[MAX_SIMD_DEGREE]; - size_t input_position = 0; - size_t chunks_array_len = 0; - while (input_len - input_position >= BLAKE3_CHUNK_LEN) { - chunks_array[chunks_array_len] = &input[input_position]; - input_position += BLAKE3_CHUNK_LEN; - chunks_array_len += 1; - } - - blake3_hash_many(chunks_array, chunks_array_len, - BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, - true, flags, CHUNK_START, CHUNK_END, out); - - // Hash the remaining partial chunk, if there is one. Note that the empty - // chunk (meaning the empty message) is a different codepath. - if (input_len > input_position) { - uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, key, flags); - chunk_state.chunk_counter = counter; - chunk_state_update(&chunk_state, &input[input_position], - input_len - input_position); - output_t output = chunk_state_output(&chunk_state); - output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); - return chunks_array_len + 1; - } else { - return chunks_array_len; - } -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time -// on a single thread. Write out the parent chaining values and return the -// number of parents hashed. (If there's an odd input chaining value left over, -// return it as an additional output.) These parents are never the root and -// never empty; those cases use a different codepath. -INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, - size_t num_chaining_values, - const uint32_t key[8], uint8_t flags, - uint8_t *out) { -#if defined(BLAKE3_TESTING) - assert(2 <= num_chaining_values); - assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); -#endif - - const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; - size_t parents_array_len = 0; - while (num_chaining_values - (2 * parents_array_len) >= 2) { - parents_array[parents_array_len] = - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; - parents_array_len += 1; - } - - blake3_hash_many(parents_array, parents_array_len, 1, key, - 0, // Parents always use counter 0. - false, flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out); - - // If there's an odd child left over, it becomes an output. - if (num_chaining_values > 2 * parents_array_len) { - memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], - BLAKE3_OUT_LEN); - return parents_array_len + 1; - } else { - return parents_array_len; - } -} - -// The wide helper function returns (writes out) an array of chaining values -// and returns the length of that array. The number of chaining values returned -// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, -// if the input is shorter than that many chunks. The reason for maintaining a -// wide array of chaining values going back up the tree, is to allow the -// implementation to hash as many parents in parallel as possible. -// -// As a special case when the SIMD degree is 1, this function will still return -// at least 2 outputs. This guarantees that this function doesn't perform the -// root compression. 
(If it did, it would use the wrong flags, and also we -// wouldn't be able to implement exendable ouput.) Note that this function is -// not used when the whole input is only 1 chunk long; that's a different -// codepath. -// -// Why not just have the caller split the input on the first update(), instead -// of implementing this special rule? Because we don't want to limit SIMD or -// multi-threading parallelism for that update(). -static size_t blake3_compress_subtree_wide(const uint8_t *input, - size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, - uint8_t flags, uint8_t *out) { - // Note that the single chunk case does *not* bump the SIMD degree up to 2 - // when it is 1. If this implementation adds multi-threading in the future, - // this gives us the option of multi-threading even the 2-chunk case, which - // can help performance on smaller platforms. - if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { - return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, - out); - } - - // With more than simd_degree chunks, we need to recurse. Start by dividing - // the input into left and right subtrees. (Note that this is only optimal - // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree - // of 3 or something, we'll need a more complicated strategy.) - size_t left_input_len = left_len(input_len); - size_t right_input_len = input_len - left_input_len; - const uint8_t *right_input = &input[left_input_len]; - uint64_t right_chunk_counter = - chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); - - // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to - // account for the special case of returning 2 outputs when the SIMD degree - // is 1. - uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t degree = blake3_simd_degree(); - if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { - // The special case: We always use a degree of at least two, to make - // sure there are two outputs. Except, as noted above, at the chunk - // level, where we allow degree=1. (Note that the 1-chunk-input case is - // a different codepath.) - degree = 2; - } - uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; - - // Recurse! If this implementation adds multi-threading support in the - // future, this is where it will go. - size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, - chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( - right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); - - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. - if (left_n == 1) { - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); - return 2; - } - - // Otherwise, do one layer of parent node compression. - size_t num_chaining_values = left_n + right_n; - return compress_parents_parallel(cv_array, num_chaining_values, key, flags, - out); -} - -// Hash a subtree with compress_subtree_wide(), and then condense the resulting -// list of chaining values down to a single parent node. Don't compress that -// last parent node, however. Instead, return its message bytes (the -// concatenated chaining values of its children). This is necessary when the -// first call to update() supplies a complete subtree, because the topmost -// parent node of that subtree could end up being the root. 
It's also necessary -// for extended output in the general case. -// -// As with compress_subtree_wide(), this function is not used on inputs of 1 -// chunk or less. That's a different codepath. -INLINE void compress_subtree_to_parent_node( - const uint8_t *input, size_t input_len, const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { -#if defined(BLAKE3_TESTING) - assert(input_len > BLAKE3_CHUNK_LEN); -#endif - - uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, - chunk_counter, flags, cv_array); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, - // compress_subtree_wide() returns more than 2 chaining values. Condense - // them into 2 by forming parent nodes repeatedly. - uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - while (num_cvs > 2) { - num_cvs = - compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); - memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); - } - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); -} - -INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->key, key, BLAKE3_KEY_LEN); - chunk_state_init(&self->chunk, key, flags); - self->cv_stack_len = 0; -} - -void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } - -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]) { - uint32_t key_words[8]; - load_key_words(key, key_words); - hasher_init_base(self, key_words, KEYED_HASH); -} - -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { - blake3_hasher context_hasher; - hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); - blake3_hasher_update(&context_hasher, context, strlen(context)); - uint8_t context_key[BLAKE3_KEY_LEN]; - blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); - uint32_t context_key_words[8]; - load_key_words(context_key, context_key_words); - hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); -} - -// As described in hasher_push_cv() below, we do "lazy merging", delaying -// merges until right before the next CV is about to be added. This is -// different from the reference implementation. Another difference is that we -// aren't always merging 1 chunk at a time. Instead, each CV might represent -// any power-of-two number of chunks, as long as the smaller-above-larger stack -// order is maintained. Instead of the "count the trailing 0-bits" algorithm -// described in the spec, we use a "count the total number of 1-bits" variant -// that doesn't require us to retain the subtree size of the CV on top of the -// stack. The principle is the same: each CV that should remain in the stack is -// represented by a 1-bit in the total number of chunks (or bytes) so far. -INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { - size_t post_merge_stack_len = (size_t)popcnt(total_len); - while (self->cv_stack_len > post_merge_stack_len) { - uint8_t *parent_node = - &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; - output_t output = parent_output(parent_node, self->key, self->chunk.flags); - output_chaining_value(&output, parent_node); - self->cv_stack_len -= 1; - } -} - -// In reference_impl.rs, we merge the new CV with existing CVs from the stack -// before pushing it. We can do that because we know more input is coming, so -// we know none of the merges are root. 
-// -// This setting is different. We want to feed as much input as possible to -// compress_subtree_wide(), without setting aside anything for the chunk_state. -// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once -// as a single subtree, if at all possible. -// -// This leads to two problems: -// 1) This 64 KiB input might be the only call that ever gets made to update. -// In this case, the root node of the 64 KiB subtree would be the root node -// of the whole tree, and it would need to be ROOT finalized. We can't -// compress it until we know. -// 2) This 64 KiB input might complete a larger tree, whose root node is -// similarly going to be the the root of the whole tree. For example, maybe -// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the -// node at the root of the 256 KiB subtree until we know how to finalize it. -// -// The second problem is solved with "lazy merging". That is, when we're about -// to add a CV to the stack, we don't merge it with anything first, as the -// reference impl does. Instead we do merges using the *previous* CV that was -// added, which is sitting on top of the stack, and we put the new CV -// (unmerged) on top of the stack afterwards. This guarantees that we never -// merge the root node until finalize(). -// -// Solving the first problem requires an additional tool, -// compress_subtree_to_parent_node(). That function always returns the top -// *two* chaining values of the subtree it's compressing. We then do lazy -// merging with each of them separately, so that the second CV will always -// remain unmerged. (That also helps us support extendable output when we're -// hashing an input all-at-once.) -INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], - uint64_t chunk_counter) { - hasher_merge_cv_stack(self, chunk_counter); - memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, - BLAKE3_OUT_LEN); - self->cv_stack_len += 1; -} - -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. This comes up in practice with things like: - // std::vector v; - // blake3_hasher_update(&hasher, v.data(), v.size()); - if (input_len == 0) { - return; - } - - const uint8_t *input_bytes = (const uint8_t *)input; - - // If we have some partial chunk bytes in the internal chunk_state, we need - // to finish that chunk first. - if (chunk_state_len(&self->chunk) > 0) { - size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); - if (take > input_len) { - take = input_len; - } - chunk_state_update(&self->chunk, input_bytes, take); - input_bytes += take; - input_len -= take; - // If we've filled the current chunk and there's more coming, finalize this - // chunk and proceed. In this case we know it's not the root. - if (input_len > 0) { - output_t output = chunk_state_output(&self->chunk); - uint8_t chunk_cv[32]; - output_chaining_value(&output, chunk_cv); - hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); - chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); - } else { - return; - } - } - - // Now the chunk_state is clear, and we have more input. If there's more than - // a single chunk (so, definitely not the root chunk), hash the largest whole - // subtree we can, with the full benefits of SIMD (and maybe in the future, - // multi-threading) parallelism. 
Two restrictions: - // - The subtree has to be a power-of-2 number of chunks. Only subtrees along - // the right edge can be incomplete, and we don't know where the right edge - // is going to be until we get to finalize(). - // - The subtree must evenly divide the total number of chunks up until this - // point (if total is not 0). If the current incomplete subtree is only - // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have - // to complete the current subtree first. - // Because we might need to break up the input to form powers of 2, or to - // evenly divide what we already have, this part runs in a loop. - while (input_len > BLAKE3_CHUNK_LEN) { - size_t subtree_len = round_down_to_power_of_2(input_len); - uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; - // Shrink the subtree_len until it evenly divides the count so far. We know - // that subtree_len itself is a power of 2, so we can use a bitmasking - // trick instead of an actual remainder operation. (Note that if the caller - // consistently passes power-of-2 inputs of the same size, as is hopefully - // typical, this loop condition will always fail, and subtree_len will - // always be the full length of the input.) - // - // An aside: We don't have to shrink subtree_len quite this much. For - // example, if count_so_far is 1, we could pass 2 chunks to - // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still - // get the right answer in the end, and we might get to use 2-way SIMD - // parallelism. The problem with this optimization, is that it gets us - // stuck always hashing 2 chunks. The total number of chunks will remain - // odd, and we'll never graduate to higher degrees of parallelism. See - // https://github.com/BLAKE3-team/BLAKE3/issues/69. - while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { - subtree_len /= 2; - } - // The shrunken subtree_len might now be 1 chunk long. If so, hash that one - // chunk by itself. Otherwise, compress the subtree into a pair of CVs. - uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; - if (subtree_len <= BLAKE3_CHUNK_LEN) { - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, self->key, self->chunk.flags); - chunk_state.chunk_counter = self->chunk.chunk_counter; - chunk_state_update(&chunk_state, input_bytes, subtree_len); - output_t output = chunk_state_output(&chunk_state); - uint8_t cv[BLAKE3_OUT_LEN]; - output_chaining_value(&output, cv); - hasher_push_cv(self, cv, chunk_state.chunk_counter); - } else { - // This is the high-performance happy path, though getting here depends - // on the caller giving us a long enough input. - uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; - compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, - self->chunk.chunk_counter, - self->chunk.flags, cv_pair); - hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); - hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], - self->chunk.chunk_counter + (subtree_chunks / 2)); - } - self->chunk.chunk_counter += subtree_chunks; - input_bytes += subtree_len; - input_len -= subtree_len; - } - - // If there's any remaining input less than a full chunk, add it to the chunk - // state. In that case, also do a final merge loop to make sure the subtree - // stack doesn't contain any unmerged pairs. The remaining input means we - // know these merges are non-root. 
This merge loop isn't strictly necessary - // here, because hasher_push_chunk_cv already does its own merge loop, but it - // simplifies blake3_hasher_finalize below. - if (input_len > 0) { - chunk_state_update(&self->chunk, input_bytes, input_len); - hasher_merge_cv_stack(self, self->chunk.chunk_counter); - } -} - -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len) { - blake3_hasher_finalize_seek(self, 0, out, out_len); -} - -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. This comes up in practice with things like: - // std::vector v; - // blake3_hasher_finalize(&hasher, v.data(), v.size()); - if (out_len == 0) { - return; - } - - // If the subtree stack is empty, then the current chunk is the root. - if (self->cv_stack_len == 0) { - output_t output = chunk_state_output(&self->chunk); - output_root_bytes(&output, seek, out, out_len); - return; - } - // If there are any bytes in the chunk state, finalize that chunk and do a - // roll-up merge between that chunk hash and every subtree in the stack. In - // this case, the extra merge loop at the end of blake3_hasher_update - // guarantees that none of the subtrees in the stack need to be merged with - // each other first. Otherwise, if there are no bytes in the chunk state, - // then the top of the stack is a chunk hash, and we start the merge from - // that. - output_t output; - size_t cvs_remaining; - if (chunk_state_len(&self->chunk) > 0) { - cvs_remaining = self->cv_stack_len; - output = chunk_state_output(&self->chunk); - } else { - // There are always at least 2 CVs in the stack in this case. - cvs_remaining = self->cv_stack_len - 2; - output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, - self->chunk.flags); - } - while (cvs_remaining > 0) { - cvs_remaining -= 1; - uint8_t parent_block[BLAKE3_BLOCK_LEN]; - memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); - output_chaining_value(&output, &parent_block[32]); - output = parent_output(parent_block, self->key, self->chunk.flags); - } - output_root_bytes(&output, seek, out, out_len); -} diff --git a/src/b3/blake3.h b/src/b3/blake3.h deleted file mode 100644 index 5060e38b7..000000000 --- a/src/b3/blake3.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef BLAKE3_H -#define BLAKE3_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define BLAKE3_KEY_LEN 32 -#define BLAKE3_OUT_LEN 32 -#define BLAKE3_BLOCK_LEN 64 -#define BLAKE3_CHUNK_LEN 1024 -#define BLAKE3_MAX_DEPTH 54 -#define BLAKE3_MAX_SIMD_DEGREE 16 - -// This struct is a private implementation detail. It has to be here because -// it's part of blake3_hasher below. -typedef struct { - uint32_t cv[8]; - uint64_t chunk_counter; - uint8_t buf[BLAKE3_BLOCK_LEN]; - uint8_t buf_len; - uint8_t blocks_compressed; - uint8_t flags; -} blake3_chunk_state; - -typedef struct { - uint32_t key[8]; - blake3_chunk_state chunk; - uint8_t cv_stack_len; - // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, - // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk - // requires a 4th entry, rather than merging everything down to 1, because we - // don't know whether more input is coming. This is different from how the - // reference implementation does things. 
- uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; -} blake3_hasher; - -void blake3_hasher_init(blake3_hasher *self); -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]); -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len); -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len); -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len); - -#ifdef __cplusplus -} -#endif - -#endif /* BLAKE3_H */ diff --git a/src/b3/blake3_avx2.c b/src/b3/blake3_avx2.c deleted file mode 100644 index c5a2ce9e2..000000000 --- a/src/b3/blake3_avx2.c +++ /dev/null @@ -1,325 +0,0 @@ -#include "blake3_impl.h" - -#include - -#define DEGREE 8 - -INLINE __m256i loadu(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); -} - -INLINE void storeu(__m256i src, uint8_t dest[16]) { - _mm256_storeu_si256((__m256i *)dest, src); -} - -INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. -INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } - -INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } - -INLINE __m256i rot16(__m256i x) { - return _mm256_shuffle_epi8( - x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, - 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); -} - -INLINE __m256i rot12(__m256i x) { - return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); -} - -INLINE __m256i rot8(__m256i x) { - return _mm256_shuffle_epi8( - x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, - 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); -} - -INLINE __m256i rot7(__m256i x) { - return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); -} - -INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - 
v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m256i vecs[DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high - // is 22/33/66/77. - __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is - // 11/33. - __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); - vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); - vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); - vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); - vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); - vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); - vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); - vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m256i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); - out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); - out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); - out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); - out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); - out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); - out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); - out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); - out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); - out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); - out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); - out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); - out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); - for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[8]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m256i *out_lo, __m256i *out_hi) { - const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); - const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i add1 = _mm256_and_si256(mask, add0); - __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); - __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), - _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); - __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m256i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m256i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m256i block_flags_vec = set1(block_flags); - __m256i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); 
- round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(h_vecs); - storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); - storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); - storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); - storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); - storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); - storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); - storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); -} - -#if !defined(BLAKE3_NO_SSE41) -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#else -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif - -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= DEGREE) { - blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } -#if !defined(BLAKE3_NO_SSE41) - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); -#else - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); -#endif -} diff --git a/src/b3/blake3_avx2_x86-64_unix.S b/src/b3/blake3_avx2_x86-64_unix.S deleted file mode 100644 index d2b14d440..000000000 --- a/src/b3/blake3_avx2_x86-64_unix.S +++ /dev/null @@ -1,1802 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix -.global _blake3_hash_many_avx2 -.global blake3_hash_many_avx2 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_avx2: -blake3_hash_many_avx2: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 680 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - vmovd xmm0, r9d - vpbroadcastd ymm0, xmm0 - vmovdqa ymmword ptr [rsp+0x280], ymm0 - vpand ymm1, ymm0, ymmword ptr [ADD0+rip] - vpand ymm2, ymm0, ymmword ptr [ADD1+rip] - vmovdqa ymmword ptr [rsp+0x220], ymm2 - vmovd xmm2, r8d - vpbroadcastd ymm2, xmm2 - vpaddd ymm2, ymm2, ymm1 - vmovdqa ymmword ptr [rsp+0x240], ymm2 - vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm1, ymm2 - shr r8, 32 - vmovd xmm3, r8d - vpbroadcastd ymm3, xmm3 - vpsubd ymm3, ymm3, ymm2 - vmovdqa ymmword ptr [rsp+0x260], ymm3 - shl rdx, 6 - mov qword ptr [rsp+0x2A0], rdx - cmp rsi, 8 - jc 3f -2: - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - 
vpbroadcastd ymm3, dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x2A0] - cmove eax, ebx - mov dword ptr [rsp+0x200], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x20], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x40], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x60], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x80], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0xA0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0xC0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0xE0], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x100], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x120], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x140], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x160], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr 
[r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x180], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x1A0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x1C0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x1E0], ymm11 - vpbroadcastd ymm15, dword ptr [rsp+0x200] - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] - vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] - vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] - vpxor ymm15, ymm3, ymm15 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] - vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] - vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] - vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - 
vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - 
vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - 
vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, 
ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - 
vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr 
[ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, 
ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 
20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, 
ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x38] - jne 9b - mov rbx, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp+0x220] - vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] - vmovdqa ymmword ptr [rsp+0x240], ymm1 - vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm0, ymm2 - vmovdqa ymm0, ymmword ptr [rsp+0x260] - vpsubd ymm2, ymm0, ymm2 - vmovdqa ymmword ptr [rsp+0x260], ymm2 - add rdi, 64 - add rbx, 256 - mov qword ptr [rbp+0x50], rbx - sub rsi, 8 - cmp rsi, 8 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - mov rbx, qword ptr [rbp+0x50] - mov r15, qword ptr [rsp+0x2A0] - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr [rbp+0x48] - test rsi, 0x4 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovdqa ymm8, ymm0 - vmovdqa ymm9, ymm1 - vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] - vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] - vpunpckldq ymm14, ymm12, ymm13 - vpunpckhdq ymm15, ymm12, ymm13 - vpermq ymm14, ymm14, 0x50 - vpermq ymm15, ymm15, 0x50 - vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - vpblendd ymm14, ymm14, ymm12, 0x44 - vpblendd ymm15, ymm15, ymm12, 0x44 - vmovdqa ymmword ptr [rsp], ymm14 - vmovdqa ymmword ptr [rsp+0x20], ymm15 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vmovups ymm2, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm2, ymm3, 136 - vshufps ymm5, ymm2, ymm3, 221 - vmovups ymm2, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x10] - 
vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm2, ymm3, 136 - vshufps ymm7, ymm2, ymm3, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - vmovups ymm10, ymmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 - vshufps ymm12, ymm10, ymm11, 136 - vshufps ymm13, ymm10, ymm11, 221 - vmovups ymm10, ymmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 - vshufps ymm14, ymm10, ymm11, 136 - vshufps ymm15, ymm10, ymm11, 221 - vpshufd ymm14, ymm14, 0x93 - vpshufd ymm15, ymm15, 0x93 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - vpbroadcastd ymm2, dword ptr [rsp+0x200] - vmovdqa ymm3, ymmword ptr [rsp] - vmovdqa ymm11, ymmword ptr [rsp+0x20] - vpblendd ymm3, ymm3, ymm2, 0x88 - vpblendd ymm11, ymm11, ymm2, 0x88 - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa ymm10, ymm2 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm8, ymm8, ymm12 - vmovdqa ymmword ptr [rsp+0x40], ymm4 - nop - vmovdqa ymmword ptr [rsp+0x60], ymm12 - nop - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vmovdqa ymmword ptr [rsp+0x80], ymm5 - vmovdqa ymmword ptr [rsp+0xA0], ymm13 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm8, ymm8, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpshufd ymm10, ymm10, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm8, ymm8, ymm14 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm8, ymm8, ymm15 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - 
vpshufd ymm0, ymm0, 0x39 - vpshufd ymm8, ymm8, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x93 - vpshufd ymm10, ymm10, 0x93 - dec al - je 9f - vmovdqa ymm4, ymmword ptr [rsp+0x40] - vmovdqa ymm5, ymmword ptr [rsp+0x80] - vshufps ymm12, ymm4, ymm5, 214 - vpshufd ymm13, ymm4, 0x0F - vpshufd ymm4, ymm12, 0x39 - vshufps ymm12, ymm6, ymm7, 250 - vpblendd ymm13, ymm13, ymm12, 0xAA - vpunpcklqdq ymm12, ymm7, ymm5 - vpblendd ymm12, ymm12, ymm6, 0x88 - vpshufd ymm12, ymm12, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymmword ptr [rsp+0x40], ymm13 - vmovdqa ymmword ptr [rsp+0x80], ymm12 - vmovdqa ymm12, ymmword ptr [rsp+0x60] - vmovdqa ymm13, ymmword ptr [rsp+0xA0] - vshufps ymm5, ymm12, ymm13, 214 - vpshufd ymm6, ymm12, 0x0F - vpshufd ymm12, ymm5, 0x39 - vshufps ymm5, ymm14, ymm15, 250 - vpblendd ymm6, ymm6, ymm5, 0xAA - vpunpcklqdq ymm5, ymm15, ymm13 - vpblendd ymm5, ymm5, ymm14, 0x88 - vpshufd ymm5, ymm5, 0x78 - vpunpckhdq ymm13, ymm13, ymm15 - vpunpckldq ymm14, ymm14, ymm13 - vpshufd ymm15, ymm14, 0x1E - vmovdqa ymm13, ymm6 - vmovdqa ymm14, ymm5 - vmovdqa ymm5, ymmword ptr [rsp+0x40] - vmovdqa ymm6, ymmword ptr [rsp+0x80] - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - vpxor ymm8, ymm8, ymm10 - vpxor ymm9, ymm9, ymm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqu xmmword ptr [rbx+0x40], xmm8 - vmovdqu xmmword ptr [rbx+0x50], xmm9 - vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 - vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 - vmovaps xmm8, xmmword ptr [rsp+0x280] - vmovaps xmm0, xmmword ptr [rsp+0x240] - vmovaps xmm1, xmmword ptr [rsp+0x250] - vmovaps xmm2, xmmword ptr [rsp+0x260] - vmovaps xmm3, xmmword ptr [rsp+0x270] - vblendvps xmm0, xmm0, xmm1, xmm8 - vblendvps xmm2, xmm2, xmm3, xmm8 - vmovaps xmmword ptr [rsp+0x240], xmm0 - vmovaps xmmword ptr [rsp+0x260], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test rsi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp+0x240] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x244] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] - vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x200] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps 
ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovaps ymm8, ymmword ptr [rsp+0x280] - vmovaps ymm0, ymmword ptr [rsp+0x240] - vmovups ymm1, ymmword ptr [rsp+0x248] - vmovaps ymm2, ymmword ptr [rsp+0x260] - vmovups ymm3, ymmword ptr [rsp+0x268] - vblendvps ymm0, ymm0, ymm1, ymm8 - vblendvps ymm2, ymm2, ymm3, ymm8 - vmovaps ymmword ptr [rsp+0x240], ymm0 - vmovaps ymmword ptr [rsp+0x260], ymm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test rsi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm3, dword ptr [rsp+0x240] - vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 - vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm14, xmmword ptr [ROT16+rip] - vmovdqa xmm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa xmm3, xmm13 - vpinsrd xmm3, xmm3, eax, 3 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - 
vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 -ADD1: - .long 8, 8, 8, 8, 8, 8, 8, 8 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A - -#endif // __x86_64__ diff --git a/src/b3/blake3_avx512.c b/src/b3/blake3_avx512.c deleted file mode 100644 index 77a5c385c..000000000 --- a/src/b3/blake3_avx512.c +++ /dev/null @@ -1,1204 +0,0 @@ -#include "blake3_impl.h" - -#include - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu_128(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE __m256i loadu_256(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); -} - -INLINE __m512i loadu_512(const uint8_t src[64]) { - return _mm512_loadu_si512((const __m512i *)src); -} - -INLINE void storeu_128(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE void storeu_256(__m256i src, uint8_t dest[16]) { - _mm256_storeu_si256((__m256i *)dest, src); -} - -INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } - -INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } - -INLINE __m128i 
xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } - -INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } - -INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } - -INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } - -INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } - -INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } - -INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } - -INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } - -INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } - -INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } - -INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } - -INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } - -INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } - -INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } - -INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } - -/* - * ---------------------------------------------------------------------------- - * compress_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = add_128(add_128(*row0, m), *row1); - *row3 = xor_128(*row3, *row0); - *row3 = rot16_128(*row3); - *row2 = add_128(*row2, *row3); - *row1 = xor_128(*row1, *row2); - *row1 = rot12_128(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = add_128(add_128(*row0, m), *row1); - *row3 = xor_128(*row3, *row0); - *row3 = rot8_128(*row3); - *row2 = add_128(*row2, *row3); - *row1 = xor_128(*row1, *row2); - *row1 = rot7_128(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu_128((uint8_t *)&cv[0]); - rows[1] = loadu_128((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, 
_MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu_128(xor_128(rows[0], rows[2]), &out[0]); - storeu_128(xor_128(rows[1], rows[3]), &out[16]); - storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); - storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); -} - -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); -} - -/* - * ---------------------------------------------------------------------------- - * hash4_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[15] = rot16_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = xor_128(v[7], v[11]); - v[4] = 
rot12_128(v[4]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[15] = rot8_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = xor_128(v[7], v[11]); - v[4] = rot7_128(v[4]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot16_128(v[15]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[4] = rot12_128(v[4]); - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot8_128(v[15]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - v[4] = rot7_128(v[4]); -} - -INLINE void transpose_vecs_128(__m128i vecs[4]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. 
- __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_128(&out[0]); - transpose_vecs_128(&out[4]); - transpose_vecs_128(&out[8]); - transpose_vecs_128(&out[12]); -} - -INLINE void load_counters4(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - uint64_t mask = (increment_counter ? 
~0 : 0); - __m256i mask_vec = _mm256_set1_epi64x(mask); - __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); - deltas = _mm256_and_si256(mask_vec, deltas); - __m256i counters = - _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); - *out_lo = _mm256_cvtepi64_epi32(counters); - *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); -} - -void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), - set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters4(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1_128(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn4(v, msg_vecs, 0); - round_fn4(v, msg_vecs, 1); - round_fn4(v, msg_vecs, 2); - round_fn4(v, msg_vecs, 3); - round_fn4(v, msg_vecs, 4); - round_fn4(v, msg_vecs, 5); - round_fn4(v, msg_vecs, 6); - h_vecs[0] = xor_128(v[0], v[8]); - h_vecs[1] = xor_128(v[1], v[9]); - h_vecs[2] = xor_128(v[2], v[10]); - h_vecs[3] = xor_128(v[3], v[11]); - h_vecs[4] = xor_128(v[4], v[12]); - h_vecs[5] = xor_128(v[5], v[13]); - h_vecs[6] = xor_128(v[6], v[14]); - h_vecs[7] = xor_128(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs_128(&h_vecs[0]); - transpose_vecs_128(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. 
- storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash8_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_256(v[0], v[4]); - v[1] = add_256(v[1], v[5]); - v[2] = add_256(v[2], v[6]); - v[3] = add_256(v[3], v[7]); - v[12] = xor_256(v[12], v[0]); - v[13] = xor_256(v[13], v[1]); - v[14] = xor_256(v[14], v[2]); - v[15] = xor_256(v[15], v[3]); - v[12] = rot16_256(v[12]); - v[13] = rot16_256(v[13]); - v[14] = rot16_256(v[14]); - v[15] = rot16_256(v[15]); - v[8] = add_256(v[8], v[12]); - v[9] = add_256(v[9], v[13]); - v[10] = add_256(v[10], v[14]); - v[11] = add_256(v[11], v[15]); - v[4] = xor_256(v[4], v[8]); - v[5] = xor_256(v[5], v[9]); - v[6] = xor_256(v[6], v[10]); - v[7] = xor_256(v[7], v[11]); - v[4] = rot12_256(v[4]); - v[5] = rot12_256(v[5]); - v[6] = rot12_256(v[6]); - v[7] = rot12_256(v[7]); - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_256(v[0], v[4]); - v[1] = add_256(v[1], v[5]); - v[2] = add_256(v[2], v[6]); - v[3] = add_256(v[3], v[7]); - v[12] = xor_256(v[12], v[0]); - v[13] = xor_256(v[13], v[1]); - v[14] = xor_256(v[14], v[2]); - v[15] = xor_256(v[15], v[3]); - v[12] = rot8_256(v[12]); - v[13] = rot8_256(v[13]); - v[14] = rot8_256(v[14]); - v[15] = rot8_256(v[15]); - v[8] = add_256(v[8], v[12]); - v[9] = add_256(v[9], v[13]); - v[10] = add_256(v[10], v[14]); - v[11] = add_256(v[11], v[15]); - v[4] = xor_256(v[4], v[8]); - v[5] = xor_256(v[5], v[9]); - v[6] = xor_256(v[6], v[10]); - v[7] = xor_256(v[7], v[11]); - v[4] = rot7_256(v[4]); - v[5] = rot7_256(v[5]); - v[6] = rot7_256(v[6]); - v[7] = rot7_256(v[7]); - - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_256(v[0], v[5]); - v[1] = add_256(v[1], v[6]); - v[2] = add_256(v[2], v[7]); - v[3] = add_256(v[3], v[4]); - v[15] = xor_256(v[15], v[0]); - v[12] = xor_256(v[12], v[1]); - v[13] = xor_256(v[13], v[2]); - v[14] = xor_256(v[14], v[3]); - v[15] = rot16_256(v[15]); - v[12] = rot16_256(v[12]); - v[13] = rot16_256(v[13]); - v[14] = rot16_256(v[14]); - v[10] = add_256(v[10], v[15]); - v[11] = add_256(v[11], v[12]); - v[8] = add_256(v[8], v[13]); - v[9] = add_256(v[9], v[14]); - v[5] = xor_256(v[5], v[10]); - v[6] = xor_256(v[6], v[11]); - v[7] = xor_256(v[7], v[8]); - v[4] = xor_256(v[4], v[9]); - v[5] = rot12_256(v[5]); - v[6] = rot12_256(v[6]); - v[7] = rot12_256(v[7]); - v[4] = rot12_256(v[4]); - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_256(v[1], 
m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_256(v[0], v[5]); - v[1] = add_256(v[1], v[6]); - v[2] = add_256(v[2], v[7]); - v[3] = add_256(v[3], v[4]); - v[15] = xor_256(v[15], v[0]); - v[12] = xor_256(v[12], v[1]); - v[13] = xor_256(v[13], v[2]); - v[14] = xor_256(v[14], v[3]); - v[15] = rot8_256(v[15]); - v[12] = rot8_256(v[12]); - v[13] = rot8_256(v[13]); - v[14] = rot8_256(v[14]); - v[10] = add_256(v[10], v[15]); - v[11] = add_256(v[11], v[12]); - v[8] = add_256(v[8], v[13]); - v[9] = add_256(v[9], v[14]); - v[5] = xor_256(v[5], v[10]); - v[6] = xor_256(v[6], v[11]); - v[7] = xor_256(v[7], v[8]); - v[4] = xor_256(v[4], v[9]); - v[5] = rot7_256(v[5]); - v[6] = rot7_256(v[6]); - v[7] = rot7_256(v[7]); - v[4] = rot7_256(v[4]); -} - -INLINE void transpose_vecs_256(__m256i vecs[8]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high - // is 22/33/66/77. - __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is - // 11/33. - __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); - vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); - vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); - vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); - vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); - vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); - vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); - vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); -} - -INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, - size_t block_offset, __m256i out[16]) { - out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); - out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); - out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); - out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); - out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); - out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); - out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); - out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); - out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); - out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); - out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); - out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); - out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); - out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); - out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); - out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); - for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_256(&out[0]); - transpose_vecs_256(&out[8]); -} - -INLINE void load_counters8(uint64_t counter, bool increment_counter, - __m256i *out_lo, __m256i *out_hi) { - uint64_t mask = (increment_counter ? 
~0 : 0); - __m512i mask_vec = _mm512_set1_epi64(mask); - __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); - deltas = _mm512_and_si512(mask_vec, deltas); - __m512i counters = - _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); - *out_lo = _mm512_cvtepi64_epi32(counters); - *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); -} - -void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m256i h_vecs[8] = { - set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), - set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), - }; - __m256i counter_low_vec, counter_high_vec; - load_counters8(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); - __m256i block_flags_vec = set1_256(block_flags); - __m256i msg_vecs[16]; - transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn8(v, msg_vecs, 0); - round_fn8(v, msg_vecs, 1); - round_fn8(v, msg_vecs, 2); - round_fn8(v, msg_vecs, 3); - round_fn8(v, msg_vecs, 4); - round_fn8(v, msg_vecs, 5); - round_fn8(v, msg_vecs, 6); - h_vecs[0] = xor_256(v[0], v[8]); - h_vecs[1] = xor_256(v[1], v[9]); - h_vecs[2] = xor_256(v[2], v[10]); - h_vecs[3] = xor_256(v[3], v[11]); - h_vecs[4] = xor_256(v[4], v[12]); - h_vecs[5] = xor_256(v[5], v[13]); - h_vecs[6] = xor_256(v[6], v[14]); - h_vecs[7] = xor_256(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs_256(h_vecs); - storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); - storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); - storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); - storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); - storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); - storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); - storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); - storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash16_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_512(v[0], v[4]); - v[1] = add_512(v[1], v[5]); - v[2] = add_512(v[2], v[6]); - v[3] = add_512(v[3], v[7]); - v[12] = xor_512(v[12], v[0]); - v[13] = xor_512(v[13], v[1]); - v[14] = xor_512(v[14], v[2]); - v[15] = xor_512(v[15], v[3]); - v[12] = rot16_512(v[12]); - v[13] = rot16_512(v[13]); - v[14] = rot16_512(v[14]); - v[15] = rot16_512(v[15]); - v[8] = add_512(v[8], v[12]); - v[9] = add_512(v[9], v[13]); - v[10] = add_512(v[10], v[14]); - v[11] = add_512(v[11], v[15]); - v[4] = xor_512(v[4], v[8]); - v[5] = xor_512(v[5], v[9]); - v[6] = xor_512(v[6], v[10]); - v[7] = xor_512(v[7], 
v[11]); - v[4] = rot12_512(v[4]); - v[5] = rot12_512(v[5]); - v[6] = rot12_512(v[6]); - v[7] = rot12_512(v[7]); - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_512(v[0], v[4]); - v[1] = add_512(v[1], v[5]); - v[2] = add_512(v[2], v[6]); - v[3] = add_512(v[3], v[7]); - v[12] = xor_512(v[12], v[0]); - v[13] = xor_512(v[13], v[1]); - v[14] = xor_512(v[14], v[2]); - v[15] = xor_512(v[15], v[3]); - v[12] = rot8_512(v[12]); - v[13] = rot8_512(v[13]); - v[14] = rot8_512(v[14]); - v[15] = rot8_512(v[15]); - v[8] = add_512(v[8], v[12]); - v[9] = add_512(v[9], v[13]); - v[10] = add_512(v[10], v[14]); - v[11] = add_512(v[11], v[15]); - v[4] = xor_512(v[4], v[8]); - v[5] = xor_512(v[5], v[9]); - v[6] = xor_512(v[6], v[10]); - v[7] = xor_512(v[7], v[11]); - v[4] = rot7_512(v[4]); - v[5] = rot7_512(v[5]); - v[6] = rot7_512(v[6]); - v[7] = rot7_512(v[7]); - - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_512(v[0], v[5]); - v[1] = add_512(v[1], v[6]); - v[2] = add_512(v[2], v[7]); - v[3] = add_512(v[3], v[4]); - v[15] = xor_512(v[15], v[0]); - v[12] = xor_512(v[12], v[1]); - v[13] = xor_512(v[13], v[2]); - v[14] = xor_512(v[14], v[3]); - v[15] = rot16_512(v[15]); - v[12] = rot16_512(v[12]); - v[13] = rot16_512(v[13]); - v[14] = rot16_512(v[14]); - v[10] = add_512(v[10], v[15]); - v[11] = add_512(v[11], v[12]); - v[8] = add_512(v[8], v[13]); - v[9] = add_512(v[9], v[14]); - v[5] = xor_512(v[5], v[10]); - v[6] = xor_512(v[6], v[11]); - v[7] = xor_512(v[7], v[8]); - v[4] = xor_512(v[4], v[9]); - v[5] = rot12_512(v[5]); - v[6] = rot12_512(v[6]); - v[7] = rot12_512(v[7]); - v[4] = rot12_512(v[4]); - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_512(v[0], v[5]); - v[1] = add_512(v[1], v[6]); - v[2] = add_512(v[2], v[7]); - v[3] = add_512(v[3], v[4]); - v[15] = xor_512(v[15], v[0]); - v[12] = xor_512(v[12], v[1]); - v[13] = xor_512(v[13], v[2]); - v[14] = xor_512(v[14], v[3]); - v[15] = rot8_512(v[15]); - v[12] = rot8_512(v[12]); - v[13] = rot8_512(v[13]); - v[14] = rot8_512(v[14]); - v[10] = add_512(v[10], v[15]); - v[11] = add_512(v[11], v[12]); - v[8] = add_512(v[8], v[13]); - v[9] = add_512(v[9], v[14]); - v[5] = xor_512(v[5], v[10]); - v[6] = xor_512(v[6], v[11]); - v[7] = xor_512(v[7], v[8]); - v[4] = xor_512(v[4], v[9]); - v[5] = rot7_512(v[5]); - v[6] = rot7_512(v[6]); - v[7] = rot7_512(v[7]); - v[4] = rot7_512(v[4]); -} - -// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order -#define LO_IMM8 0x88 - -INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { - return _mm512_shuffle_i32x4(a, b, LO_IMM8); -} - -// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order -#define HI_IMM8 0xdd - -INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { - return _mm512_shuffle_i32x4(a, b, HI_IMM8); -} - -INLINE void transpose_vecs_512(__m512i vecs[16]) { - // Interleave 32-bit lanes. The _0 unpack is lanes - // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes - // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 
- __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); - __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); - __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); - __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); - __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); - __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); - __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); - __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); - __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); - __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); - __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); - __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); - __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); - __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); - __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); - __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); - - // Interleave 64-bit lates. The _0 unpack is lanes - // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes - // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes - // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes - // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. - __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); - __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); - __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); - __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); - __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); - __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); - __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); - __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); - __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); - __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); - __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); - __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); - __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); - __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); - __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); - __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); - - // Interleave 128-bit lanes. The _0 unpack is - // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is - // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. - __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); - __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); - __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); - __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); - __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); - __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); - __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); - __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); - __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); - __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); - __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); - __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); - __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); - __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); - __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); - __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); - - // Interleave 128-bit lanes again for the final outputs. 
- vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); - vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); - vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); - vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); - vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); - vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); - vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); - vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); - vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); - vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); - vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); - vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); - vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); - vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); - vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); - vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); -} - -INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, - size_t block_offset, __m512i out[16]) { - out[0] = loadu_512(&inputs[0][block_offset]); - out[1] = loadu_512(&inputs[1][block_offset]); - out[2] = loadu_512(&inputs[2][block_offset]); - out[3] = loadu_512(&inputs[3][block_offset]); - out[4] = loadu_512(&inputs[4][block_offset]); - out[5] = loadu_512(&inputs[5][block_offset]); - out[6] = loadu_512(&inputs[6][block_offset]); - out[7] = loadu_512(&inputs[7][block_offset]); - out[8] = loadu_512(&inputs[8][block_offset]); - out[9] = loadu_512(&inputs[9][block_offset]); - out[10] = loadu_512(&inputs[10][block_offset]); - out[11] = loadu_512(&inputs[11][block_offset]); - out[12] = loadu_512(&inputs[12][block_offset]); - out[13] = loadu_512(&inputs[13][block_offset]); - out[14] = loadu_512(&inputs[14][block_offset]); - out[15] = loadu_512(&inputs[15][block_offset]); - for (size_t i = 0; i < 16; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_512(out); -} - -INLINE void load_counters16(uint64_t counter, bool increment_counter, - __m512i *out_lo, __m512i *out_hi) { - const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); - const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const __m512i add1 = _mm512_and_si512(mask, add0); - __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); - __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); - __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out) { - __m512i h_vecs[8] = { - set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), - set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), - }; - __m512i counter_low_vec, counter_high_vec; - load_counters16(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); - __m512i block_flags_vec = set1_512(block_flags); - __m512i msg_vecs[16]; - transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m512i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), - 
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn16(v, msg_vecs, 0); - round_fn16(v, msg_vecs, 1); - round_fn16(v, msg_vecs, 2); - round_fn16(v, msg_vecs, 3); - round_fn16(v, msg_vecs, 4); - round_fn16(v, msg_vecs, 5); - round_fn16(v, msg_vecs, 6); - h_vecs[0] = xor_512(v[0], v[8]); - h_vecs[1] = xor_512(v[1], v[9]); - h_vecs[2] = xor_512(v[2], v[10]); - h_vecs[3] = xor_512(v[3], v[11]); - h_vecs[4] = xor_512(v[4], v[12]); - h_vecs[5] = xor_512(v[5], v[13]); - h_vecs[6] = xor_512(v[6], v[14]); - h_vecs[7] = xor_512(v[7], v[15]); - - block_flags = flags; - } - - // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 - // state vectors. Pad the matrix with zeros. After transposition, store the - // lower half of each vector. - __m512i padded[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(0), set1_512(0), set1_512(0), set1_512(0), - set1_512(0), set1_512(0), set1_512(0), set1_512(0), - }; - transpose_vecs_512(padded); - _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); - _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); - _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); - _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); - _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); - _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); - _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); - _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); - _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); - _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); - _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); - _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); - _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); - _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); - _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); - _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); -} - -/* - * ---------------------------------------------------------------------------- - * hash_many_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_avx512(const uint8_t *const 
*inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= 16) { - blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 16; - } - inputs += 16; - num_inputs -= 16; - out = &out[16 * BLAKE3_OUT_LEN]; - } - while (num_inputs >= 8) { - blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 8; - } - inputs += 8; - num_inputs -= 8; - out = &out[8 * BLAKE3_OUT_LEN]; - } - while (num_inputs >= 4) { - blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 4; - } - inputs += 4; - num_inputs -= 4; - out = &out[4 * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_avx512_x86-64_unix.S b/src/b3/blake3_avx512_x86-64_unix.S deleted file mode 100644 index 621e1aa6d..000000000 --- a/src/b3/blake3_avx512_x86-64_unix.S +++ /dev/null @@ -1,2572 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix - -.global _blake3_hash_many_avx512 -.global blake3_hash_many_avx512 -.global blake3_compress_in_place_avx512 -.global _blake3_compress_in_place_avx512 -.global blake3_compress_xof_avx512 -.global _blake3_compress_xof_avx512 - -#ifdef __APPLE__ -.text -#else -.section .text -#endif -.p2align 6 -_blake3_hash_many_avx512: -blake3_hash_many_avx512: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 144 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9 - kmovw k1, r9d - vmovd xmm0, r8d - vpbroadcastd ymm0, xmm0 - shr r8, 32 - vmovd xmm1, r8d - vpbroadcastd ymm1, xmm1 - vmovdqa ymm4, ymm1 - vmovdqa ymm5, ymm1 - vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] - vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] - vpcmpltud k2, ymm2, ymm0 - vpcmpltud k3, ymm3, ymm0 - vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} - vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} - knotw k2, k1 - vmovdqa32 ymm2 {k2}, ymm0 - vmovdqa32 ymm3 {k2}, ymm0 - vmovdqa32 ymm4 {k2}, ymm1 - vmovdqa32 ymm5 {k2}, ymm1 - vmovdqa ymmword ptr [rsp], ymm2 - vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 - vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 - vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 - shl rdx, 6 - mov qword ptr [rsp+0x80], rdx - cmp rsi, 16 - jc 3f -2: - vpbroadcastd zmm0, dword ptr [rcx] - vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] - vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] - vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] - vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] - vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] - vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] - vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr 
[rdi+0x58] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm8, zmm16, zmm17 - vpunpckhqdq zmm9, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm10, zmm18, zmm19 - vpunpckhqdq zmm11, zmm18, zmm19 - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm12, zmm16, zmm17 - vpunpckhqdq zmm13, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm14, zmm18, zmm19 - vpunpckhqdq zmm15, zmm18, zmm19 - vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] - vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] - vshufps zmm16, zmm8, zmm10, 136 - vshufps zmm17, zmm12, zmm14, 136 - vmovdqa32 zmm20, zmm16 - vpermt2d zmm16, zmm27, zmm17 - vpermt2d zmm20, zmm31, zmm17 - vshufps zmm17, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm21, zmm17 - vpermt2d zmm17, zmm27, zmm30 - vpermt2d zmm21, zmm31, zmm30 - vshufps zmm18, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm22, zmm18 - vpermt2d zmm18, zmm27, zmm8 - vpermt2d zmm22, zmm31, zmm8 - vshufps zmm19, zmm9, zmm11, 221 - vshufps zmm8, zmm13, zmm15, 221 - vmovdqa32 zmm23, zmm19 - vpermt2d zmm19, zmm27, zmm8 - vpermt2d zmm23, zmm31, zmm8 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr [rdi+0x58] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm8, zmm24, zmm25 - vpunpckhqdq zmm9, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm10, zmm24, zmm25 - vpunpckhqdq zmm11, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 
zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm12, zmm24, zmm25 - vpunpckhqdq zmm13, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm14, zmm24, zmm25 - vpunpckhqdq zmm15, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vshufps zmm24, zmm8, zmm10, 136 - vshufps zmm30, zmm12, zmm14, 136 - vmovdqa32 zmm28, zmm24 - vpermt2d zmm24, zmm27, zmm30 - vpermt2d zmm28, zmm31, zmm30 - vshufps zmm25, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm29, zmm25 - vpermt2d zmm25, zmm27, zmm30 - vpermt2d zmm29, zmm31, zmm30 - vshufps zmm26, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm30, zmm26 - vpermt2d zmm26, zmm27, zmm8 - vpermt2d zmm30, zmm31, zmm8 - vshufps zmm8, zmm9, zmm11, 221 - vshufps zmm10, zmm13, zmm15, 221 - vpermi2d zmm27, zmm8, zmm10 - vpermi2d zmm31, zmm8, zmm10 - vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] - vmovdqa32 zmm12, zmmword ptr [rsp] - vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] - vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm24 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 
16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm23 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, 
zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm27 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm21 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, 
zmm30 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm28 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - 
vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm26 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, 
zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm22 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm31 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 
16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpxord zmm0, zmm0, zmm8 - vpxord zmm1, zmm1, zmm9 - vpxord zmm2, zmm2, zmm10 - vpxord zmm3, zmm3, zmm11 - vpxord zmm4, zmm4, zmm12 - vpxord zmm5, zmm5, zmm13 - vpxord zmm6, zmm6, zmm14 - vpxord zmm7, zmm7, zmm15 - movzx eax, byte ptr [rbp+0x38] - jne 9b - mov rbx, qword ptr [rbp+0x50] - vpunpckldq zmm16, zmm0, zmm1 - vpunpckhdq zmm17, zmm0, zmm1 - vpunpckldq zmm18, zmm2, zmm3 - vpunpckhdq zmm19, zmm2, zmm3 - vpunpckldq zmm20, zmm4, zmm5 - vpunpckhdq zmm21, zmm4, zmm5 - vpunpckldq zmm22, zmm6, zmm7 - vpunpckhdq zmm23, zmm6, zmm7 - vpunpcklqdq zmm0, zmm16, zmm18 - vpunpckhqdq zmm1, zmm16, zmm18 - vpunpcklqdq zmm2, zmm17, zmm19 - vpunpckhqdq zmm3, zmm17, zmm19 - vpunpcklqdq zmm4, zmm20, zmm22 - vpunpckhqdq zmm5, zmm20, zmm22 - vpunpcklqdq zmm6, zmm21, zmm23 - vpunpckhqdq zmm7, zmm21, zmm23 - vshufi32x4 zmm16, zmm0, zmm4, 0x88 - vshufi32x4 zmm17, zmm1, zmm5, 0x88 - vshufi32x4 zmm18, zmm2, zmm6, 0x88 - vshufi32x4 zmm19, zmm3, zmm7, 0x88 - vshufi32x4 zmm20, zmm0, zmm4, 0xDD - vshufi32x4 zmm21, zmm1, zmm5, 0xDD - vshufi32x4 zmm22, zmm2, zmm6, 0xDD - vshufi32x4 zmm23, zmm3, zmm7, 0xDD - vshufi32x4 zmm0, zmm16, zmm17, 0x88 - vshufi32x4 zmm1, zmm18, zmm19, 0x88 - vshufi32x4 zmm2, zmm20, zmm21, 0x88 - vshufi32x4 zmm3, zmm22, zmm23, 0x88 - vshufi32x4 zmm4, zmm16, zmm17, 0xDD - vshufi32x4 zmm5, zmm18, zmm19, 0xDD - vshufi32x4 zmm6, zmm20, zmm21, 0xDD - vshufi32x4 zmm7, zmm22, zmm23, 0xDD - vmovdqu32 zmmword ptr [rbx], zmm0 - vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 - vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 - vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 - vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 - vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 - vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 - vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 - vmovdqa32 zmm0, zmmword ptr [rsp] - vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] - vmovdqa32 zmm2, zmm0 - vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} - vpcmpltud k2, zmm2, zmm0 - vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} - vmovdqa32 zmmword ptr [rsp], zmm2 - vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 - add rdi, 128 - add rbx, 512 - mov qword ptr [rbp+0x50], rbx - sub rsi, 16 - cmp rsi, 16 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 6 -3: - test esi, 0x8 - je 3f - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - vpbroadcastd ymm3, 
dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -2: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm16, ymm12, ymm14, 136 - vshufps ymm17, ymm12, ymm14, 221 - vshufps ymm18, ymm13, ymm15, 136 - vshufps ymm19, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm20, ymm12, ymm14, 136 - vshufps ymm21, ymm12, ymm14, 221 - vshufps ymm22, ymm13, ymm15, 136 - vshufps ymm23, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm24, ymm12, ymm14, 136 - vshufps ymm25, ymm12, ymm14, 221 - vshufps ymm26, ymm13, ymm15, 136 - vshufps ymm27, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm28, ymm12, ymm14, 136 - vshufps ymm29, ymm12, ymm14, 221 - vshufps ymm30, ymm13, ymm15, 136 - vshufps ymm31, ymm13, ymm15, 221 - vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd ymm11, 
dword ptr [BLAKE3_IV_3+rip] - vmovdqa ymm12, ymmword ptr [rsp] - vmovdqa ymm13, ymmword ptr [rsp+0x40] - vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd ymm15, dword ptr [rsp+0x88] - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm24 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm23 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 
- vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, 
ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm27 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm21 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, 
ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm28 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm26 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord 
ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm22 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, 
ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm31 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x38] - jne 2b - mov rbx, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, 
ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp] - vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] - vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] - vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] - vmovdqa ymmword ptr [rsp], ymm0 - vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 - add rbx, 256 - mov qword ptr [rbp+0x50], rbx - add rdi, 64 - sub rsi, 8 -3: - mov rbx, qword ptr [rbp+0x50] - mov r15, qword ptr [rsp+0x80] - movzx r13, byte ptr [rbp+0x38] - movzx r12, byte ptr [rbp+0x48] - test esi, 0x4 - je 3f - vbroadcasti32x4 zmm0, xmmword ptr [rcx] - vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] - vmovdqa xmm12, xmmword ptr [rsp] - vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] - vpunpckldq xmm14, xmm12, xmm13 - vpunpckhdq xmm15, xmm12, xmm13 - vpermq ymm14, ymm14, 0xDC - vpermq ymm15, ymm15, 0xDC - vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] - vinserti64x4 zmm13, zmm14, ymm15, 0x01 - mov eax, 17476 - kmovw k2, eax - vpblendmd zmm13 {k2}, zmm13, zmm12 - vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov eax, 43690 - kmovw k3, eax - mov eax, 34952 - kmovw k4, eax - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vmovdqa32 zmm2, zmm15 - vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] - vpblendmd zmm3 {k4}, zmm13, zmm8 - vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x30] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 - vshufps zmm4, zmm8, zmm9, 136 - vshufps zmm5, zmm8, zmm9, 221 - vmovups zmm8, zmmword ptr [r8+rdx-0x20] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x10] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr 
[r11+rdx-0x1*0x10], 0x03 - vshufps zmm6, zmm8, zmm9, 136 - vshufps zmm7, zmm8, zmm9, 221 - vpshufd zmm6, zmm6, 0x93 - vpshufd zmm7, zmm7, 0x93 - mov al, 7 -9: - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x93 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x39 - vpaddd zmm0, zmm0, zmm6 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm7 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x39 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x93 - dec al - jz 9f - vshufps zmm8, zmm4, zmm5, 214 - vpshufd zmm9, zmm4, 0x0F - vpshufd zmm4, zmm8, 0x39 - vshufps zmm8, zmm6, zmm7, 250 - vpblendmd zmm9 {k3}, zmm9, zmm8 - vpunpcklqdq zmm8, zmm7, zmm5 - vpblendmd zmm8 {k4}, zmm8, zmm6 - vpshufd zmm8, zmm8, 0x78 - vpunpckhdq zmm5, zmm5, zmm7 - vpunpckldq zmm6, zmm6, zmm5 - vpshufd zmm7, zmm6, 0x1E - vmovdqa32 zmm5, zmm9 - vmovdqa32 zmm6, zmm8 - jmp 9b -9: - vpxord zmm0, zmm0, zmm2 - vpxord zmm1, zmm1, zmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 - vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 - vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 - vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x40] - vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] - vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x40], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test esi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x4] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x88] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - 
vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] - vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] - vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm14, dword ptr [rsp] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vpinsrd xmm3, xmm14, eax, 3 - vmovdqa xmm2, xmm15 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord 
xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b -.p2align 6 -_blake3_compress_in_place_avx512: -blake3_compress_in_place_avx512: - vmovdqu xmm0, xmmword ptr [rdi] - vmovdqu xmm1, xmmword ptr [rdi+0x10] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - vmovq xmm3, rcx - vmovq xmm4, rdx - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rsi] - vmovups xmm9, xmmword ptr [rsi+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rsi+0x20] - vmovups xmm9, xmmword ptr [rsi+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vmovdqu xmmword ptr [rdi], xmm0 - vmovdqu xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -_blake3_compress_xof_avx512: -blake3_compress_xof_avx512: - vmovdqu xmm0, xmmword ptr [rdi] - vmovdqu xmm1, xmmword ptr [rdi+0x10] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - vmovq xmm3, rcx - vmovq xmm4, rdx - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rsi] - vmovups xmm9, xmmword ptr [rsi+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rsi+0x20] - vmovups xmm9, xmmword ptr [rsi+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord 
xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vpxor xmm2, xmm2, [rdi] - vpxor xmm3, xmm3, [rdi+0x10] - vmovdqu xmmword ptr [r9], xmm0 - vmovdqu xmmword ptr [r9+0x10], xmm1 - vmovdqu xmmword ptr [r9+0x20], xmm2 - vmovdqu xmmword ptr [r9+0x30], xmm3 - ret - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -INDEX0: - .long 0, 1, 2, 3, 16, 17, 18, 19 - .long 8, 9, 10, 11, 24, 25, 26, 27 -INDEX1: - .long 4, 5, 6, 7, 20, 21, 22, 23 - .long 12, 13, 14, 15, 28, 29, 30, 31 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 - .long 8, 9, 10, 11, 12, 13, 14, 15 -ADD1: .long 1 - -ADD16: .long 16 -BLAKE3_BLOCK_LEN: - .long 64 -.p2align 6 -BLAKE3_IV: -BLAKE3_IV_0: - .long 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A - -#endif // __x86_64__ diff --git a/src/b3/blake3_dispatch.c b/src/b3/blake3_dispatch.c deleted file mode 100644 index 684772564..000000000 --- a/src/b3/blake3_dispatch.c +++ /dev/null @@ -1,245 +0,0 @@ -#include -#include -#include - -#include "blake3_impl.h" - -#if defined(IS_X86) -#if defined(_MSC_VER) -#include -#elif defined(__GNUC__) -#include -#else -#error "Unimplemented!" 
-#endif
-#endif
-
-#if defined(IS_X86)
-static uint64_t xgetbv() {
-#if defined(_MSC_VER)
-  return _xgetbv(0);
-#else
-  uint32_t eax = 0, edx = 0;
-  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
-  return ((uint64_t)edx << 32) | eax;
-#endif
-}
-
-static void cpuid(uint32_t out[4], uint32_t id) {
-#if defined(_MSC_VER)
-  __cpuid((int *)out, id);
-#elif defined(__i386__) || defined(_M_IX86)
-  __asm__ __volatile__("movl %%ebx, %1\n"
-                       "cpuid\n"
-                       "xchgl %1, %%ebx\n"
-                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
-                       : "a"(id));
-#else
-  __asm__ __volatile__("cpuid\n"
-                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
-                       : "a"(id));
-#endif
-}
-
-static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
-#if defined(_MSC_VER)
-  __cpuidex((int *)out, id, sid);
-#elif defined(__i386__) || defined(_M_IX86)
-  __asm__ __volatile__("movl %%ebx, %1\n"
-                       "cpuid\n"
-                       "xchgl %1, %%ebx\n"
-                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
-                       : "a"(id), "c"(sid));
-#else
-  __asm__ __volatile__("cpuid\n"
-                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
-                       : "a"(id), "c"(sid));
-#endif
-}
-
-#endif
-
-enum cpu_feature {
-  SSE2 = 1 << 0,
-  SSSE3 = 1 << 1,
-  SSE41 = 1 << 2,
-  AVX = 1 << 3,
-  AVX2 = 1 << 4,
-  AVX512F = 1 << 5,
-  AVX512VL = 1 << 6,
-  /* ... */
-  UNDEFINED = 1 << 30
-};
-
-#if !defined(BLAKE3_TESTING)
-static /* Allow the variable to be controlled manually for testing */
-#endif
-    enum cpu_feature g_cpu_features = UNDEFINED;
-
-#if !defined(BLAKE3_TESTING)
-static
-#endif
-    enum cpu_feature
-    get_cpu_features() {
-
-  if (g_cpu_features != UNDEFINED) {
-    return g_cpu_features;
-  } else {
-#if defined(IS_X86)
-    uint32_t regs[4] = {0};
-    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
-    (void)edx;
-    enum cpu_feature features = 0;
-    cpuid(regs, 0);
-    const int max_id = *eax;
-    cpuid(regs, 1);
-#if defined(__amd64__) || defined(_M_X64)
-    features |= SSE2;
-#else
-    if (*edx & (1UL << 26))
-      features |= SSE2;
-#endif
-    if (*ecx & (1UL << 0))
-      features |= SSSE3;
-    if (*ecx & (1UL << 19))
-      features |= SSE41;
-
-    if (*ecx & (1UL << 27)) { // OSXSAVE
-      const uint64_t mask = xgetbv();
-      if ((mask & 6) == 6) { // SSE and AVX states
-        if (*ecx & (1UL << 28))
-          features |= AVX;
-        if (max_id >= 7) {
-          cpuidex(regs, 7, 0);
-          if (*ebx & (1UL << 5))
-            features |= AVX2;
-          if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
-            if (*ebx & (1UL << 31))
-              features |= AVX512VL;
-            if (*ebx & (1UL << 16))
-              features |= AVX512F;
-          }
-        }
-      }
-    }
-    g_cpu_features = features;
-    return features;
-#else
-    /* How to detect NEON? */
*/ - return 0; -#endif - } -} - -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if (features & AVX512VL) { - blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); - return; - } -#endif -#endif - blake3_compress_in_place_portable(cv, block, block_len, counter, flags); -} - -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if (features & AVX512VL) { - blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); - return; - } -#endif -#endif - blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); -} - -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { - blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_AVX2) - if (features & AVX2) { - blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#endif - -#if defined(BLAKE3_USE_NEON) - blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - return; -#endif - - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); -} - -// The dynamically detected SIMD degree of the current platform. 
-size_t blake3_simd_degree(void) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { - return 16; - } -#endif -#if !defined(BLAKE3_NO_AVX2) - if (features & AVX2) { - return 8; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - return 4; - } -#endif -#endif -#if defined(BLAKE3_USE_NEON) - return 4; -#endif - return 1; -} diff --git a/src/b3/blake3_impl.h b/src/b3/blake3_impl.h deleted file mode 100644 index c384671f0..000000000 --- a/src/b3/blake3_impl.h +++ /dev/null @@ -1,235 +0,0 @@ -#ifndef BLAKE3_IMPL_H -#define BLAKE3_IMPL_H - -#include -#include -#include -#include -#include - -#include "blake3.h" - -// internal flags -enum blake3_flags { - CHUNK_START = 1 << 0, - CHUNK_END = 1 << 1, - PARENT = 1 << 2, - ROOT = 1 << 3, - KEYED_HASH = 1 << 4, - DERIVE_KEY_CONTEXT = 1 << 5, - DERIVE_KEY_MATERIAL = 1 << 6, -}; - -// This C implementation tries to support recent versions of GCC, Clang, and -// MSVC. -#if defined(_MSC_VER) -#define INLINE static __forceinline -#else -#define INLINE static inline __attribute__((always_inline)) -#endif - -#if defined(__x86_64__) || defined(_M_X64) -#define IS_X86 -#define IS_X86_64 -#endif - -#if defined(__i386__) || defined(_M_IX86) -#define IS_X86 -#define IS_X86_32 -#endif - -#if defined(IS_X86) -#if defined(_MSC_VER) -#include -#endif -#include -#endif - -#if defined(IS_X86) -#define MAX_SIMD_DEGREE 16 -#elif defined(BLAKE3_USE_NEON) -#define MAX_SIMD_DEGREE 4 -#else -#define MAX_SIMD_DEGREE 1 -#endif - -// There are some places where we want a static size that's equal to the -// MAX_SIMD_DEGREE, but also at least 2. -#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) - -static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, - 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, - 0x1F83D9ABUL, 0x5BE0CD19UL}; - -static const uint8_t MSG_SCHEDULE[7][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, - {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, - {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, - {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, - {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, - {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, -}; - -/* Find index of the highest set bit */ -/* x is assumed to be nonzero. */ -static unsigned int highest_one(uint64_t x) { -#if defined(__GNUC__) || defined(__clang__) - return 63 ^ __builtin_clzll(x); -#elif defined(_MSC_VER) && defined(IS_X86_64) - unsigned long index; - _BitScanReverse64(&index, x); - return index; -#elif defined(_MSC_VER) && defined(IS_X86_32) - if(x >> 32) { - unsigned long index; - _BitScanReverse(&index, x >> 32); - return 32 + index; - } else { - unsigned long index; - _BitScanReverse(&index, x); - return index; - } -#else - unsigned int c = 0; - if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } - if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } - if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } - if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } - if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } - if(x & 0x0000000000000002ULL) { c += 1; } - return c; -#endif -} - -// Count the number of 1 bits. 
-INLINE unsigned int popcnt(uint64_t x) { -#if defined(__GNUC__) || defined(__clang__) - return __builtin_popcountll(x); -#else - unsigned int count = 0; - while (x != 0) { - count += 1; - x &= x - 1; - } - return count; -#endif -} - -// Largest power of two less than or equal to x. As a special case, returns 1 -// when x is 0. -INLINE uint64_t round_down_to_power_of_2(uint64_t x) { - return 1ULL << highest_one(x | 1); -} - -INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } - -INLINE uint32_t counter_high(uint64_t counter) { - return (uint32_t)(counter >> 32); -} - -INLINE uint32_t load32(const void *src) { - const uint8_t *p = (const uint8_t *)src; - return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | - ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); -} - -INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], - uint32_t key_words[8]) { - key_words[0] = load32(&key[0 * 4]); - key_words[1] = load32(&key[1 * 4]); - key_words[2] = load32(&key[2 * 4]); - key_words[3] = load32(&key[3 * 4]); - key_words[4] = load32(&key[4 * 4]); - key_words[5] = load32(&key[5 * 4]); - key_words[6] = load32(&key[6 * 4]); - key_words[7] = load32(&key[7 * 4]); -} - -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]); - -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -size_t blake3_simd_degree(void); - - -// Declarations for implementation-specific functions. 
-void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); - -#if defined(IS_X86) -#if !defined(BLAKE3_NO_SSE41) -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_AVX2) -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_AVX512) -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#endif - -#if defined(BLAKE3_USE_NEON) -void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif - - -#endif /* BLAKE3_IMPL_H */ diff --git a/src/b3/blake3_portable.c b/src/b3/blake3_portable.c deleted file mode 100644 index 9ee2f4a42..000000000 --- a/src/b3/blake3_portable.c +++ /dev/null @@ -1,168 +0,0 @@ -#include "blake3_impl.h" -#include - -INLINE void store32(void *dst, uint32_t w) { - uint8_t *p = (uint8_t *)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); -} - -INLINE uint32_t rotr32(uint32_t w, uint32_t c) { - return (w >> c) | (w << (32 - c)); -} - -INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, - uint32_t x, uint32_t y) { - state[a] = state[a] + state[b] + x; - state[d] = rotr32(state[d] ^ state[a], 16); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 12); - state[a] = state[a] + state[b] + y; - state[d] = rotr32(state[d] ^ state[a], 8); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 7); -} - -INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { - // Select the message schedule based on the round. - const uint8_t *schedule = MSG_SCHEDULE[round]; - - // Mix the columns. 
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the rows. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - uint32_t block_words[16]; - block_words[0] = load32(block + 4 * 0); - block_words[1] = load32(block + 4 * 1); - block_words[2] = load32(block + 4 * 2); - block_words[3] = load32(block + 4 * 3); - block_words[4] = load32(block + 4 * 4); - block_words[5] = load32(block + 4 * 5); - block_words[6] = load32(block + 4 * 6); - block_words[7] = load32(block + 4 * 7); - block_words[8] = load32(block + 4 * 8); - block_words[9] = load32(block + 4 * 9); - block_words[10] = load32(block + 4 * 10); - block_words[11] = load32(block + 4 * 11); - block_words[12] = load32(block + 4 * 12); - block_words[13] = load32(block + 4 * 13); - block_words[14] = load32(block + 4 * 14); - block_words[15] = load32(block + 4 * 15); - - state[0] = cv[0]; - state[1] = cv[1]; - state[2] = cv[2]; - state[3] = cv[3]; - state[4] = cv[4]; - state[5] = cv[5]; - state[6] = cv[6]; - state[7] = cv[7]; - state[8] = IV[0]; - state[9] = IV[1]; - state[10] = IV[2]; - state[11] = IV[3]; - state[12] = counter_low(counter); - state[13] = counter_high(counter); - state[14] = (uint32_t)block_len; - state[15] = (uint32_t)flags; - - round_fn(state, &block_words[0], 0); - round_fn(state, &block_words[0], 1); - round_fn(state, &block_words[0], 2); - round_fn(state, &block_words[0], 3); - round_fn(state, &block_words[0], 4); - round_fn(state, &block_words[0], 5); - round_fn(state, &block_words[0], 6); -} - -void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; -} - -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - - store32(&out[0 * 4], state[0] ^ state[8]); - store32(&out[1 * 4], state[1] ^ state[9]); - store32(&out[2 * 4], state[2] ^ state[10]); - store32(&out[3 * 4], state[3] ^ state[11]); - store32(&out[4 * 4], state[4] ^ state[12]); - store32(&out[5 * 4], state[5] ^ state[13]); - store32(&out[6 * 4], state[6] ^ state[14]); - store32(&out[7 * 4], state[7] ^ state[15]); - store32(&out[8 * 4], state[8] ^ cv[0]); - store32(&out[9 * 4], state[9] ^ cv[1]); - store32(&out[10 * 4], state[10] ^ cv[2]); - store32(&out[11 * 4], state[11] ^ cv[3]); - store32(&out[12 * 4], state[12] ^ cv[4]); - store32(&out[13 * 4], state[13] ^ cv[5]); - store32(&out[14 * 4], state[14] ^ cv[6]); - store32(&out[15 * 4], state[15] ^ cv[7]); -} - -INLINE void 
hash_one_portable(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, 32); -} - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs > 0) { - hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_sse41.c b/src/b3/blake3_sse41.c deleted file mode 100644 index b31122533..000000000 --- a/src/b3/blake3_sse41.c +++ /dev/null @@ -1,559 +0,0 @@ -#include "blake3_impl.h" - -#include - -#define DEGREE 4 - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE void storeu(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. -INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); -} - -INLINE __m128i rot12(__m128i x) { - return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); -} - -INLINE __m128i rot8(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); -} - -INLINE __m128i rot7(__m128i x) { - return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); -} - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot16(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot12(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot8(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot7(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu((uint8_t *)&cv[0]); - rows[1] = loadu((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, 
_MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); -} - -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), &out[0]); - storeu(xorv(rows[1], rows[3]), &out[16]); - storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); - storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); -} - -INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = 
addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m128i vecs[DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. 
- __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[4]); - transpose_vecs(&out[8]); - transpose_vecs(&out[12]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); - const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); - const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); - __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), - _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - 
h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&h_vecs[0]); - transpose_vecs(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= DEGREE) { - blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_sse41_x86-64_unix.S b/src/b3/blake3_sse41_x86-64_unix.S deleted file mode 100644 index 024a8290f..000000000 --- a/src/b3/blake3_sse41_x86-64_unix.S +++ /dev/null @@ -1,2014 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix -.global blake3_hash_many_sse41 -.global _blake3_hash_many_sse41 -.global blake3_compress_in_place_sse41 -.global _blake3_compress_in_place_sse41 -.global blake3_compress_xof_sse41 -.global _blake3_compress_xof_sse41 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_sse41: -blake3_hash_many_sse41: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 360 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 0x00 - movdqa xmmword ptr [rsp+0x130], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0+rip] - pand xmm0, xmmword ptr [ADD1+rip] - movdqa xmmword ptr [rsp+0x150], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 0x00 - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+0x110], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm1, xmm0 - shr r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 0x00 - psubd xmm2, xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - mov rbx, qword ptr [rbp+0x50] - mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr 
[rbp+0x48] - cmp rsi, 4 - jc 3f -2: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 0x00 - pshufd xmm1, xmm3, 0x55 - pshufd xmm2, xmm3, 0xAA - pshufd xmm3, xmm3, 0xFF - movdqu xmm7, xmmword ptr [rcx+0x10] - pshufd xmm4, xmm7, 0x00 - pshufd xmm5, xmm7, 0x55 - pshufd xmm6, xmm7, 0xAA - pshufd xmm7, xmm7, 0xFF - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -9: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-0x40] - movdqu xmm9, xmmword ptr [r9+rdx-0x40] - movdqu xmm10, xmmword ptr [r10+rdx-0x40] - movdqu xmm11, xmmword ptr [r11+rdx-0x40] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+0x10], xmm9 - movdqa xmmword ptr [rsp+0x20], xmm12 - movdqa xmmword ptr [rsp+0x30], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x30] - movdqu xmm9, xmmword ptr [r9+rdx-0x30] - movdqu xmm10, xmmword ptr [r10+rdx-0x30] - movdqu xmm11, xmmword ptr [r11+rdx-0x30] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x40], xmm8 - movdqa xmmword ptr [rsp+0x50], xmm9 - movdqa xmmword ptr [rsp+0x60], xmm12 - movdqa xmmword ptr [rsp+0x70], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x20] - movdqu xmm9, xmmword ptr [r9+rdx-0x20] - movdqu xmm10, xmmword ptr [r10+rdx-0x20] - movdqu xmm11, xmmword ptr [r11+rdx-0x20] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x80], xmm8 - movdqa xmmword ptr [rsp+0x90], xmm9 - movdqa xmmword ptr [rsp+0xA0], xmm12 - movdqa xmmword ptr [rsp+0xB0], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x10] - movdqu xmm9, xmmword ptr [r9+rdx-0x10] - movdqu xmm10, xmmword ptr [r10+rdx-0x10] - movdqu xmm11, xmmword ptr [r11+rdx-0x10] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0xC0], xmm8 - movdqa xmmword ptr [rsp+0xD0], xmm9 - movdqa xmmword ptr [rsp+0xE0], xmm12 - movdqa xmmword ptr [rsp+0xF0], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] - movdqa xmm12, xmmword ptr [rsp+0x110] - movdqa xmm13, xmmword ptr [rsp+0x120] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - movd xmm15, eax - pshufd xmm15, xmm15, 0x00 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword 
ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x80] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, 
xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x70] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa 
xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xB0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x50] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - 
movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor 
xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xC0] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xA0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] 
- paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - 
psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0x60] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xF0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa 
xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne 9b - movdqa xmm9, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+0x20], xmm1 - movdqu xmmword ptr [rbx+0x40], xmm9 - movdqu xmmword ptr [rbx+0x60], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+0x10], xmm4 - movdqu xmmword ptr [rbx+0x30], xmm5 - movdqu xmmword ptr [rbx+0x50], xmm9 - movdqu xmmword ptr [rbx+0x70], xmm7 - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+0x150] - movdqa xmmword ptr [rsp+0x110], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+0x120] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+0x120], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc 2b - test rsi, rsi - jnz 3f -4: - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - test esi, 0x2 - je 3f - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp], xmm13 - movd xmm14, dword ptr [rsp+0x114] - pinsrd xmm14, dword ptr [rsp+0x124], 1 - pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp+0x10], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, 
xmmword ptr [r8+rdx-0x30] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 0x93 - movups xmm12, xmmword ptr [r9+rdx-0x40] - movups xmm13, xmmword ptr [r9+rdx-0x30] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-0x20] - movups xmm15, xmmword ptr [r9+rdx-0x10] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 0x93 - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 0x93 - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+0x10] - pinsrd xmm3, eax, 3 - pinsrd xmm11, eax, 3 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+0x20], xmm4 - movaps xmmword ptr [rsp+0x30], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16+rip] - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr [rsp+0x40], xmm5 - movaps xmmword ptr [rsp+0x50], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8+rip] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x93 - pshufd xmm8, xmm8, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x39 - pshufd xmm10, xmm10, 0x39 - paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x39 - pshufd xmm8, xmm8, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x93 - pshufd xmm10, xmm10, 0x93 - dec al - je 9f - movdqa xmm12, xmmword ptr [rsp+0x20] - movdqa xmm5, xmmword ptr [rsp+0x40] - pshufd xmm13, xmm12, 0x0F - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 0x39 - movdqa xmm12, xmm6 - shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0xCC - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0xC0 - pshufd xmm12, xmm12, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmmword ptr [rsp+0x20], xmm13 - movdqa xmmword ptr [rsp+0x40], xmm12 - movdqa xmm5, xmmword ptr [rsp+0x30] - movdqa xmm13, xmmword ptr [rsp+0x50] - pshufd xmm6, xmm5, 0x0F - shufps xmm5, xmm13, 214 - pshufd xmm12, 
xmm5, 0x39 - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0xCC - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0xC0 - pshufd xmm5, xmm5, 0x78 - punpckhdq xmm13, xmm15 - punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 0x1E - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+0x20] - movdqa xmm6, xmmword ptr [rsp+0x40] - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - movups xmmword ptr [rbx+0x20], xmm8 - movups xmmword ptr [rbx+0x30], xmm9 - movdqa xmm0, xmmword ptr [rsp+0x130] - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm2, xmmword ptr [rsp+0x120] - movdqu xmm3, xmmword ptr [rsp+0x118] - movdqu xmm4, xmmword ptr [rsp+0x128] - blendvps xmm1, xmm3, xmm0 - blendvps xmm2, xmm4, xmm0 - movdqa xmmword ptr [rsp+0x110], xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - add rdi, 16 - add rbx, 64 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm3, xmm13 - pinsrd xmm3, eax, 3 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - jmp 4b - -.p2align 6 -blake3_compress_in_place_sse41: -_blake3_compress_in_place_sse41: - movups xmm0, 
xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - shl r8, 32 - add rdx, r8 - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rdi], xmm0 - movups xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -blake3_compress_xof_sse41: -_blake3_compress_xof_sse41: - movups xmm0, xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd 
xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - movdqu xmm4, xmmword ptr [rdi] - movdqu xmm5, xmmword ptr [rdi+0x10] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r9], xmm0 - movups xmmword ptr [r9+0x10], xmm1 - movups xmmword ptr [r9+0x20], xmm2 - movups xmmword ptr [r9+0x30], xmm3 - ret - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85 - .long 0x3C6EF372, 0xA54FF53A -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -ADD0: - .long 0, 1, 2, 3 -ADD1: - .long 4, 4, 4, 4 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 64, 64, 64, 64 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - -#endif // __x86_64__ diff --git a/src/calculate_bucket.hpp b/src/calculate_bucket.hpp index 8d1cbcdac..11d736928 100644 --- a/src/calculate_bucket.hpp +++ b/src/calculate_bucket.hpp @@ -25,7 +25,7 @@ #include #include -#include "b3/blake3.h" +#include "blake3.h" #include "bits.hpp" #include "chacha8.h" #include "pos_constants.hpp" diff --git a/tests/test.cpp b/tests/test.cpp index e2d976df4..a64057ed2 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -621,7 +621,7 @@ void TestProofOfSpace( picosha2::hash256(hash_input.begin(), hash_input.end(), hash.begin(), hash.end()); vector qualities = prover.GetQualitiesForChallenge(hash.data()); Verifier verifier = Verifier(); - + for (uint32_t index = 0; index < qualities.size(); index++) { LargeBits proof = prover.GetFullProof(hash.data(), index); proof.ToBytes(proof_data); @@ -1082,7 +1082,7 @@ TEST_CASE("DiskProver") { SECTION("Move constructor") { - std::string filename = "prover_test.plot"; + std::string filename = "prover_test_with_a_long_name_to_avoid_sso.plot"; DiskPlotter plotter = DiskPlotter(); std::vector memo{1, 2, 3}; plotter.CreatePlotDisk( From 8816486ec6f068e8c744f2e4e0a40f6f497cef48 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Tue, 23 Jan 2024 10:25:22 -0800 Subject: [PATCH 21/27] Add Python 3.12 (#406) Add python 3.12 to matrix --- .github/workflows/build-wheels.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 43cc4f246..74ae4ff65 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -70,6 +70,12 @@ jobs: arm: manylinux2014 intel: manylinux2014 matrix: '3.11' + - major-dot-minor: '3.12' + cibw-build: 'cp312-*' + manylinux: + arm: manylinux2014 + intel: manylinux2014 + matrix: '3.12' arch: - name: ARM matrix: arm From 8e2aa342f640a224ded90f7a054f3fe8a5f34abd Mon Sep 17 
00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Tue, 23 Jan 2024 12:21:49 -0800 Subject: [PATCH 22/27] Remove cmake version check from setup.py (#408) There is a cmake version already specified in `CMakeLists.txt`, and this Windows-specific check in setup.py isn't required (and wasn't accurate anyway) --- setup.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 5f73900c8..59689f22e 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,11 @@ #!/usr/bin/python3 import os -import re import sys import platform import subprocess from setuptools import setup, Extension from setuptools.command.build_ext import build_ext -from distutils.version import LooseVersion class CMakeExtension(Extension): @@ -19,7 +17,7 @@ def __init__(self, name, sourcedir=""): class CMakeBuild(build_ext): def run(self): try: - out = subprocess.check_output(["cmake", "--version"]) + subprocess.check_output(["cmake", "--version"]) except OSError: raise RuntimeError( "CMake must be installed to build" @@ -27,13 +25,6 @@ def run(self): + ", ".join(e.name for e in self.extensions) ) - if platform.system() == "Windows": - cmake_version = LooseVersion( - re.search(r"version\s*([\d.]+)", out.decode()).group(1) - ) - if cmake_version < "3.1.0": - raise RuntimeError("CMake >= 3.1.0 is required on Windows") - for ext in self.extensions: self.build_extension(ext) From 90d9a5ce61d5a6f8d0879131ba75507c8548ef11 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Tue, 23 Jan 2024 12:23:03 -0800 Subject: [PATCH 23/27] Add python versions and upload to chia pypi for riscv workflow (#407) Matches the chiavdf workflow and adds in the python versions and an upload step --- .github/workflows/build-test-riscv64.yml | 147 +++++++++++++++++------ 1 file changed, 108 insertions(+), 39 deletions(-) diff --git a/.github/workflows/build-test-riscv64.yml b/.github/workflows/build-test-riscv64.yml index 58e424218..85bb5acb7 100644 --- a/.github/workflows/build-test-riscv64.yml +++ b/.github/workflows/build-test-riscv64.yml @@ -6,52 +6,121 @@ on: - main - dev tags: - - '**' + - "**" pull_request: branches: - - '**' + - "**" jobs: - build: + build_wheels: timeout-minutes: 60 - name: QEMU riscv64 via Debian on ubuntu-latest + name: ${{ matrix.os }} 📦 Build ${{ matrix.python.major-dot-minor }} runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: - os: [ ubuntu-latest ] + os: [ubuntu-latest] + python: + - major-dot-minor: "3.8" + matrix: "3.8" + - major-dot-minor: "3.9" + matrix: "3.9" + - major-dot-minor: "3.10" + matrix: "3.10" + - major-dot-minor: "3.11" + matrix: "3.11" steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - - name: Set up QEMU on x86_64 - if: startsWith(matrix.os, 'ubuntu-latest') - id: qemu - uses: docker/setup-qemu-action@v2 - with: - platforms: riscv64 -# image: tonistiigi/binfmt:latest - - - name: Build and Test - run: | - docker run --rm --platform linux/riscv64 \ - -v ${{ github.workspace }}:/ws --workdir=/ws \ - chianetwork/ubuntu-22.04-risc-builder:latest \ - bash -exc '\ - cmake --version && \ - uname -a && \ - export CP_USE_GREEN_REAPER=0 && \ - pip wheel -w dist . 
&& \ - python3 -m venv venv && \ - ./venv/bin/python -m pip install dist/*.whl && \ - ./venv/bin/python -m pip install pytest && \ - ./venv/bin/python -m pytest -k "not k_21" -v tests/ - ' - - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - name: packages - path: ./dist + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: Set up QEMU on x86_64 + if: startsWith(matrix.os, 'ubuntu-latest') + id: qemu + uses: docker/setup-qemu-action@v2 + with: + platforms: riscv64 + # image: tonistiigi/binfmt:latest + + - name: Build and Test + run: | + docker run --rm --platform linux/riscv64 \ + -v ${{ github.workspace }}:/ws --workdir=/ws \ + chianetwork/ubuntu-22.04-risc-builder:latest \ + bash -exc '\ + pyenv global ${{ matrix.python.matrix }} && \ + cmake --version && \ + uname -a && \ + export CP_USE_GREEN_REAPER=0 && \ + pip wheel -w dist . && \ + python3 -m venv venv && \ + ./venv/bin/python -m pip install dist/*.whl && \ + ./venv/bin/python -m pip install pytest && \ + ./venv/bin/python -m pytest -k "not k_21" -v tests/ + ' + + - name: Upload artifacts + uses: actions/upload-artifact@v3 + with: + name: packages + path: ./dist + if-no-files-found: error + upload: + name: Upload to Chia PyPI + runs-on: ubuntu-latest + needs: + - build_wheels + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set Env + uses: Chia-Network/actions/setjobenv@main + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Download artifacts + if: env.RELEASE == 'true' + uses: actions/download-artifact@v3 + with: + name: packages + path: ./dist + + - name: Configure AWS credentials + if: env.RELEASE == 'true' + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: arn:aws:iam::${{ secrets.CHIA_AWS_ACCOUNT_ID }}:role/installer-upload + aws-region: us-west-2 + + - name: List existing wheels + if: env.RELEASE == 'true' + shell: sh + run: | + aws s3 ls s3://download.chia.net/simple/chiavdf/ > existing_wheel_list_raw + cat existing_wheel_list_raw + cat existing_wheel_list_raw | tr -s ' ' | cut -d ' ' -f 4 > existing_wheel_list + + - name: List new wheels + if: env.RELEASE == 'true' + shell: sh + run: | + (cd dist/; ls ${{ inputs.package_name }}-*.whl) > new_wheel_list + cat new_wheel_list | xargs -I % sh -c 'ls -l dist/%' + + - name: Choose wheels to upload + if: env.RELEASE == 'true' + shell: sh + run: | + grep -F -x -v -f existing_wheel_list new_wheel_list > upload_wheel_list + cat upload_wheel_list + + - name: Upload wheels + if: env.RELEASE == 'true' + shell: sh + run: | + cat upload_wheel_list | xargs -I % sh -c 'aws s3 cp dist/% s3://download.chia.net/simple/chiapos/' From 76b94767caa5f883c3c93b7ff2b0e10808993fb5 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Tue, 23 Jan 2024 13:02:06 -0800 Subject: [PATCH 24/27] Update gulrak to 1.5.14 (#409) Update to the newest gulrak 1.5.14 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a42d77976..8c486aa70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,7 +56,7 @@ endif() FetchContent_Declare( gulrak GIT_REPOSITORY https://github.com/gulrak/filesystem.git - GIT_TAG v1.5.6 + GIT_TAG v1.5.14 ) FetchContent_MakeAvailable(gulrak) From 57fbead688851dff3f1220ee3fcc700d5436c995 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Tue, 23 Jan 2024 13:44:58 -0800 Subject: [PATCH 25/27] Update catch2 to 
3.5.2 (#410) Update Catch2 to latest released version 3.5.2 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c486aa70..d84043d7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -162,7 +162,7 @@ ENDIF() FetchContent_Declare( Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG v3.3.2 + GIT_TAG v3.5.2 ) FetchContent_MakeAvailable(Catch2) From f35fe5082fac4543be215e41f3524478cf6568dc Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Tue, 23 Jan 2024 13:45:33 -0800 Subject: [PATCH 26/27] Remove small runner workarounds (#411) See if the larger default runners allow us to remove some of these small runner hacks --- .github/workflows/build-test-cplusplus.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/build-test-cplusplus.yml b/.github/workflows/build-test-cplusplus.yml index c0b87a2f3..07b590ebf 100644 --- a/.github/workflows/build-test-cplusplus.yml +++ b/.github/workflows/build-test-cplusplus.yml @@ -42,16 +42,10 @@ jobs: - name: cmake, RunTests with address- and undefined sanitizer on Ubuntu run: | - sudo fallocate -l 16G /swapfile - sudo chmod 600 /swapfile - sudo mkswap /swapfile - sudo swapon /swapfile - swapon -s mkdir build-asan cd build-asan cmake -DCMAKE_BUILD_TYPE=ASAN ../ cmake --build . -- -j 6 - swapon -s ./RunTests tsan: @@ -67,7 +61,7 @@ jobs: cd build-tsan cmake -DCMAKE_BUILD_TYPE=TSAN ../ cmake --build . -- -j 6 - TSAN_OPTIONS="memory_limit_mb=6000" ./RunTests + ./RunTests mac: name: MacOS From f28916f1f0080cfee6a448f410d8fbbe9f0adb05 Mon Sep 17 00:00:00 2001 From: Earle Lowe <30607889+emlowe@users.noreply.github.com> Date: Tue, 23 Jan 2024 13:58:23 -0800 Subject: [PATCH 27/27] Update cxxopts to 3.1.1 (#412) --- CMakeLists.txt | 2 +- src/cli.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d84043d7a..77a48219c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,7 @@ endif() FetchContent_Declare( cxxopts GIT_REPOSITORY https://github.com/jarro2783/cxxopts.git - GIT_TAG v2.2.1 + GIT_TAG v3.1.1 ) FetchContent_MakeAvailable(cxxopts) diff --git a/src/cli.cpp b/src/cli.cpp index 8f96d7364..3044c9785 100644 --- a/src/cli.cpp +++ b/src/cli.cpp @@ -310,7 +310,7 @@ int main(int argc, char *argv[]) try { cout << "Invalid operation '" << operation << "'. Use create/prove/verify/check" << endl; } return 0; -} catch (const cxxopts::OptionException &e) { +} catch (const cxxopts::exceptions::exception &e) { cout << "error parsing options: " << e.what() << endl; return 1; } catch (const std::exception &e) {