From fef7a567c02b0c3c91795777b60c10b3492f0a47 Mon Sep 17 00:00:00 2001 From: Martun Karapetyan Date: Wed, 4 Oct 2023 00:37:42 -0700 Subject: [PATCH] Removing asm code. --- .../multiprecision/modular/asm_functions.hpp | 217 ------------------ .../modular/modular_adaptor.hpp | 27 +-- .../modular/modular_functions_fixed.hpp | 88 +++---- 3 files changed, 35 insertions(+), 297 deletions(-) delete mode 100644 include/nil/crypto3/multiprecision/modular/asm_functions.hpp diff --git a/include/nil/crypto3/multiprecision/modular/asm_functions.hpp b/include/nil/crypto3/multiprecision/modular/asm_functions.hpp deleted file mode 100644 index d140ac9d..00000000 --- a/include/nil/crypto3/multiprecision/modular/asm_functions.hpp +++ /dev/null @@ -1,217 +0,0 @@ -//---------------------------------------------------------------------------// -// Copyright (c) 2020 Mikhail Komarov -// Copyright (c) 2021 Aleksei Moskvin -// -// Distributed under the Boost Software License, Version 1.0 -// See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt -//---------------------------------------------------------------------------// - -#ifndef BOOST_MULTIPRECISION_ASM_FUNCTIONS_HPP -#define BOOST_MULTIPRECISION_ASM_FUNCTIONS_HPP - -#include - -namespace nil { - namespace crypto3 { - namespace multiprecision { - namespace backends { -#if BOOST_ARCH_X86_64 - template - void sub_asm(size_t n, Limb1 *x, const Limb2 *y) { - __asm__ volatile( - "movq (%[y]), %%rax \n\t" - "subq %%rax, (%[x]) \n\t" - "pushf \n\t" - "movq $1, %%rbx \n\t" - // Loop for sub - "4: \n\t" - "popf \n\t" - "movq (%[y], %%rbx, 8), %%rax \n\t" - "sbbq %%rax, (%[x], %%rbx, 8) \n\t" - "pushf \n\t" - "inc %%rbx \n\t" - "cmp %%rbx, %[limbs] \n\t" - "jne 4b \n\t" - "popf \n\t" - : - : [limbs] "r"(n), [x] "r"(x), [y] "r"(y) - : "cc", "memory", "%rax", "%rcx", "%rbx"); - } - - template - bool reduce_limb_asm(const size_t &n, Limb1 *res, const Limb2 *x, const Limb3 &inv) { - bool carry = false; - __asm__ volatile( - // Else check result with mod - "movq $0, %%r12 \n\t" - "0: \n\t" - "movq %%r12, %%r11 \n\t" - - "movq (%[res], %%r11, 8), %%rax \n\t" - "mulq %[inv] \n\t" - "movq %%rax, %%r10 \n\t" - - "movq (%[x]), %%rax \n\t" - "mulq %%r10 \n\t" - "movq %%rax, %%r8 \n\t" - "movq %%rdx, %%r9 \n\t" - - "mov $1, %%rbx \n\t" - "1: \n\t" - "movq (%[x], %%rbx, 8), %%rax \n\t" - "mulq %%r10 \n\t" - "addq %%r8, (%[res], %%r11, 8) \n\t" - "movq $0, %%r8 \n\t" - "adcq %%rax, %%r9 \n\t" - "adcq %%rdx, %%r8 \n\t" - // swap tmp2, tmp1 - "movq %%r9, %%rax \n\t" - "movq %%r8, %%r9 \n\t" - "movq %%rax, %%r8 \n\t" - // swap end - "movq $1, %%rdx \n\t" - "addq %%rdx, %%r11 \n\t" - "inc %%rbx \n\t" - "cmp %%rbx, %[limbs] \n\t" - "jne 1b \n\t" - "mov %%r11, %%rbx \n\t" - "addq %%r8, (%[res], %%rbx, 8) \n\t" - "adcq %%r9, 8(%[res], %%rbx, 8) \n\t" - "movb $0, %[carry] \n\t" - "jnc 2f \n\t" - "adcq $0, 16(%[res], %%rbx, 8) \n\t" - "movb $1, %[carry] \n\t" - "2: \n\t" - "inc %%r12 \n\t" - "cmpq %[limbs], %%r12 \n\t" - "jne 0b \n\t" - : [carry] "+r"(carry) - : [limbs] "r"(n), [res] "r"(res), [x] "r"(x), [inv] "r"(inv) - : "cc", "memory", "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12"); - // r8, r9 - tmp1, tmp2 - // r10 - k - return carry; - } - - template - int cmp_asm(size_t n, const Limb1 *x, const Limb2 *y) { - int result = 0; - __asm__ volatile( - // Else check result with mod - "mov $0, %[res] \n\t" - "movq %[limbs], %%rbx \n\t" - "1: \n\t" - "movq -8(%[y], %%rbx, 8), %%rax \n\t" - "cmpq %%rax, -8(%[x], %%rbx, 8) \n\t" - "jb 2f \n\t" - "ja 3f \n\t" - "dec %%rbx \n\t" - "jnz 1b \n\t" - "jmp 4f \n\t" - // Start sub - "2: \n\t" - "dec %[res] \n\t" - "jmp 4f \n\t" - "3: \n\t" - "inc %[res] \n\t" - "4: \n\t" - : [res] "=&r"(result) - : [limbs] "r"(n), [x] "r"(x), [y] "r"(y) - : "cc", "memory", "%rax", "%rcx", "%rbx"); - return result; - } - - template - void sub_mod_asm(size_t n, Limb1 *x, const Limb2 *y, const Limb3 *mod) { - __asm__ volatile( - "pushf \n\t" - "movq $0, %%rbx \n\t" - // Start circle sub from 0st limb - "1: \n\t" - "popf \n\t" - "movq (%[y], %%rbx, 8), %%rax \n\t" - "sbbq %%rax, (%[x], %%rbx, 8) \n\t" - "pushf \n\t" - "inc %%rbx \n\t" - "cmp %%rbx, %[limbs] \n\t" - "jne 1b \n\t" - "popf \n\t" - // If it's more than zero (no carry bit) just go to end - "jnc 4f \n\t" - // Else add mod to result - "clc \n\t" - "pushf \n\t" - "movq $0, %%rbx \n\t" - "2: \n\t" - "popf \n\t" - "movq (%[mod], %%rbx, 8), %%rax \n\t" - "adcq %%rax, (%[x], %%rbx, 8) \n\t" - "pushf \n\t" - "inc %%rbx \n\t" - "cmp %%rbx, %[limbs] \n\t" - "jne 2b \n\t" - "popf \n\t" - "4: \n\t" - : - : [limbs] "r"(n), [x] "r"(x), [y] "r"(y), [mod] "r"(mod) - : "cc", "memory", "%rax", "%rcx", "%rbx"); - } - - template - void add_mod_asm(size_t n, Limb1 *x, const Limb2 *y, const Limb3 *mod) { - __asm__ volatile( - "movq (%[y]), %%rax \n\t" - "addq %%rax, (%[x]) \n\t" - "movq $1, %%rbx \n\t" - "pushf \n\t" - // Start circle add from 1st limb - "1: \n\t" - "popf \n\t" - "movq (%[y], %%rbx, 8), %%rax \n\t" - "adcq %%rax, (%[x], %%rbx, 8) \n\t" - "pushf \n\t" - "inc %%rbx \n\t" - "cmp %%rbx, %[limbs] \n\t" - "jne 1b \n\t" - "popf \n\t" - // If was carry, we always need sub mod - "jc 3f \n\t" - - // Else check result with mod - "movq %[limbs], %%rbx \n\t" - "2: \n\t" - "movq -8(%[mod], %%rbx, 8), %%rax \n\t" - "cmpq %%rax, -8(%[x], %%rbx, 8) \n\t" - "jb 5f \n\t" - "ja 3f \n\t" - "dec %%rbx \n\t" - "jnz 2b \n\t" - // Start sub - "3: \n\t" - "movq (%[mod]), %%rax \n\t" - "subq %%rax, (%[x]) \n\t" - "pushf \n\t" - "movq $1, %%rbx \n\t" - // Loop for sub - "4: \n\t" - "popf \n\t" - "movq (%[mod], %%rbx, 8), %%rax \n\t" - "sbbq %%rax, (%[x], %%rbx, 8) \n\t" - "pushf \n\t" - "inc %%rbx \n\t" - "cmp %%rbx, %[limbs] \n\t" - "jne 4b \n\t" - "popf \n\t" - "5: \n\t" - : - : [limbs] "r"(n), [x] "r"(x), [y] "r"(y), [mod] "r"(mod) - : "cc", "memory", "%rax", "%rcx", "%rbx"); - } -#endif - } // namespace backends - } // namespace multiprecision - } // namespace crypto3 -} // namespace nil - -#endif //_MULTIPRECISION_BARRETT_PARAMS_HPP diff --git a/include/nil/crypto3/multiprecision/modular/modular_adaptor.hpp b/include/nil/crypto3/multiprecision/modular/modular_adaptor.hpp index 3535ed3f..2eed84cb 100644 --- a/include/nil/crypto3/multiprecision/modular/modular_adaptor.hpp +++ b/include/nil/crypto3/multiprecision/modular/modular_adaptor.hpp @@ -28,6 +28,8 @@ #include #include +#include + namespace nil { namespace crypto3 { namespace multiprecision { @@ -302,30 +304,15 @@ namespace nil { constexpr void eval_subtract( modular_adaptor, StorageType> &result, const modular_adaptor, StorageType> &o) { + BOOST_ASSERT(result.mod_data().get_mod() == o.mod_data().get_mod()); using ui_type = typename std::tuple_element< 0, typename cpp_int_backend::unsigned_types>::type; using default_ops::eval_lt; -#ifndef BOOST_MP_NO_CONSTEXPR_DETECTION -#if BOOST_ARCH_X86_64 - auto limbs_count = result.base_data().size(); - if (!BOOST_MP_IS_CONST_EVALUATED(result.base_data().limbs()) && - !is_trivial_cpp_int>::value && - result.base_data().size() == o.base_data().size() && - result.base_data().size() == result.mod_data().get_mod().backend().size()) { - - sub_mod_asm(limbs_count, result.base_data().limbs(), o.base_data().limbs(), - result.mod_data().get_mod().backend().limbs()); - result.base_data().resize(limbs_count, limbs_count); - result.base_data().normalize(); - } else -#endif -#endif - { - eval_subtract(result.base_data(), o.base_data()); - if (eval_lt(result.base_data(), ui_type(0u))) { - eval_add(result.base_data(), result.mod_data().get_mod().backend()); - } + + eval_subtract(result.base_data(), o.base_data()); + if (eval_lt(result.base_data(), ui_type(0u))) { + eval_add(result.base_data(), result.mod_data().get_mod().backend()); } } diff --git a/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp b/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp index 0c9f319b..92123cce 100644 --- a/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp +++ b/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp @@ -12,9 +12,10 @@ #define BOOST_MULTIPRECISION_MODULAR_FUNCTIONS_FIXED_PRECISION_HPP #include -#include #include +#include + #include #include @@ -394,6 +395,7 @@ namespace nil { /// result should fit in the output parameter max_precision::value >= max_precision::value>::type> constexpr void montgomery_reduce(Backend1 &result) const { + using default_ops::eval_add; using default_ops::eval_bitwise_and; using default_ops::eval_left_shift; @@ -403,48 +405,24 @@ namespace nil { Backend_doubled_padded_limbs accum(result); Backend_doubled_padded_limbs prod; -#ifndef BOOST_MP_NO_CONSTEXPR_DETECTION -#if BOOST_ARCH_X86_64 - if (!BOOST_MP_IS_CONST_EVALUATED(result.limbs()) && result.size() == m_mod.backend().size() - && !is_trivial_cpp_int::value && result.size() > 1) { - bool carry = - reduce_limb_asm(m_mod.backend().size(), accum.limbs(), m_mod.backend().limbs(), - static_cast(m_montgomery_p_dash)); - if (carry || cmp_asm(m_mod.backend().size(), accum.limbs() + m_mod.backend().size(), - m_mod.backend().limbs()) >= 0) { - sub_asm(m_mod.backend().size(), accum.limbs() + m_mod.backend().size(), - m_mod.backend().limbs()); - } - // Now result in first m_mod.backend().size() limbs, so we can do - // eval_bitwise_and(accum, m_modulus_mask); - // or just copy n limbs to result - for (size_t i = 0; i < m_mod.backend().size(); ++i) { - result.limbs()[i] = accum.limbs()[i + m_mod.backend().size()]; - } - result.resize(m_mod.backend().size(), m_mod.backend().size()); - result.normalize(); - } else -#endif -#endif - { - for (auto i = 0; i < m_mod.backend().size(); ++i) { - eval_multiply(prod, m_mod.backend(), - static_cast(static_cast( - custom_get_limb_value(accum, i) * - /// to prevent overflow error in constexpr - static_cast(m_montgomery_p_dash)))); - eval_left_shift(prod, i * limb_bits); - eval_add(accum, prod); - } - custom_right_shift(accum, m_mod.backend().size() * limb_bits); - if (!eval_lt(accum, m_mod.backend())) { - eval_subtract(accum, m_mod.backend()); - } - if (m_mod.backend().size() < accum.size()) { - accum.resize(m_mod.backend().size(), m_mod.backend().size()); - } - result = accum; + + for (auto i = 0; i < m_mod.backend().size(); ++i) { + eval_multiply(prod, m_mod.backend(), + static_cast(static_cast( + custom_get_limb_value(accum, i) * + /// to prevent overflow error in constexpr + static_cast(m_montgomery_p_dash)))); + eval_left_shift(prod, i * limb_bits); + eval_add(accum, prod); + } + custom_right_shift(accum, m_mod.backend().size() * limb_bits); + if (!eval_lt(accum, m_mod.backend())) { + eval_subtract(accum, m_mod.backend()); + } + if (m_mod.backend().size() < accum.size()) { + accum.resize(m_mod.backend().size(), m_mod.backend().size()); } + result = accum; } template::value >= max_precision::value>::type> constexpr void regular_add(Backend1 &result, const Backend2 &y) const { + using default_ops::eval_add; using default_ops::eval_lt; using default_ops::eval_subtract; @@ -459,25 +438,14 @@ namespace nil { // TODO: maybe reduce input parameters /// input parameters should be lesser than modulus // BOOST_ASSERT(eval_lt(x, m_mod.backend()) && eval_lt(y, m_mod.backend())); -#ifndef BOOST_MP_NO_CONSTEXPR_DETECTION -#if BOOST_ARCH_X86_64 - if (!BOOST_MP_IS_CONST_EVALUATED(result.limbs()) && result.size() == y.size() - && result.size() == m_mod.backend().size() && !is_trivial_cpp_int::value) { - add_mod_asm(limbs_count, result.limbs(), y.limbs(), m_mod.backend().limbs()); - result.resize(limbs_count, limbs_count); - result.normalize(); - } else -#endif -#endif - { - using T = typename policy_type::Backend_padded_limbs_u; - T tmp(result), modulus(m_mod.backend()); - eval_add(tmp, y); - if (!eval_lt(tmp, modulus)) { - eval_subtract(tmp, modulus); - } - result = tmp; + + using T = typename policy_type::Backend_padded_limbs_u; + T tmp(result), modulus(m_mod.backend()); + eval_add(tmp, y); + if (!eval_lt(tmp, modulus)) { + eval_subtract(tmp, modulus); } + result = tmp; } template