From 33fd09963bcdbc4ec82b41915c562b3fda0854cb Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Fri, 11 Oct 2024 21:26:01 -0700 Subject: [PATCH] ec: Use 3 fewer squarings for P-256 scalar inversion. Back in 2021 a GitHub user "Nik-U" showed a better addition chain for P-256 scalar inversion. This addition chain is slightly better than that one. --- src/ec/suite_b/ops/p256.rs | 102 ++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 59 deletions(-) diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs index 3d7b53b01c..bb730bd7a9 100644 --- a/src/ec/suite_b/ops/p256.rs +++ b/src/ec/suite_b/ops/p256.rs @@ -170,6 +170,7 @@ pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps { scalar_inv_to_mont: p256_scalar_inv_to_mont, }; +#[allow(clippy::just_underscores_and_digits)] fn p256_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { // Calculate the modular inverse of scalar |a| using Fermat's Little // Theorem: @@ -207,32 +208,29 @@ fn p256_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { binary_op_assign(p256_scalar_mul_mont, acc, b); } - // Indexes into `d`. 
- const B_1: usize = 0; - const B_10: usize = 1; - const B_11: usize = 2; - const B_101: usize = 3; - const B_111: usize = 4; - const B_1111: usize = 5; - const B_10101: usize = 6; - const B_101111: usize = 7; - const DIGIT_COUNT: usize = 8; - - let mut d = [Scalar::zero(); DIGIT_COUNT]; - - d[B_1] = a; - d[B_10] = sqr(&d[B_1]); - d[B_11] = mul(&d[B_10], &d[B_1]); - d[B_101] = mul(&d[B_10], &d[B_11]); - d[B_111] = mul(&d[B_101], &d[B_10]); - let b_1010 = sqr(&d[B_101]); - d[B_1111] = mul(&b_1010, &d[B_101]); - d[B_10101] = sqr_mul(&b_1010, 0 + 1, &d[B_1]); - let b_101010 = sqr(&d[B_10101]); - d[B_101111] = mul(&b_101010, &d[B_101]); - let b_111111 = mul(&b_101010, &d[B_10101]); - - let ff = sqr_mul(&b_111111, 0 + 2, &d[B_11]); + let _1 = &a; + + let _10 = sqr(_1); // 2 + let _100 = sqr(&_10); // 4 + let _101 = mul(&_100, _1); // 5 + let _111 = mul(&_101, &_10); // 7 + + let _1000 = sqr(&_100); // 8 + let _10000 = sqr(&_1000); // 16 + let _100000 = sqr(&_10000); // 32 + + let _100111 = mul(&_111, &_100000); // 39 = 7 + 32 + let _101011 = mul(&_100, &_100111); // 43 = 4 + 39 + let _101111 = mul(&_100, &_101011); // 47 = 4 + 43 + let _1001111 = mul(&_100000, &_101111); // 79 = 32 + 47 + let _86 = sqr(&_101011); // 86 = 43 * 2 + let _1011011 = mul(&_101, &_86); // 91 = 5 + 86 + let _92 = mul(_1, &_1011011); // 92 = 1 + 91 + let _1100011 = mul(&_111, &_92); // 99 = 7 + 92 + let _10111111 = mul(&_92, &_1100011); // 191 = 92 + 99 + let _11011111 = mul(&_100000, &_10111111); // 223 = 32 + 191 + + let ff = mul(&_100000, &_11011111); // 255 = 32 + 223 let ffff = sqr_mul(&ff, 0 + 8, &ff); let ffffffff = sqr_mul(&ffff, 0 + 16, &ffff); @@ -247,39 +245,25 @@ fn p256_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { // 1011110011100110111110101010110110100111000101111001111010000100 // 1111001110111001110010101100001011111100011000110010010101001111 - #[allow(clippy::cast_possible_truncation)] - static REMAINING_WINDOWS: [(u8, u8); 26] = [ - (6, B_101111 as u8), - (2 
+ 3, B_111 as u8), - (2 + 2, B_11 as u8), - (1 + 4, B_1111 as u8), - (5, B_10101 as u8), - (1 + 3, B_101 as u8), - (3, B_101 as u8), - (3, B_101 as u8), - (2 + 3, B_111 as u8), - (3 + 6, B_101111 as u8), - (2 + 4, B_1111 as u8), - (1 + 1, B_1 as u8), - (4 + 1, B_1 as u8), - (2 + 4, B_1111 as u8), - (2 + 3, B_111 as u8), - (1 + 3, B_111 as u8), - (2 + 3, B_111 as u8), - (2 + 3, B_101 as u8), - (1 + 2, B_11 as u8), - (4 + 6, B_101111 as u8), - (2, B_11 as u8), - (3 + 2, B_11 as u8), - (3 + 2, B_11 as u8), - (2 + 1, B_1 as u8), - (2 + 5, B_10101 as u8), - (2 + 4, B_1111 as u8), - ]; - - for &(squarings, digit) in &REMAINING_WINDOWS { - sqr_mul_acc(&mut acc, Limb::from(squarings), &d[usize::from(digit)]); - } + sqr_mul_acc(&mut acc, 6, &_101111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 8, &_11011111); + sqr_mul_acc(&mut acc, 1 + 3, &_101); + sqr_mul_acc(&mut acc, 1 + 7, &_1011011); + sqr_mul_acc(&mut acc, 1 + 6, &_100111); + sqr_mul_acc(&mut acc, 3 + 6, &_101111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 3, &_101); + sqr_mul_acc(&mut acc, 4 + 7, &_1001111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 1 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 6, &_101011); + sqr_mul_acc(&mut acc, 4 + 8, &_10111111); + sqr_mul_acc(&mut acc, 3 + 7, &_1100011); + sqr_mul_acc(&mut acc, 2 + 1, _1); + sqr_mul_acc(&mut acc, 2 + 3, &_101); + sqr_mul_acc(&mut acc, 1 + 7, &_1001111); acc }