Skip to content

Commit

Permalink
[ChaCha] speed improvement and spec conformance (#44)
Browse files Browse the repository at this point in the history
* [ChaCha] speed improvement and spec conformance

* Improve ref speed by 10x removing the use of (fake) simd
* Clearly separate the Bernstein/original version
  from the IETF standard version in terms of counter size
  and nonce handling
* Separate XChaCha from the 'normal' ChaCha context

* add the ability to seek in the chacha stream

* fix example and documentation

* [poly1305] make key size apparent at type level

also switch initialization to be less mutable

* [SALSA] improve performance and tweak APIs

* performance bump by ~ 3x not using the fake simd instructions
* separate salsa and xsalsa
* add some docs

* add MIGRATION GUIDE and CHANGELOG for this work
  • Loading branch information
vincenthz authored Mar 15, 2023
1 parent 7591d41 commit 233dd2d
Show file tree
Hide file tree
Showing 10 changed files with 591 additions and 414 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# unreleased

* Improve performance of Salsa by 3x and Chacha by 10x
* Clearly distinguish at the type level various variants of chacha:
  * Chacha as IETF (recommended)
  * Chacha as original paper (64 bits counters)
  * XChacha
* Distinguish at the type level Salsa and XSalsa

Breaking Changes:

* Chacha, Salsa and Poly1305 interfaces now expect fixed-size array references instead of slices, for stronger type safety
  and fewer runtime errors.
* `Chacha::new_xchacha20::<ROUNDS>()` is now `XChacha::<ROUNDS>::init()`
* `Salsa::new_xsalsa20::<ROUNDS>()` is now `XSalsa::<ROUNDS>::init()`

# 0.4.4

* fix legacy blake2b and blake2s `output_bits` interface returning a value 8 times bigger.
Expand Down
37 changes: 37 additions & 0 deletions MIGRATION_GUIDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
## Changing slice by array reference

For slice to array reference changes of the form:

```
fn function(value: &[u8]) { ... }
```

to:

```
fn function(value: &[u8; 12]) { ... }
```

If the caller already uses an array of the right size,
no changes are needed. When the caller uses a subslice,
the call can be converted as follows, from:

```
fn caller() {
let slice = &[....];
function(&slice[0..12]);
}
```

to:

```
use core::convert::TryFrom; // not-necessary in latest rust edition
fn caller() {
let slice = &[....];
function(<&[u8; 12]>::try_from(&slice[0..12]).unwrap());
}
```

Note that `.unwrap()` is just one way to (not) handle the error; the caller
should integrate the failing `try_from` case with the error handling
conventions of the calling code.
201 changes: 82 additions & 119 deletions src/chacha/reference.rs
Original file line number Diff line number Diff line change
@@ -1,54 +1,23 @@
use crate::cryptoutil::{read_u32_le, write_u32v_le};
use crate::simd::u32x4;

#[derive(Clone)]
pub(crate) struct State<const ROUNDS: usize> {
a: u32x4,
b: u32x4,
c: u32x4,
d: u32x4,
state: [u32; 16],
}

// b row <<< 8, c row <<< 16, d row <<< 24
macro_rules! swizzle {
($b: expr, $c: expr, $d: expr) => {{
let u32x4(b10, b11, b12, b13) = $b;
$b = u32x4(b11, b12, b13, b10);
let u32x4(c10, c11, c12, c13) = $c;
$c = u32x4(c12, c13, c10, c11);
let u32x4(d10, d11, d12, d13) = $d;
$d = u32x4(d13, d10, d11, d12);
}};
// ChaCha quarter round over four mutable integer bindings `$a`, `$b`, `$c`, `$d`.
// It performs four add / xor / rotate-left steps with rotation distances
// 16, 12, 8 and 7; every addition wraps on overflow.
macro_rules! QR {
    ($a:ident, $b:ident, $c:ident, $d:ident) => {
        // a += b; d = (d ^ a) <<< 16
        $a = $a.wrapping_add($b);
        $d ^= $a;
        $d = $d.rotate_left(16);

        // c += d; b = (b ^ c) <<< 12
        $c = $c.wrapping_add($d);
        $b ^= $c;
        $b = $b.rotate_left(12);

        // a += b; d = (d ^ a) <<< 8
        $a = $a.wrapping_add($b);
        $d ^= $a;
        $d = $d.rotate_left(8);

        // c += d; b = (b ^ c) <<< 7
        $c = $c.wrapping_add($d);
        $b ^= $c;
        $b = $b.rotate_left(7);
    };
}

// One add / xor-rotate pass over the four rows `a`, `b`, `c`, `d` of `$state`:
// a += b, d = (d ^ a) rotated by S16; c += d, b = (b ^ c) rotated by S12;
// then the same pair of steps again with S8 and S7.
// NOTE(review): presumably `+` on u32x4 is a lane-wise wrapping add — confirm in crate::simd.
macro_rules! round {
($state: expr) => {{
$state.a = $state.a + $state.b;
rotate!($state.d, $state.a, S16);
$state.c = $state.c + $state.d;
rotate!($state.b, $state.c, S12);
$state.a = $state.a + $state.b;
rotate!($state.d, $state.a, S8);
$state.c = $state.c + $state.d;
rotate!($state.b, $state.c, S7);
}};
}

// XOR `$b` into `$a`, then rotate the result left by `$c`, storing it back in `$a`.
// The rotation is assembled from two shifts: left by `$c` and right by `S32 - $c`.
// NOTE(review): assumes the u32x4 shift and subtraction operators act per lane — confirm in crate::simd.
macro_rules! rotate {
($a: expr, $b: expr, $c:expr) => {{
let v = $a ^ $b;
// complementary right-shift amount for the bits that wrap around
let r = S32 - $c;
let right = v >> r;
$a = (v << $c) ^ right
}};
}

// Shift amounts, replicated across all four lanes, consumed by `rotate!` and `round!`:
// S32 is used to derive the complementary right shift; S16/S12/S8/S7 are the rotation distances.
static S32: u32x4 = u32x4(32, 32, 32, 32);
static S16: u32x4 = u32x4(16, 16, 16, 16);
static S12: u32x4 = u32x4(12, 12, 12, 12);
static S8: u32x4 = u32x4(8, 8, 8, 8);
static S7: u32x4 = u32x4(7, 7, 7, 7);

impl<const ROUNDS: usize> State<ROUNDS> {
// state initialization constant le-32bit array of b"expand 16-byte k"
const CST16: [u32; 4] = [0x61707865, 0x3120646e, 0x79622d36, 0x6b206574];
Expand All @@ -63,110 +32,104 @@ impl<const ROUNDS: usize> State<ROUNDS> {

/// Initialize the state with key and nonce
pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
let (a, b, c) = match key.len() {
16 => Self::init_key16(key),
32 => Self::init_key32(key),
let mut state = [0u32; 16];
match key.len() {
16 => {
state[0] = Self::CST16[0];
state[1] = Self::CST16[1];
state[2] = Self::CST16[2];
state[3] = Self::CST16[3];
}
32 => {
state[0] = Self::CST32[0];
state[1] = Self::CST32[1];
state[2] = Self::CST32[2];
state[3] = Self::CST32[3];
state[4] = read_u32_le(&key[0..4]);
state[5] = read_u32_le(&key[4..8]);
state[6] = read_u32_le(&key[8..12]);
state[7] = read_u32_le(&key[12..16]);
state[8] = read_u32_le(&key[16..20]);
state[9] = read_u32_le(&key[20..24]);
state[10] = read_u32_le(&key[24..28]);
state[11] = read_u32_le(&key[28..32]);
}
_ => unreachable!(),
};
let d = Self::init_nonce(nonce);
Self { a, b, c, d }
}

#[inline]
fn init_key16(key: &[u8]) -> (u32x4, u32x4, u32x4) {
let constant: &[u32; 4] = &Self::CST16;
let c = u32x4(constant[0], constant[1], constant[2], constant[3]);
let k1 = u32x4(
read_u32_le(&key[0..4]),
read_u32_le(&key[4..8]),
read_u32_le(&key[8..12]),
read_u32_le(&key[12..16]),
);
(c, k1, k1)
}

#[inline]
fn init_key32(key: &[u8]) -> (u32x4, u32x4, u32x4) {
let constant: &[u32; 4] = &Self::CST32;
let c = u32x4(constant[0], constant[1], constant[2], constant[3]);
let k1 = u32x4(
read_u32_le(&key[0..4]),
read_u32_le(&key[4..8]),
read_u32_le(&key[8..12]),
read_u32_le(&key[12..16]),
);
let k2 = u32x4(
read_u32_le(&key[16..20]),
read_u32_le(&key[20..24]),
read_u32_le(&key[24..28]),
read_u32_le(&key[28..32]),
);
(c, k1, k2)
}

#[inline]
fn init_nonce(nonce: &[u8]) -> u32x4 {
if nonce.len() == 16 {
u32x4(
read_u32_le(&nonce[0..4]),
read_u32_le(&nonce[4..8]),
read_u32_le(&nonce[8..12]),
read_u32_le(&nonce[12..16]),
)
state[12] = read_u32_le(&nonce[0..4]);
state[13] = read_u32_le(&nonce[4..8]);
state[14] = read_u32_le(&nonce[8..12]);
state[15] = read_u32_le(&nonce[12..16]);
} else if nonce.len() == 12 {
u32x4(
0,
read_u32_le(&nonce[0..4]),
read_u32_le(&nonce[4..8]),
read_u32_le(&nonce[8..12]),
)
// 12 is already set to 0
state[13] = read_u32_le(&nonce[0..4]);
state[14] = read_u32_le(&nonce[4..8]);
state[15] = read_u32_le(&nonce[8..12]);
} else {
u32x4(0, 0, read_u32_le(&nonce[0..4]), read_u32_le(&nonce[4..8]))
// 12 and 13 already set to 0
state[14] = read_u32_le(&nonce[0..4]);
state[15] = read_u32_le(&nonce[4..8]);
}
Self { state }
}

#[inline]
pub(crate) fn rounds(&mut self) {
let [mut x0, mut x1, mut x2, mut x3, mut x4, mut x5, mut x6, mut x7, mut x8, mut x9, mut x10, mut x11, mut x12, mut x13, mut x14, mut x15] =
self.state;

for _ in 0..(ROUNDS / 2) {
round!(self);
swizzle!(self.b, self.c, self.d);
round!(self);
swizzle!(self.d, self.c, self.b);
QR!(x0, x4, x8, x12);
QR!(x1, x5, x9, x13);
QR!(x2, x6, x10, x14);
QR!(x3, x7, x11, x15);

QR!(x0, x5, x10, x15);
QR!(x1, x6, x11, x12);
QR!(x2, x7, x8, x13);
QR!(x3, x4, x9, x14);
}

self.state = [
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,
];
}

/// Overwrite state word 12 (the word that `increment` advances) with `counter`.
#[inline]
pub(crate) fn set_counter(&mut self, counter: u32) {
    let counter_word = &mut self.state[12];
    *counter_word = counter;
}

#[inline]
pub(crate) fn increment(&mut self) {
self.d = self.d + u32x4(1, 0, 0, 0);
self.state[12] = self.state[12].wrapping_add(1);
}

/// Advance the two-word counter held in state words 12 (low) and 13 (high) by one.
///
/// Both words wrap on overflow; a wrap of the low word carries into the high word.
#[inline]
pub(crate) fn increment64(&mut self) {
    let (low, carried) = self.state[12].overflowing_add(1);
    self.state[12] = low;
    if carried {
        // the low word wrapped around to zero: propagate the carry
        self.state[13] = self.state[13].wrapping_add(1);
    }
}

#[inline]
/// Add back the initial state
pub(crate) fn add_back(&mut self, initial: &Self) {
self.a = self.a + initial.a;
self.b = self.b + initial.b;
self.c = self.c + initial.c;
self.d = self.d + initial.d;
for i in 0..16 {
self.state[i] = self.state[i].wrapping_add(initial.state[i]);
}
}

#[inline]
pub(crate) fn output_bytes(&self, output: &mut [u8]) {
let u32x4(a1, a2, a3, a4) = self.a;
let u32x4(b1, b2, b3, b4) = self.b;
let u32x4(c1, c2, c3, c4) = self.c;
let u32x4(d1, d2, d3, d4) = self.d;
write_u32v_le(
output,
&[
a1, a2, a3, a4, b1, b2, b3, b4, c1, c2, c3, c4, d1, d2, d3, d4,
],
);
write_u32v_le(output, &self.state);
}

#[inline]
pub(crate) fn output_ad_bytes(&self, output: &mut [u8; 32]) {
let u32x4(a1, a2, a3, a4) = self.a;
let u32x4(d1, d2, d3, d4) = self.d;
write_u32v_le(&mut output[..], &[a1, a2, a3, a4, d1, d2, d3, d4]);
write_u32v_le(&mut output[0..16], &self.state[0..4]);
write_u32v_le(&mut output[16..32], &self.state[12..16]);
}
}
16 changes: 16 additions & 0 deletions src/chacha/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,24 @@ impl<const ROUNDS: usize> State<ROUNDS> {
}
}

/// Overwrite word 0 of the `d` row with `counter`.
///
/// The row goes through an `Align128` scratch value: word 0 is replaced there and
/// the result is converted back into `self.d`.
/// NOTE(review): presumably `from_m128i` copies the vector into the scratch buffer — confirm.
#[inline]
pub(crate) fn set_counter(&mut self, counter: u32) {
let mut align = Align128::zero();
align.from_m128i(self.d);
align.0[0] = counter;
self.d = align.to_m128i();
}

/// Add one to word 0 of the `d` row, wrapping on overflow; no other word is written here.
///
/// The row goes through an `Align128` scratch value so that a single word can be updated.
/// NOTE(review): presumably `from_m128i` copies the vector into the scratch buffer — confirm.
#[inline]
pub(crate) fn increment(&mut self) {
let mut align = Align128::zero();
align.from_m128i(self.d);
// wrapping add: no carry is propagated into the neighbouring word
align.0[0] = align.0[0].wrapping_add(1);
self.d = align.to_m128i();
}

#[inline]
pub(crate) fn increment64(&mut self) {
let mut align = Align128::zero();
align.from_m128i(self.d);
let (a, overflowed) = align.0[0].overflowing_add(1);
Expand Down
Loading

0 comments on commit 233dd2d

Please sign in to comment.