From 6ecc4671d73cac6f601ff826f9289cf1a9b1b219 Mon Sep 17 00:00:00 2001 From: macaba Date: Mon, 6 Apr 2020 17:38:07 +0100 Subject: [PATCH 01/59] Initial integration of intrinsics --- src/NaCl.Core/Base/ChaCha20Base.cs | 13 + src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 570 ++++++++++++++++++ src/NaCl.Core/Base/Snuffle.cs | 9 + src/NaCl.Core/NaCl.Core.csproj | 12 +- .../NaCl.Core.Benchmarks/ChaCha20Benchmark.cs | 3 + .../NaCl.Core.Benchmarks.csproj | 4 +- test/NaCl.Core.Benchmarks/Program.cs | 8 +- test/NaCl.Core.Tests/NaCl.Core.Tests.csproj | 2 +- 8 files changed, 610 insertions(+), 11 deletions(-) create mode 100644 src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 57ac702..1cea088 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -64,6 +64,19 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); } +#if INTRINSICS + public override unsafe void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + SetInitialState(state, nonce, initialCounter); + fixed(uint* x = state) + fixed (byte* m = input, c = output.Slice(offset)) + { + ChaCha20BaseIntrinsics.ChaCha20(x, m, c, (ulong)input.Length); + } + } +#endif + /// /// Process a pseudorandom keystream block, converting the key and part of the into a , and the remainder of the . 
/// diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs new file mode 100644 index 0000000..1b260f7 --- /dev/null +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -0,0 +1,570 @@ +#if INTRINSICS +#pragma warning disable IDE0007 // Use implicit type + +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace NaCl.Core.Base +{ + public static class ChaCha20BaseIntrinsics + { + private static Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + private static Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) + { + if (Avx2.IsSupported && bytes >= 512) //Fix the AVX2 section! 
+ { + Vector256 x_0 = Vector256.Create(x[0]); + Vector256 x_1 = Vector256.Create(x[1]); + Vector256 x_2 = Vector256.Create(x[2]); + Vector256 x_3 = Vector256.Create(x[3]); + Vector256 x_4 = Vector256.Create(x[4]); + Vector256 x_5 = Vector256.Create(x[5]); + Vector256 x_6 = Vector256.Create(x[6]); + Vector256 x_7 = Vector256.Create(x[7]); + Vector256 x_8 = Vector256.Create(x[8]); + Vector256 x_9 = Vector256.Create(x[9]); + Vector256 x_10 = Vector256.Create(x[10]); + Vector256 x_11 = Vector256.Create(x[11]); + Vector256 x_12; + Vector256 x_13; + Vector256 x_14 = Vector256.Create(x[14]); + Vector256 x_15 = Vector256.Create(x[15]); + + Vector256 orig0 = x_0; + Vector256 orig1 = x_1; + Vector256 orig2 = x_2; + Vector256 orig3 = x_3; + Vector256 orig4 = x_4; + Vector256 orig5 = x_5; + Vector256 orig6 = x_6; + Vector256 orig7 = x_7; + Vector256 orig8 = x_8; + Vector256 orig9 = x_9; + Vector256 orig10 = x_10; + Vector256 orig11 = x_11; + Vector256 orig12; + Vector256 orig13; + Vector256 orig14 = x_14; + Vector256 orig15 = x_15; + + while (bytes >= 512) + { + Vector256 addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32(); + Vector256 addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32(); + Vector256 permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32(); + Vector256 t12, t13; + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + uint in12 = x[12]; + uint in13 = x[13]; + ulong in1213 = in12 | ((ulong)in13 << 32); + x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32(); + t12 = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32(); + t13 = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32(); + x_12 = Avx2.UnpackLow(t12, t13); + x_13 = Avx2.UnpackHigh(t12, t13); + t12 = Avx2.UnpackLow(x_12, x_13); + t13 = Avx2.UnpackHigh(x_12, x_13); + x_12 = 
Avx2.PermuteVar8x32(t12, permute); + x_13 = Avx2.PermuteVar8x32(t13, permute); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 8; + + x[12] = (uint)(in1213 & 0xFFFFFFFF); + x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF); + for (int i = 0; i < 20; i += 2) + { + Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15); + Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14); + } + + Vector256 t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15; + t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0); + // ONEOCTO enter + OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3); + OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); + t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); + t_4 = Avx2.Permute2x128(x_0, x_4, 0x31); + t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); + t_5 = Avx2.Permute2x128(x_1, x_5, 0x31); + t_2 = Avx2.Permute2x128(x_2, x_6, 0x20); + t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); + t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); + t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); + t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); + t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); + t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); + t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32()); + t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32()); + t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); + t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); + t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_0.AsByte()); + Avx.Store(c + 64, 
t_1.AsByte()); + Avx.Store(c + 128, t_2.AsByte()); + Avx.Store(c + 192, t_3.AsByte()); + Avx.Store(c + 256, t_4.AsByte()); + Avx.Store(c + 320, t_5.AsByte()); + Avx.Store(c + 384, t_6.AsByte()); + Avx.Store(c + 448, t_7.AsByte()); + // ONEOCTO exit + + m += 32; + c += 32; + + // ONEOCTO enter + OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11); + OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15); + t_8 = Avx2.Permute2x128(x_8, x_12, 0x20); + t_12 = Avx2.Permute2x128(x_8, x_12, 0x31); + t_9 = Avx2.Permute2x128(x_9, x_13, 0x20); + t_13 = Avx2.Permute2x128(x_9, x_13, 0x31); + t_10 = Avx2.Permute2x128(x_10, x_14, 0x20); + t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); + t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); + t_15 = Avx2.Permute2x128(x_11, x_15, 0x31); + t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); + t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); + t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); + t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32()); + t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32()); + t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); + t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); + t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_8.AsByte()); + Avx.Store(c + 64, t_9.AsByte()); + Avx.Store(c + 128, t_10.AsByte()); + Avx.Store(c + 192, t_11.AsByte()); + Avx.Store(c + 256, t_12.AsByte()); + Avx.Store(c + 320, t_13.AsByte()); + Avx.Store(c + 384, t_14.AsByte()); + Avx.Store(c + 448, t_15.AsByte()); + // ONEOCTO exit + m -= 32; + c -= 32; + bytes -= 512; + c += 512; + m += 512; + } + } + if (bytes >= 256) + { + Vector128 x_0 = Vector128.Create(x[0]); + Vector128 x_1 = Vector128.Create(x[1]); + Vector128 x_2 = Vector128.Create(x[2]); + Vector128 x_3 = 
Vector128.Create(x[3]); + Vector128 x_4 = Vector128.Create(x[4]); + Vector128 x_5 = Vector128.Create(x[5]); + Vector128 x_6 = Vector128.Create(x[6]); + Vector128 x_7 = Vector128.Create(x[7]); + Vector128 x_8 = Vector128.Create(x[8]); + Vector128 x_9 = Vector128.Create(x[9]); + Vector128 x_10 = Vector128.Create(x[10]); + Vector128 x_11 = Vector128.Create(x[11]); + Vector128 x_12; + Vector128 x_13; + Vector128 x_14 = Vector128.Create(x[14]); + Vector128 x_15 = Vector128.Create(x[15]); + Vector128 orig0 = x_0; + Vector128 orig1 = x_1; + Vector128 orig2 = x_2; + Vector128 orig3 = x_3; + Vector128 orig4 = x_4; + Vector128 orig5 = x_5; + Vector128 orig6 = x_6; + Vector128 orig7 = x_7; + Vector128 orig8 = x_8; + Vector128 orig9 = x_9; + Vector128 orig10 = x_10; + Vector128 orig11 = x_11; + Vector128 orig12; + Vector128 orig13; + Vector128 orig14 = x_14; + Vector128 orig15 = x_15; + Vector128 t12, t13; + + while (bytes >= 256) + { + Vector128 addv12 = Vector128.Create(0, 1).AsUInt32(); + Vector128 addv13 = Vector128.Create(2, 3).AsUInt32(); + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + uint in12 = x[12]; + uint in13 = x[13]; + ulong in1213 = in12 | ((ulong)in13) << 32; + t12 = Vector128.Create(in1213).AsUInt32(); + t13 = Vector128.Create(in1213).AsUInt32(); + + x_12 = Sse2.Add(Vector128.AsUInt64(addv12), Vector128.AsUInt64(t12)).AsUInt32(); + x_13 = Sse2.Add(Vector128.AsUInt64(addv13), Vector128.AsUInt64(t13)).AsUInt32(); + + t12 = Sse2.UnpackLow(x_12, x_13); + t13 = Sse2.UnpackHigh(x_12, x_13); + + x_12 = Sse2.UnpackLow(t12, t13); + x_13 = Sse2.UnpackHigh(t12, t13); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 4; + + x[12] = (uint)(in1213 & 0xFFFFFFFF); + x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF); + + for (int i = 0; i < 20; i += 2) + { + Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref 
x_12); + Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13); + Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14); + Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15); + Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15); + Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12); + Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13); + Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14); + } + OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c); + m += 16; + c += 16; + OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c); + m += 16; + c += 16; + OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c); + m += 16; + c += 16; + OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c); + m -= 48; + c -= 48; + bytes -= 256; + c += 256; + m += 256; + } + } + while (bytes >= 64) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + Vector128 t_1; + + for (int i = 0; i < 20; i += 2) + { + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 147); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 78); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 57); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = 
Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 57); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 78); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 147); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + } + x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); + x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); + x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); + x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); + x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); + x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); + x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); + Sse2.Store(c, x_0.AsByte()); + Sse2.Store(c + 16, x_1.AsByte()); + Sse2.Store(c + 32, x_2.AsByte()); + Sse2.Store(c + 48, x_3.AsByte()); + + uint in12 = x[12]; + uint in13 = x[13]; + in12++; + if (in12 == 0) + { + in13++; + } + x[12] = in12; + x[13] = in13; + + bytes -= 64; + c += 64; + m += 64; + } + if (bytes > 0) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + Vector128 t_1; + for (int i = 0; i < 20; i += 2) + { + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = 
Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 0x93); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 0x4e); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 0x39); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 0x39); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 0x4e); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 0x93); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + } + x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); + x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); + x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); + x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); + byte* partialblock = stackalloc byte[64]; + Sse2.Store(partialblock, Vector128.AsByte(x_0)); + Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + + for (ulong i = 0; i < bytes; i++) + { + c[i] = (byte)(m[i] ^ partialblock[i]); + } + for (int n = 0; n < 64 / sizeof(int); n++) + { + ((int*)partialblock)[n] = 0; + } + } + } + + // 256 byte methods + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) + { + 
Vector128 t_A, t_B, t_C, t_D, t0, t1, t2, t3; + x_A = Sse2.Add(x_A, origA); + x_B = Sse2.Add(x_B, origB); + x_C = Sse2.Add(x_C, origC); + x_D = Sse2.Add(x_D, origD); + t_A = Sse2.UnpackLow(x_A, x_B); + t_B = Sse2.UnpackLow(x_C, x_D); + t_C = Sse2.UnpackHigh(x_A, x_B); + t_D = Sse2.UnpackHigh(x_C, x_D); + x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + Sse2.Store(c, t0.AsByte()); + t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32(); + Sse2.Store(c + 64, t1.AsByte()); + t2 = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128)).AsUInt32(); + Sse2.Store(c + 128, t2.AsByte()); + t3 = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192)).AsUInt32(); + Sse2.Store(c + 192, t3.AsByte()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec128QuarterRound(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D) + { + Vector128 t_A, t_C; + x_A = Sse2.Add(x_A, x_B); + t_A = Sse2.Xor(x_D, x_A); + x_D = Ssse3.Shuffle(t_A.AsByte(), rot16_128).AsUInt32(); + x_C = Sse2.Add(x_C, x_D); + t_C = Sse2.Xor(x_B, x_C); + x_B = Sse2.Or(Sse2.ShiftLeftLogical(t_C, 12), Sse2.ShiftRightLogical(t_C, 20)); + x_A = Sse2.Add(x_A, x_B); + t_A = Sse2.Xor(x_D, x_A); + x_D = Ssse3.Shuffle(t_A.AsByte(), rot8_128).AsUInt32(); + x_C = Sse2.Add(x_C, x_D); + t_C = Sse2.Xor(x_B, x_C); + x_B = Sse2.Or(Sse2.ShiftLeftLogical(t_C, 7), Sse2.ShiftRightLogical(t_C, 25)); + } + + // 512 byte methods + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 Vector256Rotate(Vector256 a, byte imm) => Avx2.Or(Avx2.ShiftLeftLogical(a, imm), Avx2.ShiftRightLogical(a, (byte)(32 - imm))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private 
static void Vec256Round(ref Vector256 A1, ref Vector256 B1, ref Vector256 C1, ref Vector256 D1, ref Vector256 A2, ref Vector256 B2, ref Vector256 C2, ref Vector256 D2, ref Vector256 A3, ref Vector256 B3, ref Vector256 C3, ref Vector256 D3, ref Vector256 A4, ref Vector256 B4, ref Vector256 C4, ref Vector256 D4) + { + Vector256Line1(ref A1, ref B1, ref C1, ref D1); + Vector256Line1(ref A2, ref B2, ref C2, ref D2); + Vector256Line1(ref A3, ref B3, ref C3, ref D3); + Vector256Line1(ref A4, ref B4, ref C4, ref D4); + Vector256Line2(ref A1, ref B1, ref C1, ref D1); + Vector256Line2(ref A2, ref B2, ref C2, ref D2); + Vector256Line2(ref A3, ref B3, ref C3, ref D3); + Vector256Line2(ref A4, ref B4, ref C4, ref D4); + Vector256Line3(ref A1, ref B1, ref C1, ref D1); + Vector256Line3(ref A2, ref B2, ref C2, ref D2); + Vector256Line3(ref A3, ref B3, ref C3, ref D3); + Vector256Line3(ref A4, ref B4, ref C4, ref D4); + Vector256Line4(ref A1, ref B1, ref C1, ref D1); + Vector256Line4(ref A2, ref B2, ref C2, ref D2); + Vector256Line4(ref A3, ref B3, ref C3, ref D3); + Vector256Line4(ref A4, ref B4, ref C4, ref D4); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line1(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_A = Avx2.Add(x_A, x_B); + x_D = Avx2.Shuffle(Avx2.Xor(x_D, x_A).AsByte(), rot16_256).AsUInt32(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line2(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_C = Avx2.Add(x_C, x_D); + x_B = Vector256Rotate(Avx2.Xor(x_B, x_C), 12); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line3(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_A = Avx2.Add(x_A, x_B); + x_D = Avx2.Shuffle(Avx2.Xor(x_D, x_A).AsByte(), rot8_256).AsUInt32(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static 
void Vector256Line4(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_C = Avx2.Add(x_C, x_D); + x_B = Vector256Rotate(Avx2.Xor(x_B, x_C), 7); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D, ref Vector256 t_A, ref Vector256 t_B, ref Vector256 t_C, ref Vector256 t_D, ref Vector256 orig_A, ref Vector256 orig_B, ref Vector256 orig_C, ref Vector256 orig_D) + { + x_A = Avx2.Add(x_A, orig_A); + x_B = Avx2.Add(x_B, orig_B); + x_C = Avx2.Add(x_C, orig_C); + x_D = Avx2.Add(x_D, orig_D); + t_A = Avx2.UnpackLow(x_A, x_B); + t_B = Avx2.UnpackLow(x_C, x_D); + t_C = Avx2.UnpackHigh(x_A, x_B); + t_D = Avx2.UnpackHigh(x_C, x_D); + x_A = Avx2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Avx2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Avx2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Avx2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + } + } +} +#pragma warning restore IDE0007 // Use implicit type +#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index d237860..17ad94d 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -59,6 +59,10 @@ public Snuffle(ReadOnlyMemory key, int initialCounter) /// ByteBuffer. public abstract void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); +#if INTRINSICS + public abstract void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0); +#endif + /// /// The size of the randomly generated nonces. /// ChaCha20 uses 12-byte nonces, but XSalsa20 and XChaCha20 use 24-byte nonces. 
@@ -193,6 +197,10 @@ public virtual void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan no Process(nonce, plaintext, ciphertext); } + +#if INTRINSICS + private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => ProcessStream(nonce, output, input, InitialCounter, offset); +#else /// /// Processes the Encryption/Decryption function. /// @@ -237,6 +245,7 @@ private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan /// Formats the nonce length exception message. diff --git a/src/NaCl.Core/NaCl.Core.csproj b/src/NaCl.Core/NaCl.Core.csproj index 3cc2a11..c77f15f 100644 --- a/src/NaCl.Core/NaCl.Core.csproj +++ b/src/NaCl.Core/NaCl.Core.csproj @@ -1,8 +1,8 @@ - + - netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1 - netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net48 + netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;netcoreapp5.0 + netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;netcoreapp5.0;net45;net48 true 1.2.0 David De Smet @@ -26,10 +26,14 @@ - + FCL_BITOPS + + INTRINSICS + + diff --git a/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs b/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs index a4a1b15..cbe9380 100644 --- a/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs @@ -8,8 +8,11 @@ using Internal; using BenchmarkDotNet.Attributes; + using BenchmarkDotNet.Jobs; [BenchmarkCategory("Stream Cipher")] + [SimpleJob(RuntimeMoniker.NetCoreApp21, baseline: true)] + [SimpleJob(RuntimeMoniker.NetCoreApp31)] [MemoryDiagnoser] [RPlotExporter, RankColumn] public class ChaCha20Benchmark diff --git a/test/NaCl.Core.Benchmarks/NaCl.Core.Benchmarks.csproj b/test/NaCl.Core.Benchmarks/NaCl.Core.Benchmarks.csproj index cca4c1f..a0c58ad 100644 --- a/test/NaCl.Core.Benchmarks/NaCl.Core.Benchmarks.csproj +++ b/test/NaCl.Core.Benchmarks/NaCl.Core.Benchmarks.csproj @@ -1,8 +1,8 @@ - + Exe - netcoreapp3.1 + netcoreapp3.1;netcoreapp2.1 diff --git 
a/test/NaCl.Core.Benchmarks/Program.cs b/test/NaCl.Core.Benchmarks/Program.cs index 1bdc82b..9e0dfc0 100644 --- a/test/NaCl.Core.Benchmarks/Program.cs +++ b/test/NaCl.Core.Benchmarks/Program.cs @@ -11,11 +11,11 @@ static void Main(string[] args) // Execute following code: // $ dotnet run -c release --framework netcoreapp3.1 //BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); - BenchmarkRunner.Run(); + //BenchmarkRunner.Run(); BenchmarkRunner.Run(); - BenchmarkRunner.Run(); - BenchmarkRunner.Run(); - BenchmarkRunner.Run(); + //BenchmarkRunner.Run(); + //BenchmarkRunner.Run(); + //BenchmarkRunner.Run(); Console.ReadLine(); } diff --git a/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj b/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj index bac8f8e..545a2a5 100644 --- a/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj +++ b/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj @@ -1,7 +1,7 @@  - netcoreapp3.1 + netcoreapp3.1;netcoreapp2.1 latest true From 24901e36639bf1643f3c3f864edb57c39a960d43 Mon Sep 17 00:00:00 2001 From: macaba Date: Mon, 6 Apr 2020 17:42:19 +0100 Subject: [PATCH 02/59] Update Program.cs --- test/NaCl.Core.Benchmarks/Program.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/NaCl.Core.Benchmarks/Program.cs b/test/NaCl.Core.Benchmarks/Program.cs index 9e0dfc0..1bdc82b 100644 --- a/test/NaCl.Core.Benchmarks/Program.cs +++ b/test/NaCl.Core.Benchmarks/Program.cs @@ -11,11 +11,11 @@ static void Main(string[] args) // Execute following code: // $ dotnet run -c release --framework netcoreapp3.1 //BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); - //BenchmarkRunner.Run(); + BenchmarkRunner.Run(); BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); + BenchmarkRunner.Run(); + BenchmarkRunner.Run(); + BenchmarkRunner.Run(); Console.ReadLine(); } From 6424c400e48f67cc202bfedbdfd20c7efbf2095c Mon Sep 17 00:00:00 2001 From: macaba Date: Mon, 6 Apr 2020 17:45:34 
+0100 Subject: [PATCH 03/59] Removed 5.0 target to allow CI to build --- src/NaCl.Core/NaCl.Core.csproj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/NaCl.Core/NaCl.Core.csproj b/src/NaCl.Core/NaCl.Core.csproj index c77f15f..df0da33 100644 --- a/src/NaCl.Core/NaCl.Core.csproj +++ b/src/NaCl.Core/NaCl.Core.csproj @@ -1,8 +1,8 @@  - netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;netcoreapp5.0 - netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;netcoreapp5.0;net45;net48 + netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1 + netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net48 true 1.2.0 David De Smet From f1f3460095c05e3933fa6b1f81366310a55030e6 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 8 Oct 2022 10:18:15 +0100 Subject: [PATCH 04/59] Remove treat warning as errors --- src/NaCl.Core/NaCl.Core.csproj | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/NaCl.Core/NaCl.Core.csproj b/src/NaCl.Core/NaCl.Core.csproj index 5fa4576..b64ce1e 100644 --- a/src/NaCl.Core/NaCl.Core.csproj +++ b/src/NaCl.Core/NaCl.Core.csproj @@ -5,7 +5,6 @@ netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net48;net5.0;net6.0 true latest - true @@ -62,6 +61,10 @@ FCL_BITOPS + + INTRINSICS + + preview From b3d9aa6dcf161a12d057bc308d327e01cf98a5e4 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 8 Oct 2022 23:10:25 +0100 Subject: [PATCH 05/59] Alter Dencrypt test --- .../NaCl.Core.Benchmarks/ChaCha20Benchmark.cs | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs b/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs index db2100c..d4430d6 100644 --- a/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs @@ -2,10 +2,8 @@ { using System; using System.Collections.Generic; - using System.Linq; using Base; - using Internal; using BenchmarkDotNet.Attributes; @@ 
-19,6 +17,7 @@ public class ChaCha20Benchmark private Memory key; private Memory nonce; private Memory message; + private Memory cipherText; private ChaCha20 cipher; [Params( @@ -42,6 +41,10 @@ public void Setup() message = new byte[Size]; rnd.NextBytes(message.Span); + cipherText = new byte[Size]; + var c = new ChaCha20(key, 0); + c.Encrypt(message.Span, nonce.Span, cipherText.Span); + cipher = new ChaCha20(key, 0); } @@ -55,32 +58,35 @@ public void Encrypt() [Benchmark] [BenchmarkCategory("Decryption")] - [ArgumentsSource(nameof(TestVectors))] - public void Decrypt(Tests.Vectors.Rfc8439TestVector test) + public void Decrypt() { - var plaintext = new byte[test.CipherText.Length]; - var cipher = new ChaCha20(test.Key, test.InitialCounter); - cipher.Decrypt(test.CipherText, test.Nonce, plaintext); + var plaintext = new byte[cipherText.Length]; + cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); } - public IEnumerable TestVectors() - { - //foreach (var test in Tests.Rfc8439TestVector.Rfc8439TestVectors) - // yield return test; - - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[0]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[1]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[2]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[3]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[4]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[5]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[6]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[7]; - } - - // TODO: Use the encrypt value (from Encrypt method) to benchmark decryption //[Benchmark] //[BenchmarkCategory("Decryption")] - //public byte[] Decrypt(byte[] ciphertext) => cipher.Decrypt(ciphertext); + //[ArgumentsSource(nameof(TestVectors))] + //public void Decrypt(Tests.Vectors.Rfc8439TestVector test) + //{ + // var plaintext = new byte[test.CipherText.Length]; + // var 
cipher = new ChaCha20(test.Key, test.InitialCounter); + // cipher.Decrypt(test.CipherText, test.Nonce, plaintext); + //} + + //public IEnumerable TestVectors() + //{ + // //foreach (var test in Tests.Rfc8439TestVector.Rfc8439TestVectors) + // // yield return test; + + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[0]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[1]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[2]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[3]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[4]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[5]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[6]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[7]; + //} } } From a6b8a4ad92b61f60e4d11116441cd8defdd7382a Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 10 Oct 2022 12:17:12 +0100 Subject: [PATCH 06/59] Add Salsa20 and XSalsa20 benchmarks --- test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs | 83 +++++++++++++++++++ .../XSalsa20Benchmark .cs | 83 +++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs create mode 100644 test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs diff --git a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs new file mode 100644 index 0000000..0b2a27e --- /dev/null +++ b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs @@ -0,0 +1,83 @@ +namespace NaCl.Core.Benchmarks +{ + using System; + using System.Collections.Generic; + + using Base; + + using BenchmarkDotNet.Attributes; + + [BenchmarkCategory("Stream Cipher")] + [MemoryDiagnoser] + [RPlotExporter, RankColumn] + public class Salsa20Benchmark + { + private static readonly Random rnd = new Random(42); + + private Memory key; + private Memory nonce; + private Memory message; + private Memory 
cipherText; + private Salsa20 cipher; + + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB + public int Size { get; set; } + + [GlobalSetup] + public void Setup() + { + key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + rnd.NextBytes(key.Span); + + nonce = new byte[12]; + rnd.NextBytes(nonce.Span); + + message = new byte[Size]; + rnd.NextBytes(message.Span); + + cipherText = new byte[Size]; + var c = new Salsa20(key, 0); + c.Encrypt(message.Span, nonce.Span, cipherText.Span); + + cipher = new Salsa20(key, 0); + } + + [Benchmark] + [BenchmarkCategory("Encryption")] + public void Encrypt() + { + var ciphertext = new byte[message.Length]; + cipher.Encrypt(message.Span, nonce.Span, ciphertext); + } + + [Benchmark] + [BenchmarkCategory("Decryption")] + public void Decrypt() + { + var plaintext = new byte[cipherText.Length]; + cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); + } + + //[Benchmark] + //[BenchmarkCategory("Decryption")] + //[ArgumentsSource(nameof(TestVectors))] + //public void Decrypt(Tests.Vectors.Salsa20TestVector test) + //{ + // var plaintext = new byte[test.CipherText.Length]; + // var cipher = new Salsa20(test.Key, test.InitialCounter); + // cipher.Decrypt(test.CipherText, test.Nonce, plaintext); + //} + + //public IEnumerable TestVectors() + //{ + // //foreach (var test in ParseTestVectors(GetTestVector());) + // // yield return test; + //} + } +} diff --git a/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs b/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs new file mode 100644 index 0000000..4a76567 --- /dev/null +++ b/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs @@ -0,0 +1,83 @@ +namespace NaCl.Core.Benchmarks +{ + using System; + using System.Collections.Generic; + + using Base; + + using BenchmarkDotNet.Attributes; + + [BenchmarkCategory("Stream Cipher")] + 
[MemoryDiagnoser] + [RPlotExporter, RankColumn] + public class XSalsa20Benchmark + { + private static readonly Random rnd = new Random(42); + + private Memory key; + private Memory nonce; + private Memory message; + private Memory cipherText; + private XSalsa20 cipher; + + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB + public int Size { get; set; } + + [GlobalSetup] + public void Setup() + { + key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + rnd.NextBytes(key.Span); + + nonce = new byte[12]; + rnd.NextBytes(nonce.Span); + + message = new byte[Size]; + rnd.NextBytes(message.Span); + + cipherText = new byte[Size]; + var c = new XSalsa20(key, 0); + c.Encrypt(message.Span, nonce.Span, cipherText.Span); + + cipher = new XSalsa20(key, 0); + } + + [Benchmark] + [BenchmarkCategory("Encryption")] + public void Encrypt() + { + var ciphertext = new byte[message.Length]; + cipher.Encrypt(message.Span, nonce.Span, ciphertext); + } + + [Benchmark] + [BenchmarkCategory("Decryption")] + public void Decrypt() + { + var plaintext = new byte[cipherText.Length]; + cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); + } + + //[Benchmark] + //[BenchmarkCategory("Decryption")] + //[ArgumentsSource(nameof(TestVectors))] + //public void Decrypt(Tests.Vectors.Rfc8439TestVector test) + //{ + // var plaintext = new byte[test.CipherText.Length]; + // var cipher = new Salsa20(test.Key, test.InitialCounter); + // cipher.Decrypt(test.CipherText, test.Nonce, plaintext); + //} + + //public IEnumerable TestVectors() + //{ + // //foreach (var test in ParseTestVectors(GetTestVector());) + // // yield return test; + //} + } +} From 565ad2c4a8c12d68deaee5212295e7c50f1d97ec Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 12:59:05 +0100 Subject: [PATCH 07/59] Added back .net 6 support --- 
src/NaCl.Core/NaCl.Core.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/NaCl.Core/NaCl.Core.csproj b/src/NaCl.Core/NaCl.Core.csproj index b64ce1e..a873a24 100644 --- a/src/NaCl.Core/NaCl.Core.csproj +++ b/src/NaCl.Core/NaCl.Core.csproj @@ -61,7 +61,7 @@ FCL_BITOPS - + INTRINSICS From 2184dbaa953359e133819b678692846ffb0c49bf Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 12:59:26 +0100 Subject: [PATCH 08/59] Parameterise Salsa20TestVectors --- test/NaCl.Core.Tests/Salsa20Tests.cs | 35 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/test/NaCl.Core.Tests/Salsa20Tests.cs b/test/NaCl.Core.Tests/Salsa20Tests.cs index 3e2e57b..011f154 100644 --- a/test/NaCl.Core.Tests/Salsa20Tests.cs +++ b/test/NaCl.Core.Tests/Salsa20Tests.cs @@ -15,6 +15,7 @@ using Base; using Internal; using Vectors; + using System.Linq; [Category("CI")] public class Salsa20Tests @@ -262,29 +263,27 @@ public void Salsa20BlockWhenLengthIsInvalidFails() act.Should().Throw(); } - [Fact] - public void Salsa20TestVectors() - { - var tests = ParseTestVectors(GetTestVector()); + public static IEnumerable Salsa20TestData => ParseTestVectors(GetTestVector()).Select(d => new object[] { d }); - foreach (var test in tests) - { - _output.WriteLine($"Salsa20 - {test.Name}"); + [Theory] + [MemberData(nameof(Salsa20TestData))] + public void Salsa20TestVectors(Salsa20TestVector test) + { + _output.WriteLine($"Salsa20 - {test.Name}"); - var input = new byte[512]; - var output = new byte[512]; + var input = new byte[512]; + var output = new byte[512]; - var cipher = new Salsa20(test.Key, 0); - cipher.Encrypt(input, test.IV, output); + var cipher = new Salsa20(test.Key, 0); + cipher.Encrypt(input, test.IV, output); - ToBlock1(output).Should().Be(test.ExpectedBlock1); - ToBlock4(output).Should().Be(test.ExpectedBlock4); - ToBlock5(output).Should().Be(test.ExpectedBlock5); - ToBlock8(output).Should().Be(test.ExpectedBlock8); - } 
+ ToBlock1(output).Should().Be(test.ExpectedBlock1); + ToBlock4(output).Should().Be(test.ExpectedBlock4); + ToBlock5(output).Should().Be(test.ExpectedBlock5); + ToBlock8(output).Should().Be(test.ExpectedBlock8); } - private string GetTestVector() + private static string GetTestVector() { try { @@ -297,7 +296,7 @@ private string GetTestVector() } } - private IList ParseTestVectors(string raw) + private static IList ParseTestVectors(string raw) { var lines = raw.Split(new[] {'\r', '\n'}); From e18036f55a8be10d61bd25a385aaeaf940ec491c Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 13:03:10 +0100 Subject: [PATCH 09/59] Added Intrinsics check --- src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 7 +++- src/NaCl.Core/Base/Snuffle.cs | 34 ++++++++++++-------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index 1b260f7..f994e8d 100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -1,6 +1,7 @@ #if INTRINSICS #pragma warning disable IDE0007 // Use implicit type +using System; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -17,7 +18,10 @@ public static class ChaCha20BaseIntrinsics [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) { - if (Avx2.IsSupported && bytes >= 512) //Fix the AVX2 section! 
+ if (!Sse3.IsSupported) + throw new Exception("Error this vectorisation is not supported on this CPU"); + + if (Avx2.IsSupported && bytes >= 512) { Vector256 x_0 = Vector256.Create(x[0]); Vector256 x_1 = Vector256.Create(x[1]); @@ -73,6 +77,7 @@ public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) x_11 = orig11; x_14 = orig14; x_15 = orig15; + uint in12 = x[12]; uint in13 = x[13]; ulong in1213 = in12 | ((ulong)in13 << 32); diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index dd0812b..506efeb 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -4,6 +4,11 @@ using System.Buffers; using System.Security.Cryptography; +#if INTRINSICS + using System.Runtime.Intrinsics.X86; +#endif + + /// /// Abstract base class for XSalsa20, ChaCha20, XChaCha20 and their variants. /// @@ -113,10 +118,6 @@ public void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan nonce, Spa Process(nonce, plaintext, ciphertext); } - -#if INTRINSICS - private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => ProcessStream(nonce, output, input, InitialCounter, offset); -#else /// /// Processes the Encryption/Decryption function. /// @@ -126,6 +127,14 @@ public void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan nonce, Spa /// The output's starting offset. private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) { +#if INTRINSICS + if (Sse3.IsSupported) + { + ProcessStream(nonce, output, input, InitialCounter, offset); + return; + } +#endif + var length = input.Length; var numBlocks = (length / BlockSizeInBytes) + 1; @@ -159,16 +168,15 @@ private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan - /// Formats the nonce length exception message. - /// - /// The crypto primitive name. - /// The actual nonce length. - /// The expected nonce length. - /// System.String. 
- internal static string FormatNonceLengthExceptionMessage(string name, int actual, int expected) => $"{name} uses {expected * 8}-bit nonces, but got a {actual * 8}-bit nonce. The nonce length in bytes must be {expected}."; + /// + /// Formats the nonce length exception message. + /// + /// The crypto primitive name. + /// The actual nonce length. + /// The expected nonce length. + /// System.String. + internal static string FormatNonceLengthExceptionMessage(string name, int actual, int expected) => $"{name} uses {expected * 8}-bit nonces, but got a {actual * 8}-bit nonce. The nonce length in bytes must be {expected}."; /// /// XOR the specified output. From fdc1e1cd1db6e7473d009364a61d2a44fa98caa6 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 13:03:45 +0100 Subject: [PATCH 10/59] Added intrinsics support for Salsa20 (64 <= is broken) --- src/NaCl.Core/Base/Salsa20Base.cs | 13 + src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 544 ++++++++++++++++++++ 2 files changed, 557 insertions(+) create mode 100644 src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 949c048..dfb091c 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -53,6 +53,19 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); } +#if INTRINSICS + public override unsafe void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + SetInitialState(state, nonce, initialCounter); + fixed (uint* x = state) + fixed (byte* m = input, c = output.Slice(offset)) + { + Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); + } + } +#endif + /// /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . 
/// diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs new file mode 100644 index 0000000..9759a6d --- /dev/null +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -0,0 +1,544 @@ +#if INTRINSICS +#pragma warning disable IDE0007 // Use implicit type + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace NaCl.Core.Base +{ + public static class Salsa20BaseIntrinsics + { + private static Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + private static Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) + { + if (!Sse3.IsSupported) + throw new Exception("Error this vectorisation is not supported on this CPU"); + + if (Avx2.IsSupported && bytes >= 512) + { + Vector256 x_0 = Vector256.Create(x[0]); + Vector256 x_1 = Vector256.Create(x[1]); + Vector256 x_2 = Vector256.Create(x[2]); + Vector256 x_3 = Vector256.Create(x[3]); + Vector256 x_4 = Vector256.Create(x[4]); + Vector256 x_5 = Vector256.Create(x[5]); + Vector256 x_6 = Vector256.Create(x[6]); + Vector256 x_7 = Vector256.Create(x[7]); + Vector256 x_8; + Vector256 x_9; + Vector256 x_10 = Vector256.Create(x[10]); + Vector256 x_11 = Vector256.Create(x[11]); + Vector256 x_12 = Vector256.Create(x[12]); + Vector256 x_13 = Vector256.Create(x[13]); + Vector256 x_14 = Vector256.Create(x[14]); + Vector256 x_15 = 
Vector256.Create(x[15]); + + Vector256 orig0 = x_0; + Vector256 orig1 = x_1; + Vector256 orig2 = x_2; + Vector256 orig3 = x_3; + Vector256 orig4 = x_4; + Vector256 orig5 = x_5; + Vector256 orig6 = x_6; + Vector256 orig7 = x_7; + Vector256 orig8; + Vector256 orig9; + Vector256 orig10 = x_10; + Vector256 orig11 = x_11; + Vector256 orig12 = x_12; + Vector256 orig13 = x_13; + Vector256 orig14 = x_14; + Vector256 orig15 = x_15; + + while (bytes >= 512) + { + Vector256 addv8 = Vector256.Create(0, 1, 2, 3).AsUInt32(); + Vector256 addv9 = Vector256.Create(4, 5, 6, 7).AsUInt32(); + Vector256 permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32(); + Vector256 t8, t9; + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_10 = orig10; + x_11 = orig11; + x_12 = orig12; + x_13 = orig13; + x_14 = orig14; + x_15 = orig15; + + uint in8 = x[8]; + uint in9 = x[9]; + ulong in89 = in8 | ((ulong)in9 << 32); + x_8 = x_9 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in89)).AsUInt32(); + t8 = Avx2.Add(addv8.AsUInt64(), x_8.AsUInt64()).AsUInt32(); + t9 = Avx2.Add(addv9.AsUInt64(), x_9.AsUInt64()).AsUInt32(); + x_8 = Avx2.UnpackLow(t8, t9); + x_9 = Avx2.UnpackHigh(t8, t9); + t8 = Avx2.UnpackLow(x_8, x_9); + t9 = Avx2.UnpackHigh(x_8, x_9); + x_8 = Avx2.PermuteVar8x32(t8, permute); + x_9 = Avx2.PermuteVar8x32(t9, permute); + + orig8 = x_8; + orig9 = x_9; + + in89 += 8; + + x[8] = (uint)(in89 & 0xFFFFFFFF); + x[9] = (uint)((in89 >> 32) & 0xFFFFFFFF); + for (int i = 0; i < 20; i += 2) + { + Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_5, ref x_9, ref x_13, ref x_1, ref x_10, ref x_14, ref x_2, ref x_6, ref x_15, ref x_3, ref x_7, ref x_11); + Vec256Round(ref x_0, ref x_1, ref x_2, ref x_3, ref x_5, ref x_6, ref x_7, ref x_4, ref x_10, ref x_11, ref x_8, ref x_9, ref x_15, ref x_12, ref x_13, ref x_14); + } + + Vector256 t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, 
t_12, t_13, t_14, t_15; + t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0); + // ONEOCTO enter + OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3); + OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); + t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); + t_4 = Avx2.Permute2x128(x_0, x_4, 0x31); + t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); + t_5 = Avx2.Permute2x128(x_1, x_5, 0x31); + t_2 = Avx2.Permute2x128(x_2, x_6, 0x20); + t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); + t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); + t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); + t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); + t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); + t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); + t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32()); + t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32()); + t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); + t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); + t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_0.AsByte()); + Avx.Store(c + 64, t_1.AsByte()); + Avx.Store(c + 128, t_2.AsByte()); + Avx.Store(c + 192, t_3.AsByte()); + Avx.Store(c + 256, t_4.AsByte()); + Avx.Store(c + 320, t_5.AsByte()); + Avx.Store(c + 384, t_6.AsByte()); + Avx.Store(c + 448, t_7.AsByte()); + // ONEOCTO exit + + m += 32; + c += 32; + + // ONEOCTO enter + OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11); + OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15); + t_8 = Avx2.Permute2x128(x_8, x_12, 0x20); + t_12 = Avx2.Permute2x128(x_8, x_12, 0x31); + t_9 = 
Avx2.Permute2x128(x_9, x_13, 0x20); + t_13 = Avx2.Permute2x128(x_9, x_13, 0x31); + t_10 = Avx2.Permute2x128(x_10, x_14, 0x20); + t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); + t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); + t_15 = Avx2.Permute2x128(x_11, x_15, 0x31); + t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); + t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); + t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); + t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32()); + t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32()); + t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); + t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); + t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_8.AsByte()); + Avx.Store(c + 64, t_9.AsByte()); + Avx.Store(c + 128, t_10.AsByte()); + Avx.Store(c + 192, t_11.AsByte()); + Avx.Store(c + 256, t_12.AsByte()); + Avx.Store(c + 320, t_13.AsByte()); + Avx.Store(c + 384, t_14.AsByte()); + Avx.Store(c + 448, t_15.AsByte()); + // ONEOCTO exit + m -= 32; + c -= 32; + bytes -= 512; + c += 512; + m += 512; + } + } + if (bytes >= 256) + { + Vector128 x_0 = Vector128.Create(x[0]); + Vector128 x_1 = Vector128.Create(x[1]); + Vector128 x_2 = Vector128.Create(x[2]); + Vector128 x_3 = Vector128.Create(x[3]); + Vector128 x_4 = Vector128.Create(x[4]); + Vector128 x_5 = Vector128.Create(x[5]); + Vector128 x_6 = Vector128.Create(x[6]); + Vector128 x_7 = Vector128.Create(x[7]); + Vector128 x_8; + Vector128 x_9; + Vector128 x_10 = Vector128.Create(x[10]); + Vector128 x_11 = Vector128.Create(x[11]); + Vector128 x_12 = Vector128.Create(x[12]); + Vector128 x_13 = Vector128.Create(x[13]); + Vector128 x_14 = Vector128.Create(x[14]); + Vector128 x_15 = Vector128.Create(x[15]); + Vector128 orig0 = x_0; + Vector128 orig1 = x_1; + Vector128 orig2 = x_2; + Vector128 orig3 = x_3; + Vector128 orig4 = x_4; + Vector128 orig5 = x_5; + Vector128 orig6 = x_6; + Vector128 orig7 = x_7; 
+ Vector128 orig8; + Vector128 orig9; + Vector128 orig10 = x_10; + Vector128 orig11 = x_11; + Vector128 orig12 = x_12; + Vector128 orig13 = x_13; + Vector128 orig14 = x_14; + Vector128 orig15 = x_15; + Vector128 t8, t9; + + while (bytes >= 256) + { + Vector128 addv8 = Vector128.Create(0, 1).AsUInt32(); + Vector128 addv9 = Vector128.Create(2, 3).AsUInt32(); + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_10 = orig10; + x_11 = orig11; + x_12 = orig12; + x_13 = orig13; + x_14 = orig14; + x_15 = orig15; + + uint in8 = x[8]; + uint in9 = x[9]; + ulong in89 = in8 | ((ulong)in9) << 32; + t8 = Vector128.Create(in89).AsUInt32(); + t9 = Vector128.Create(in89).AsUInt32(); + + x_8 = Sse2.Add(Vector128.AsUInt64(addv8), Vector128.AsUInt64(t8)).AsUInt32(); + x_9 = Sse2.Add(Vector128.AsUInt64(addv9), Vector128.AsUInt64(t9)).AsUInt32(); + + t8 = Sse2.UnpackLow(x_8, x_9); + t9 = Sse2.UnpackHigh(x_8, x_9); + + x_8 = Sse2.UnpackLow(t8, t9); + x_9 = Sse2.UnpackHigh(t8, t9); + + orig8 = x_8; + orig9 = x_9; + + in89 += 4; + + x[8] = (uint)(in89 & 0xFFFFFFFF); + x[9] = (uint)(in89 >> 32 & 0xFFFFFFFF); + + for (int i = 0; i < 20; i += 2) + { + Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12); + Vec128QuarterRound(ref x_5, ref x_9, ref x_13, ref x_1); + Vec128QuarterRound(ref x_10, ref x_14, ref x_2, ref x_6); + Vec128QuarterRound(ref x_15, ref x_3, ref x_7, ref x_11); + + Vec128QuarterRound(ref x_0, ref x_1, ref x_2, ref x_3); + Vec128QuarterRound(ref x_5, ref x_6, ref x_7, ref x_4); + Vec128QuarterRound(ref x_10, ref x_11, ref x_8, ref x_9); + Vec128QuarterRound(ref x_15, ref x_12, ref x_13, ref x_14); + } + OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c); + m += 16; + c += 16; + OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c); + m += 16; + c += 16; + OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref 
orig9, ref orig10, ref orig11, m, c); + m += 16; + c += 16; + OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c); + m -= 48; + c -= 48; + bytes -= 256; + c += 256; + m += 256; + } + } + while (bytes >= 64) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + // TODO: Is SIMDS transposing faster than manual loading/deloading? + //Vector128 x_0 = Vector128.Create(x[0], x[5], x[10], x[15]); + //Vector128 x_1 = Vector128.Create(x[4], x[9], x[14], x[3]); + //Vector128 x_2 = Vector128.Create(x[8], x[13], x[2], x[7]); + //Vector128 x_3 = Vector128.Create(x[12], x[1], x[6], x[11]); + + for (int i = 0; i < 20; i += 2) + { + x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_0, x_3), 7)); + x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_1, x_0), 9)); + x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_2, x_1), 13)); + x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_3, x_2), 18)); + + x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); + x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); + + x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_0, x_1), 7)); + x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_3, x_0), 9)); + x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_2, x_3), 13)); + x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_1, x_2), 18)); + + x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); + x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); + } + x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); + x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); + 
x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); + x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); + + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); + x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); + x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); + Sse2.Store(c, x_0.AsByte()); + Sse2.Store(c + 16, x_1.AsByte()); + Sse2.Store(c + 32, x_2.AsByte()); + Sse2.Store(c + 48, x_3.AsByte()); + + uint in8 = x[8]; + uint in9 = x[9]; + in8++; + if (in8 == 0) + { + in9++; + } + x[8] = in8; + x[9] = in9; + + bytes -= 64; + c += 64; + m += 64; + } + if (bytes > 0) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + // TODO: Is SIMDS transposing faster than manual loading/deloading? 
+ //Vector128 x_0 = Vector128.Create(x[0], x[5], x[10], x[15]); + //Vector128 x_1 = Vector128.Create(x[4], x[9], x[14], x[3]); + //Vector128 x_2 = Vector128.Create(x[8], x[13], x[2], x[7]); + //Vector128 x_3 = Vector128.Create(x[12], x[1], x[6], x[11]); + + for (int i = 0; i < 20; i += 2) + { + x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_0, x_3), 7)); + x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_1, x_0), 9)); + x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_2, x_1), 13)); + x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_3, x_2), 18)); + + x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); + x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); + + x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_0, x_1), 7)); + x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_3, x_0), 9)); + x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_2, x_3), 13)); + x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_1, x_2), 18)); + + x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); + x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); + } + x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); + x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); + x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); + x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); + + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + byte* partialblock = stackalloc byte[64]; + Sse2.Store(partialblock, Vector128.AsByte(x_0)); + Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + + for (ulong i = 0; i < bytes; i++) + { + c[i] = (byte)(m[i] ^ partialblock[i]); + } + for (int n = 0; n < 64 / sizeof(int); n++) + { + ((int*)partialblock)[n] = 0; + } + } + } + + // 
256 byte methods + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); + + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) + { + Vector128 t_A, t_B, t_C, t_D, t0, t1, t2, t3; + x_A = Sse2.Add(x_A, origA); + x_B = Sse2.Add(x_B, origB); + x_C = Sse2.Add(x_C, origC); + x_D = Sse2.Add(x_D, origD); + t_A = Sse2.UnpackLow(x_A, x_B); + t_B = Sse2.UnpackLow(x_C, x_D); + t_C = Sse2.UnpackHigh(x_A, x_B); + t_D = Sse2.UnpackHigh(x_C, x_D); + x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + Sse2.Store(c, t0.AsByte()); + t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32(); + Sse2.Store(c + 64, t1.AsByte()); + t2 = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128)).AsUInt32(); + Sse2.Store(c + 128, t2.AsByte()); + t3 = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192)).AsUInt32(); + Sse2.Store(c + 192, t3.AsByte()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec128QuarterRound(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D) + { + x_B = Sse2.Xor(x_B, Vector128Rotate(Sse2.Add(x_A, x_D), 7)); + x_C = Sse2.Xor(x_C, Vector128Rotate(Sse2.Add(x_B, x_A), 9)); + x_D = Sse2.Xor(x_D, Vector128Rotate(Sse2.Add(x_C, x_B), 13)); + x_A = Sse2.Xor(x_A, Vector128Rotate(Sse2.Add(x_D, x_C), 18)); + } + + // 512 byte methods + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 Vector256Rotate(Vector256 a, byte imm) => Avx2.Or(Avx2.ShiftLeftLogical(a, imm), Avx2.ShiftRightLogical(a, (byte)(32 - imm))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, ref Vector256 C1, ref Vector256 D1, ref Vector256 A2, ref Vector256 B2, ref Vector256 C2, ref Vector256 D2, ref Vector256 A3, ref Vector256 B3, ref Vector256 C3, ref Vector256 D3, ref Vector256 A4, ref Vector256 B4, ref Vector256 C4, ref Vector256 D4) + { + Vector256Line1(ref A1, ref B1, ref C1, ref D1); + Vector256Line1(ref A2, ref B2, ref C2, ref D2); + Vector256Line1(ref A3, ref B3, ref C3, ref D3); + Vector256Line1(ref A4, ref B4, ref C4, ref D4); + Vector256Line2(ref A1, ref B1, ref C1, ref D1); + Vector256Line2(ref A2, ref B2, ref C2, ref D2); + Vector256Line2(ref A3, ref B3, ref C3, ref D3); + Vector256Line2(ref A4, ref B4, ref C4, ref D4); + Vector256Line3(ref A1, ref B1, ref C1, ref D1); + Vector256Line3(ref A2, ref B2, ref C2, ref D2); + Vector256Line3(ref A3, ref B3, ref C3, ref D3); + Vector256Line3(ref A4, ref B4, ref C4, ref D4); + Vector256Line4(ref A1, ref B1, ref C1, ref D1); + Vector256Line4(ref A2, ref B2, ref C2, ref D2); + Vector256Line4(ref A3, ref B3, ref C3, ref D3); + Vector256Line4(ref A4, ref B4, ref C4, ref D4); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line1(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_B = Avx2.Xor(x_B,Vector256Rotate(Avx2.Add(x_A, x_D), 7)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line2(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_C = Avx2.Xor(x_C, Vector256Rotate(Avx2.Add(x_B, x_A), 9)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line3(ref Vector256 x_A, ref Vector256 x_B, 
ref Vector256 x_C, ref Vector256 x_D) + { + x_D = Avx2.Xor(x_D, Vector256Rotate(Avx2.Add(x_C, x_B), 13)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line4(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_A = Avx2.Xor(x_A, Vector256Rotate(Avx2.Add(x_D, x_C), 18)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D, ref Vector256 t_A, ref Vector256 t_B, ref Vector256 t_C, ref Vector256 t_D, ref Vector256 orig_A, ref Vector256 orig_B, ref Vector256 orig_C, ref Vector256 orig_D) + { + x_A = Avx2.Add(x_A, orig_A); + x_B = Avx2.Add(x_B, orig_B); + x_C = Avx2.Add(x_C, orig_C); + x_D = Avx2.Add(x_D, orig_D); + t_A = Avx2.UnpackLow(x_A, x_B); + t_B = Avx2.UnpackLow(x_C, x_D); + t_C = Avx2.UnpackHigh(x_A, x_B); + t_D = Avx2.UnpackHigh(x_C, x_D); + x_A = Avx2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Avx2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Avx2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Avx2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + } + } +} +#pragma warning restore IDE0007 // Use implicit type +#endif \ No newline at end of file From 0b81a0d21b8b5cec8d857e5685d1366157913145 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 15:00:04 +0100 Subject: [PATCH 11/59] Fix Salsa20 intrinsics for bytes >= 64 --- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 146 +++++++++++--------- 1 file changed, 80 insertions(+), 66 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 9759a6d..98af3e2 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -283,26 +283,31 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) } while (bytes >= 64) { - 
Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); - - Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); - // TODO: Is SIMDS transposing faster than manual loading/deloading? - //Vector128 x_0 = Vector128.Create(x[0], x[5], x[10], x[15]); - //Vector128 x_1 = Vector128.Create(x[4], x[9], x[14], x[3]); - //Vector128 x_2 = Vector128.Create(x[8], x[13], x[2], x[7]); - //Vector128 x_3 = Vector128.Create(x[12], x[1], x[6], x[11]); + Vector128 x_0 = Vector128.Create(x[0], x[5], x[10], x[15]); + Vector128 x_1 = Vector128.Create(x[4], x[9], x[14], x[3]); + Vector128 x_2 = Vector128.Create(x[8], x[13], x[2], x[7]); + Vector128 x_3 = Vector128.Create(x[12], x[1], x[6], x[11]); + + //Vector128 x_0 = Sse2.LoadVector128(x); + //Vector128 x_1 = Sse2.LoadVector128(x + 4); + //Vector128 x_2 = Sse2.LoadVector128(x + 8); + //Vector128 x_3 = Sse2.LoadVector128(x + 12); + + //Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + //Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + //Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + //Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + //x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + //x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + //x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + //x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; for (int i = 0; i < 20; i += 2) { @@ -312,6 +317,7 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) 
x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_3, x_2), 18)); x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_0, x_1), 7)); @@ -320,22 +326,28 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_1, x_2), 18)); x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); } - x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); - x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); - x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); - x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); - - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Vector128 t_0 = Sse2.Add(x_0, orig_0); + Vector128 t_1 = Sse2.Add(x_1, orig_1); + Vector128 t_2 = Sse2.Add(x_2, orig_2); + Vector128 t_3 = Sse2.Add(x_3, orig_3); + + x_0 = Vector128.Create(t_0.GetElement(0), t_3.GetElement(1), t_2.GetElement(2), t_1.GetElement(3)); + x_1 = Vector128.Create(t_1.GetElement(0), t_0.GetElement(1), t_3.GetElement(2), t_2.GetElement(3)); + x_2 = Vector128.Create(t_2.GetElement(0), t_1.GetElement(1), t_0.GetElement(2), t_3.GetElement(3)); + x_3 = Vector128.Create(t_3.GetElement(0), t_2.GetElement(1), t_1.GetElement(2), t_0.GetElement(3)); + + //w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + //w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + //w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + //w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + //x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + //x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + //x_2 = Sse2.UnpackLow(w_1, 
w_3).AsUInt32(); + //x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); @@ -362,26 +374,31 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) } if (bytes > 0) { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); - - Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); - // TODO: Is SIMDS transposing faster than manual loading/deloading? - //Vector128 x_0 = Vector128.Create(x[0], x[5], x[10], x[15]); - //Vector128 x_1 = Vector128.Create(x[4], x[9], x[14], x[3]); - //Vector128 x_2 = Vector128.Create(x[8], x[13], x[2], x[7]); - //Vector128 x_3 = Vector128.Create(x[12], x[1], x[6], x[11]); + Vector128 x_0 = Vector128.Create(x[0], x[5], x[10], x[15]); + Vector128 x_1 = Vector128.Create(x[4], x[9], x[14], x[3]); + Vector128 x_2 = Vector128.Create(x[8], x[13], x[2], x[7]); + Vector128 x_3 = Vector128.Create(x[12], x[1], x[6], x[11]); + + //Vector128 x_0 = Sse2.LoadVector128(x); + //Vector128 x_1 = Sse2.LoadVector128(x + 4); + //Vector128 x_2 = Sse2.LoadVector128(x + 8); + //Vector128 x_3 = Sse2.LoadVector128(x + 12); + + //Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + //Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + //Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + //Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + //x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + //x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + //x_2 = 
Sse2.UnpackLow(w_1, w_3).AsUInt32(); + //x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; for (int i = 0; i < 20; i += 2) { @@ -391,6 +408,7 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_3, x_2), 18)); x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_0, x_1), 7)); @@ -399,22 +417,18 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_1, x_2), 18)); x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); } - x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); - x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); - x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); - x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); - - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Vector128 t_0 = Sse2.Add(x_0, orig_0); + Vector128 t_1 = Sse2.Add(x_1, orig_1); + Vector128 t_2 = Sse2.Add(x_2, orig_2); + Vector128 t_3 = Sse2.Add(x_3, orig_3); + + x_0 = Vector128.Create(t_0.GetElement(0), t_3.GetElement(1), t_2.GetElement(2), t_1.GetElement(3)); + x_1 = Vector128.Create(t_1.GetElement(0), t_0.GetElement(1), t_3.GetElement(2), t_2.GetElement(3)); + x_2 = Vector128.Create(t_2.GetElement(0), t_1.GetElement(1), t_0.GetElement(2), t_3.GetElement(3)); + x_3 = Vector128.Create(t_3.GetElement(0), t_2.GetElement(1), t_1.GetElement(2), t_0.GetElement(3)); byte* 
partialblock = stackalloc byte[64]; Sse2.Store(partialblock, Vector128.AsByte(x_0)); From e149e3c336e8206184bfdd7c4a4212afe768fd60 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 17:38:03 +0100 Subject: [PATCH 12/59] Fix benchmark --- test/NaCl.Core.Benchmarks/Program.cs | 2 ++ test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/NaCl.Core.Benchmarks/Program.cs b/test/NaCl.Core.Benchmarks/Program.cs index 7497375..f984f79 100644 --- a/test/NaCl.Core.Benchmarks/Program.cs +++ b/test/NaCl.Core.Benchmarks/Program.cs @@ -14,8 +14,10 @@ static void Main(string[] args) BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); + //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); + //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); Console.ReadLine(); diff --git a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs index 0b2a27e..f0492aa 100644 --- a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs @@ -35,7 +35,7 @@ public void Setup() key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; rnd.NextBytes(key.Span); - nonce = new byte[12]; + nonce = new byte[8]; rnd.NextBytes(nonce.Span); message = new byte[Size]; From f6e16801e77d7eb3a91db3e5db6d4dcf71e81fb1 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 17:38:49 +0100 Subject: [PATCH 13/59] Use transpose rotate transpose trick - 10-20% faster --- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 193 +++++++++++++------- 1 file changed, 126 insertions(+), 67 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 98af3e2..26e3a57 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -283,26 +283,37 @@ public static unsafe void 
Salsa20(uint* x, byte* m, byte* c, ulong bytes) } while (bytes >= 64) { - // TODO: Is SIMDS transposing faster than manual loading/deloading? - Vector128 x_0 = Vector128.Create(x[0], x[5], x[10], x[15]); - Vector128 x_1 = Vector128.Create(x[4], x[9], x[14], x[3]); - Vector128 x_2 = Vector128.Create(x[8], x[13], x[2], x[7]); - Vector128 x_3 = Vector128.Create(x[12], x[1], x[6], x[11]); - - //Vector128 x_0 = Sse2.LoadVector128(x); - //Vector128 x_1 = Sse2.LoadVector128(x + 4); - //Vector128 x_2 = Sse2.LoadVector128(x + 8); - //Vector128 x_3 = Sse2.LoadVector128(x + 12); - - //Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - //Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - //Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - //Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - //x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - //x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - //x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - //x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + // Transpose + Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + // Diagonalize + x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); + x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); + + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 
= Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -329,25 +340,38 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); } - Vector128 t_0 = Sse2.Add(x_0, orig_0); - Vector128 t_1 = Sse2.Add(x_1, orig_1); - Vector128 t_2 = Sse2.Add(x_2, orig_2); - Vector128 t_3 = Sse2.Add(x_3, orig_3); - - x_0 = Vector128.Create(t_0.GetElement(0), t_3.GetElement(1), t_2.GetElement(2), t_1.GetElement(3)); - x_1 = Vector128.Create(t_1.GetElement(0), t_0.GetElement(1), t_3.GetElement(2), t_2.GetElement(3)); - x_2 = Vector128.Create(t_2.GetElement(0), t_1.GetElement(1), t_0.GetElement(2), t_3.GetElement(3)); - x_3 = Vector128.Create(t_3.GetElement(0), t_2.GetElement(1), t_1.GetElement(2), t_0.GetElement(3)); - - //w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - //w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - //w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - //w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - //x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - //x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - //x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - //x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + // Diagonalize + x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); + x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); + + // Transpose + w_0 = 
Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); @@ -374,26 +398,37 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) } if (bytes > 0) { - // TODO: Is SIMDS transposing faster than manual loading/deloading? - Vector128 x_0 = Vector128.Create(x[0], x[5], x[10], x[15]); - Vector128 x_1 = Vector128.Create(x[4], x[9], x[14], x[3]); - Vector128 x_2 = Vector128.Create(x[8], x[13], x[2], x[7]); - Vector128 x_3 = Vector128.Create(x[12], x[1], x[6], x[11]); - - //Vector128 x_0 = Sse2.LoadVector128(x); - //Vector128 x_1 = Sse2.LoadVector128(x + 4); - //Vector128 x_2 = Sse2.LoadVector128(x + 8); - //Vector128 x_3 = Sse2.LoadVector128(x + 12); - - //Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - //Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - //Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - //Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - //x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - //x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - //x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - //x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + // Transpose + Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + 
x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + // Diagonalize + x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); + x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); + + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -420,15 +455,38 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); } - Vector128 t_0 = Sse2.Add(x_0, orig_0); - Vector128 t_1 = Sse2.Add(x_1, orig_1); - Vector128 t_2 = Sse2.Add(x_2, orig_2); - Vector128 t_3 = Sse2.Add(x_3, orig_3); - x_0 = Vector128.Create(t_0.GetElement(0), t_3.GetElement(1), t_2.GetElement(2), t_1.GetElement(3)); - x_1 = Vector128.Create(t_1.GetElement(0), t_0.GetElement(1), t_3.GetElement(2), t_2.GetElement(3)); - x_2 = Vector128.Create(t_2.GetElement(0), t_1.GetElement(1), t_0.GetElement(2), t_3.GetElement(3)); - x_3 = Vector128.Create(t_3.GetElement(0), t_2.GetElement(1), t_1.GetElement(2), t_0.GetElement(3)); + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + + // Diagonalize + x_1 = 
Sse2.Shuffle(x_1, 0b_10_01_00_11); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); + x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); + + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); byte* partialblock = stackalloc byte[64]; Sse2.Store(partialblock, Vector128.AsByte(x_0)); @@ -436,6 +494,7 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + // TODO use vector for (ulong i = 0; i < bytes; i++) { c[i] = (byte)(m[i] ^ partialblock[i]); From cbf3778b81eed7231cc633a48bf641061ea67004 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 17:59:03 +0100 Subject: [PATCH 14/59] Add Transpose method --- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 101 ++++-------------- test/NaCl.Core.Benchmarks/Program.cs | 4 +- test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs | 8 +- 3 files changed, 27 insertions(+), 86 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 26e3a57..63cad0c 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -288,32 +288,14 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) Vector128 x_2 = Sse2.LoadVector128(x + 8); Vector128 x_3 = Sse2.LoadVector128(x + 12); - // Transpose - Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = 
Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -346,32 +328,14 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); x_0 = Sse2.Xor(x_0.AsByte(), 
Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); @@ -403,16 +367,7 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) Vector128 x_2 = Sse2.LoadVector128(x + 8); Vector128 x_3 = Sse2.LoadVector128(x + 12); - // Transpose - Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); @@ -420,15 +375,7 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -461,32 +408,14 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, 
ref x_3); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); byte* partialblock = stackalloc byte[64]; Sse2.Store(partialblock, Vector128.AsByte(x_0)); @@ -510,6 +439,20 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Transpose(ref Vector128 a, ref Vector128 b, ref Vector128 c, ref Vector128 d) + { + var w_0 = Sse2.UnpackLow(a, b).AsUInt64(); + var w_1 = Sse2.UnpackHigh(a, b).AsUInt64(); + var w_2 = Sse2.UnpackLow(c, d).AsUInt64(); + var w_3 = Sse2.UnpackHigh(c, d).AsUInt64(); + + a = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + b = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + c = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + d = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) diff --git a/test/NaCl.Core.Benchmarks/Program.cs b/test/NaCl.Core.Benchmarks/Program.cs index f984f79..3b73a1a 100644 --- a/test/NaCl.Core.Benchmarks/Program.cs +++ b/test/NaCl.Core.Benchmarks/Program.cs @@ -11,10 +11,10 @@ static void 
Main(string[] args) // Execute following code: // $ dotnet run -c release --framework netcoreapp3.1 // $ dotnet run -c release --framework netcoreapp3.1 --filter *XChaCha20Poly1305Benchmark* - BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); + //BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); + BenchmarkRunner.Run(); //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); diff --git a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs index f0492aa..b08d60e 100644 --- a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs @@ -9,7 +9,7 @@ [BenchmarkCategory("Stream Cipher")] [MemoryDiagnoser] - [RPlotExporter, RankColumn] + [RPlotExporter, RankColumn, HtmlExporter] public class Salsa20Benchmark { private static readonly Random rnd = new Random(42); @@ -23,10 +23,8 @@ public class Salsa20Benchmark [Params( (int)1E+2, // 100 bytes (int)1E+3, // 1 000 bytes = 1 KB - (int)1E+4, // 10 000 bytes = 10 KB - (int)1E+5, // 100 000 bytes = 100 KB - (int)1E+6, // 1 000 000 bytes = 1 MB - (int)1E+7)] // 10 000 000 bytes = 10 MB + (int)1E+5 // 100 000 bytes = 100 KB + )] // 10 000 000 bytes = 10 MB public int Size { get; set; } [GlobalSetup] From 3ff01b16c959807bbb9112483aad1009df5e1126 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 18:01:58 +0100 Subject: [PATCH 15/59] Revert "Add Transpose method" This reverts commit cbf3778b81eed7231cc633a48bf641061ea67004. 
--- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 101 ++++++++++++++---- test/NaCl.Core.Benchmarks/Program.cs | 4 +- test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs | 8 +- 3 files changed, 86 insertions(+), 27 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 63cad0c..26e3a57 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -288,14 +288,32 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) Vector128 x_2 = Sse2.LoadVector128(x + 8); Vector128 x_3 = Sse2.LoadVector128(x + 12); - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + // Transpose + Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -328,14 +346,32 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = 
Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); @@ -367,7 +403,16 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) Vector128 x_2 = Sse2.LoadVector128(x + 8); Vector128 x_3 = Sse2.LoadVector128(x + 12); - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + // Transpose + Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); @@ -375,7 +420,15 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); // Transpose - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, 
x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -408,14 +461,32 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + // Transpose + w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); + w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); + w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); + w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); + + x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); byte* partialblock = stackalloc byte[64]; Sse2.Store(partialblock, Vector128.AsByte(x_0)); @@ -439,20 +510,6 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Transpose(ref Vector128 a, ref Vector128 b, 
ref Vector128 c, ref Vector128 d) - { - var w_0 = Sse2.UnpackLow(a, b).AsUInt64(); - var w_1 = Sse2.UnpackHigh(a, b).AsUInt64(); - var w_2 = Sse2.UnpackLow(c, d).AsUInt64(); - var w_3 = Sse2.UnpackHigh(c, d).AsUInt64(); - - a = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - b = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - c = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - d = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) diff --git a/test/NaCl.Core.Benchmarks/Program.cs b/test/NaCl.Core.Benchmarks/Program.cs index 3b73a1a..f984f79 100644 --- a/test/NaCl.Core.Benchmarks/Program.cs +++ b/test/NaCl.Core.Benchmarks/Program.cs @@ -11,10 +11,10 @@ static void Main(string[] args) // Execute following code: // $ dotnet run -c release --framework netcoreapp3.1 // $ dotnet run -c release --framework netcoreapp3.1 --filter *XChaCha20Poly1305Benchmark* - //BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); + BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); - BenchmarkRunner.Run(); + //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); //BenchmarkRunner.Run(); diff --git a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs index b08d60e..f0492aa 100644 --- a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs @@ -9,7 +9,7 @@ [BenchmarkCategory("Stream Cipher")] [MemoryDiagnoser] - [RPlotExporter, RankColumn, HtmlExporter] + [RPlotExporter, RankColumn] public class Salsa20Benchmark { private static readonly Random rnd = new Random(42); @@ -23,8 +23,10 @@ public class Salsa20Benchmark [Params( (int)1E+2, // 100 bytes (int)1E+3, // 1 000 bytes = 1 KB 
- (int)1E+5 // 100 000 bytes = 100 KB - )] // 10 000 000 bytes = 10 MB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB public int Size { get; set; } [GlobalSetup] From d8b341aace9d0ac3ce7b46963a67daff56b47314 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 18:03:29 +0100 Subject: [PATCH 16/59] Add transpose method --- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 103 +++++--------------- 1 file changed, 23 insertions(+), 80 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 26e3a57..8f2f6be 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -288,32 +288,14 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) Vector128 x_2 = Sse2.LoadVector128(x + 8); Vector128 x_3 = Sse2.LoadVector128(x + 12); - // Transpose - Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, 
ref x_1, ref x_2, ref x_3); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -346,32 +328,14 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); @@ -403,16 +367,7 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) Vector128 x_2 = Sse2.LoadVector128(x + 8); Vector128 x_3 = Sse2.LoadVector128(x + 12); - // Transpose - Vector128 w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - Vector128 w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - Vector128 w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - Vector128 w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref 
x_2, ref x_3); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); @@ -420,15 +375,7 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -461,32 +408,14 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); // Diagonalize x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - // Transpose - w_0 = Sse2.UnpackLow(x_0, x_1).AsUInt64(); - w_1 = Sse2.UnpackHigh(x_0, x_1).AsUInt64(); - w_2 = Sse2.UnpackLow(x_2, x_3).AsUInt64(); - w_3 = Sse2.UnpackHigh(x_2, x_3).AsUInt64(); - - x_0 = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - x_1 = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - x_2 = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - x_3 = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); byte* partialblock = stackalloc byte[64]; Sse2.Store(partialblock, Vector128.AsByte(x_0)); @@ -510,6 +439,20 @@ public static unsafe void Salsa20(uint* x, byte* m, 
byte* c, ulong bytes) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Transpose(ref Vector128 a, ref Vector128 b, ref Vector128 c, ref Vector128 d) + { + var w_0 = Sse2.UnpackLow(a, b).AsUInt64(); + var w_1 = Sse2.UnpackHigh(a, b).AsUInt64(); + var w_2 = Sse2.UnpackLow(c, d).AsUInt64(); + var w_3 = Sse2.UnpackHigh(c, d).AsUInt64(); + + a = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + b = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + c = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + d = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) @@ -574,7 +517,7 @@ private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void Vector256Line1(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) { - x_B = Avx2.Xor(x_B,Vector256Rotate(Avx2.Add(x_A, x_D), 7)); + x_B = Avx2.Xor(x_B, Vector256Rotate(Avx2.Add(x_A, x_D), 7)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] From c286d7be559dca9a43eafc005719d84965fb9b83 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 18:03:40 +0100 Subject: [PATCH 17/59] Fix XSalsa benchmark --- test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs b/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs index 4a76567..170111c 100644 --- a/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs +++ b/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs @@ -35,7 +35,7 @@ public void 
Setup() key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; rnd.NextBytes(key.Span); - nonce = new byte[12]; + nonce = new byte[24]; rnd.NextBytes(nonce.Span); message = new byte[Size]; From e770b25a615ee65a257e37cf92dea3bca30f4666 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 18:10:06 +0100 Subject: [PATCH 18/59] Code cleanup --- src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 8 ++++---- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 6 ++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index f994e8d..6ee992e 100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -10,10 +10,10 @@ namespace NaCl.Core.Base { public static class ChaCha20BaseIntrinsics { - private static Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); - private static Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); - private static Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); - private static Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + private static readonly Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static readonly Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + private static readonly Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static readonly Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 
11, 8, 9, 14, 15, 12, 13); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 8f2f6be..48ce27f 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -10,10 +10,8 @@ namespace NaCl.Core.Base { public static class Salsa20BaseIntrinsics { - private static Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); - private static Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); - private static Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); - private static Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + private static readonly Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static readonly Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) From 836ae771adfea48b47ed303b90b353b6704cabde Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 18:17:53 +0100 Subject: [PATCH 19/59] Correct nonce size --- test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs index f0492aa..0b2a27e 100644 --- a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs 
+++ b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs @@ -35,7 +35,7 @@ public void Setup() key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; rnd.NextBytes(key.Span); - nonce = new byte[8]; + nonce = new byte[12]; rnd.NextBytes(nonce.Span); message = new byte[Size]; From e4d35c617cc4469688f87ac65242fb5b1feaea5a Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 18:25:02 +0100 Subject: [PATCH 20/59] Inline pre processor variable --- src/NaCl.Core/NaCl.Core.csproj | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/NaCl.Core/NaCl.Core.csproj b/src/NaCl.Core/NaCl.Core.csproj index a873a24..7fb49d5 100644 --- a/src/NaCl.Core/NaCl.Core.csproj +++ b/src/NaCl.Core/NaCl.Core.csproj @@ -58,11 +58,7 @@ - FCL_BITOPS - - - - INTRINSICS + FCL_BITOPS, INTRINSICS From 288ae8c0949168880a9b1d4be01aca1a92b5b82a Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 11 Oct 2022 20:32:21 +0100 Subject: [PATCH 21/59] Add variable length test for ChaCha and Salsa --- test/NaCl.Core.Tests/ChaCha20Tests.cs | 30 +++++++++++++++++++++++++++ test/NaCl.Core.Tests/Salsa20Tests.cs | 30 +++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/test/NaCl.Core.Tests/ChaCha20Tests.cs b/test/NaCl.Core.Tests/ChaCha20Tests.cs index 2fd3712..1396e09 100644 --- a/test/NaCl.Core.Tests/ChaCha20Tests.cs +++ b/test/NaCl.Core.Tests/ChaCha20Tests.cs @@ -332,6 +332,34 @@ public void ChaCha20TestVector() } } + [Theory] + [InlineData(33)] + [InlineData(64)] + [InlineData(65)] + [InlineData(255)] + [InlineData(256)] + [InlineData(511)] + [InlineData(512)] + [InlineData(1023)] + [InlineData(1024)] + public void CreateVariableLengthCiphers(int size) + { + var input = new byte[size]; + var output = new byte[size]; + + var nonce = new byte[12]; + Array.Fill(nonce, (byte)2); + + var key = new byte[32]; + Array.Fill(key, (byte)1); + + var cipher = new ChaCha20(key, 0); + cipher.Encrypt(input, nonce, output); + var value = Convert.ToHexString(output); + + 
value.Should().Be(LongKeyStream[..(size*2)]); + } + [Fact] public void ChaCha20TestVectorTC8() { @@ -386,5 +414,7 @@ public void ChaCha20TestVectorTC8() CryptoBytes.Combine(block0, block1).Should().Equal(expected); } + + private const string LongKeyStream = "06E1F8D66AC5C75181F3E5ED9FA16AA909A1FB57A4A9B0110C84FCDC0D710880072A4342AF88DEC0138DAF141A3F471C01E77C1FDA90999496D601A36A8C0412E61CF22E8DA3E8DA712DE9F9D38BE4298CB36C0D83AA7DD314841BBDF59644DCD313F9F53B0E06B9D6CB3F0788CE2EE78993D9D27A3EDF0A52589CBB698519D583B68F72F3961AD77C1358394F29B08FE9F98A29F98311723013591E698557A04A73FB277E3E247083444A6C139ADE01BDE3C368C3A484D6824B33C024C0285CBD665D4F2E4DE87BF79565F08FE09766C16639279A243DAE8395F3E0E5D96E711B210355605A5A8E7B50CEA4BA25E4CB0E273488E223CD69FB699BD937A30D33488EF6076192E1ED08758F7F4774E4C0B8E70955D3CAFE790EB40F7725EB87B8BE6BBECDE1E140966973B5B05FDBFBE05C4BC599888693D96AC0C429B75591EF228A243A6EFDBEEEE49F09383AF2D4AFB6305DE60C5D195A44ED646B0CAFCEC5E445562FFFBB56D444C650E2D892FA99BCE78F2EBF866B154FDB110DDF8CAFB7BE4BEA46724B3952906F0C6E81BE7A17E3C95DF350BB970D2C97499924BDCC4EA0E1DE33AA4E62B5C1FC65FFD2728D81A79AE218AE1C639108323C3D22BA1B8C746CAB0CD535C8661CCA4B6B047790EF148A1B9A88CD3CDD8D79389E2F0D9AAAE135B361ED6778A6F6E03186651692F8DABEDF8872939F694C41E2CAD064FF4C537B92AFD0951DF77302749DCDBC9560FCE001DACAAFAA703BDA73007174C549B69EB031324E31BC9F60049E39254146AEB39BEE8A52CAEA1DD31C42346E44EBCC0771A2548D55ABD085323BA69625845F34831E7518F129CB1D80B76D3C94634F38A1226B5E212D917D593838F51D6CC35F87EB500030AB1446D87F6FFC4717B51C619DDAFD75DBA4C25A09C8C961CDA12A9E01203D678AD2ABB4B7D1BED7EBF0C2932DCE5F0C97F9488DD01A7891DC18D5EEFF6129B7942726A5B5110877260E2A78075C666F4410A2F8A2909D03DE0FBE2BFCA2B068B438ADAF767D804BA85278FB930945D15380281C215BC664B6627EE76CBBC8C5355E607721AAAC069B16B78C2F282795E7BF9B6509E7DC36FD2D45A227BF9D20C5E9678A040B63E964817F98B5F4828EB5D66740C595304D08A0A3C5A50EE3B3F99D2269992DD400A5B452A213DCD2579F7A193FC7FE33E498E91203DE19FF9D54BEBDE9E124A17E784430C38110FE3
552861737DE1F2B7678F63417FE2224ED6571D43A8015F6F81362E7B95CB93C86735787F0980B0A3A65549844768EDF0DDEC75A24FA1EF5A26640932F65FF141CAEE2E14506A34E925C21BC268769CD95328675953E79B4B375912434834018ADD9C1832057EE4386C95B6E9407346B4A1582FB3C095E4B0882087DB48F081B5C0DE69ADBC447A6BA2ED6A4F90909911CD3B51ECEC2C6BE6EFE"; } } diff --git a/test/NaCl.Core.Tests/Salsa20Tests.cs b/test/NaCl.Core.Tests/Salsa20Tests.cs index 011f154..db33110 100644 --- a/test/NaCl.Core.Tests/Salsa20Tests.cs +++ b/test/NaCl.Core.Tests/Salsa20Tests.cs @@ -283,6 +283,34 @@ public void Salsa20TestVectors(Salsa20TestVector test) ToBlock8(output).Should().Be(test.ExpectedBlock8); } + [Theory] + [InlineData(33)] + [InlineData(64)] + [InlineData(65)] + [InlineData(255)] + [InlineData(256)] + [InlineData(511)] + [InlineData(512)] + [InlineData(1023)] + [InlineData(1024)] + public void CreateVariableLengthCiphers(int size) + { + var input = new byte[size]; + var output = new byte[size]; + + var nonce = new byte[8]; + Array.Fill(nonce, (byte)2); + + var key = new byte[32]; + Array.Fill(key, (byte)1); + + var cipher = new Salsa20(key, 0); + cipher.Encrypt(input, nonce, output); + var value = Convert.ToHexString(output); + + value.Should().Be(LongKeyStream[..(size*2)]); + } + private static string GetTestVector() { try @@ -358,5 +386,7 @@ string ReadValue(string toFind, int idx, int len) private static string ToBlock5(byte[] output) => CryptoBytes.ToHexStringUpper(output[256..320]); private static string ToBlock8(byte[] output) => CryptoBytes.ToHexStringUpper(output[448..512]); + + private const string LongKeyStream = 
"A3D1F8292CAB0B2096AB2AA26FC59AAF3EE159B39FC6029EF160D82EC80FA110FF958AB802861180EC006F8C8450030024A2D7744BF564C1782F15DB6681144C65A730622A14AE9A4E95F753289A6D2DBBEE47B457B57DB75C009B287BF240EBE02890581E3628BDBCC9B79E93500CA15F6E10D4EBCAAFC2FB936AF2EC05BBCB1610036E840621D7CE53E4A06822D6073EA0FA8943EDFB70E45B4D2525AE4B616BD08B33F23A7E0B6CD501E80B8E80B7423E7C9D5D900AE2194AF0CF4A74D721534063D3F17BC7993B5B3EC20A373F933B43CEB6987934C1456521F098BA0CB1205109F534F80D4EA1767EA9DFC08BED97BE40C539DD37EC24EAE0C68AC1B56DD0189747A4B8278B1E0E5206EAE893C0E45C76751002F38924B8C9A036CFAB9E3D44C1E323BCE43F2C69EB8212994803C1D2AC00C3B8F97DA6D09F29B974E0DF4D6D36C9D2E88C2D7B73AB399C0920A2996A4727272339D991C6BF45CE63C2DEF3FC9C2625F87EA6268C196829BB1F7E659736AF4B0CC2A771FB0962B19005E53DD880879C052556312BA353B51C26D5F5949464EAECE15ACA240E339BF3C581E7D93D220B1C3C0DE87F65B4F340DAB924EB72072211C41B18770230A3A123619006BE5FD4ABAAFD2BFAD0F34D5FB491DEBEBF5CA9EC92D997B5A171482CC6E949C70759A0B8EC64D590B6FFF6500E8425C3AE4178C2EDE996C0003F6FA76A6D90F49D6D3D128C0DE82EA8C7C16415DDD07081940701677C32D5B5E3BB57A93315474C5B648D31AA7AE52FCD63BF22550900077FF5CF6A5F5148B285E34A57A3DA1BEB0662A20C23857CA8D5D1748F654F54F42F30CD413F408A0C7B31F57AD59E9F152DBDEEA3EA9C3DBB3517615735CFF0226E179C4A9149C6477A2903B338AE308300A86D91043E2AA437C5F2A77A49B547B05BD98CEBE49500FF367CE204157BB3EFD182A8A96FCC31025D4C948105F6762F22357446367B87A01FA3F954D52810CBE5C4EEB04C3AE827973E481F3C38EF14A6F0FE3FB2D89969D2CCB0DFB63D7366D91F29DDBF1EB90B136191745B8AC8B8F0AAEF4D3A1C763D63AED1E76CC7B920979CB8163C413273CA1A563C37B925A0251C9AD31363F978437D92437A0D250C7F221C00F2E13CF371554DF191ECDDB46C95659739A1CDC257A067D9251FE89EA328D313C4D7EF8E33614FFC4C615D3195CD6282D82633067C81E1F563DA307B14253CBF0492256A409E3007EB6A4A7BDA694E1FFA9B5106AB9868CC359B976441C7B362C03E501D8B3FBEF98771A41C4DA542DB8DA4761EA3792695288437DEAC50E7B6A62E6D00B7511A5DB0E567090ADDDFCF0521F6DD62F969D5BE89378DB127219C38931A0AEDBCE784C35D4215B09B1F96732615813753B67846E9505DF974F4B
1ECDFBD0C850A9644D720884B80B4FE4CC08508A8A65D1C5F"; } } \ No newline at end of file From e090aa907c27c669bed61fc457d4e1f825ed579d Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 12 Oct 2022 11:23:14 +0100 Subject: [PATCH 22/59] Refactor ChaCha20BaseIntrinsics --- src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 571 +----------------- .../Base/ChaChaIntrinsics/ChaCha256.cs | 170 ++++++ .../Base/ChaChaIntrinsics/ChaCha512.cs | 238 ++++++++ .../Base/ChaChaIntrinsics/ChaCha64.cs | 158 +++++ 4 files changed, 582 insertions(+), 555 deletions(-) create mode 100644 src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs create mode 100644 src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs create mode 100644 src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index 6ee992e..6482d18 100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -1,575 +1,36 @@ #if INTRINSICS -#pragma warning disable IDE0007 // Use implicit type using System; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using NaCl.Core.Base.ChaChaIntrinsics; -namespace NaCl.Core.Base +namespace NaCl.Core.Base; + +public static class ChaCha20BaseIntrinsics { - public static class ChaCha20BaseIntrinsics + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) { - private static readonly Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); - private static readonly Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); - private static readonly Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); - private static readonly Vector256 rot16_256 
= Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) - { - if (!Sse3.IsSupported) - throw new Exception("Error this vectorisation is not supported on this CPU"); - - if (Avx2.IsSupported && bytes >= 512) - { - Vector256 x_0 = Vector256.Create(x[0]); - Vector256 x_1 = Vector256.Create(x[1]); - Vector256 x_2 = Vector256.Create(x[2]); - Vector256 x_3 = Vector256.Create(x[3]); - Vector256 x_4 = Vector256.Create(x[4]); - Vector256 x_5 = Vector256.Create(x[5]); - Vector256 x_6 = Vector256.Create(x[6]); - Vector256 x_7 = Vector256.Create(x[7]); - Vector256 x_8 = Vector256.Create(x[8]); - Vector256 x_9 = Vector256.Create(x[9]); - Vector256 x_10 = Vector256.Create(x[10]); - Vector256 x_11 = Vector256.Create(x[11]); - Vector256 x_12; - Vector256 x_13; - Vector256 x_14 = Vector256.Create(x[14]); - Vector256 x_15 = Vector256.Create(x[15]); - - Vector256 orig0 = x_0; - Vector256 orig1 = x_1; - Vector256 orig2 = x_2; - Vector256 orig3 = x_3; - Vector256 orig4 = x_4; - Vector256 orig5 = x_5; - Vector256 orig6 = x_6; - Vector256 orig7 = x_7; - Vector256 orig8 = x_8; - Vector256 orig9 = x_9; - Vector256 orig10 = x_10; - Vector256 orig11 = x_11; - Vector256 orig12; - Vector256 orig13; - Vector256 orig14 = x_14; - Vector256 orig15 = x_15; - - while (bytes >= 512) - { - Vector256 addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32(); - Vector256 addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32(); - Vector256 permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32(); - Vector256 t12, t13; - x_0 = orig0; - x_1 = orig1; - x_2 = orig2; - x_3 = orig3; - x_4 = orig4; - x_5 = orig5; - x_6 = orig6; - x_7 = orig7; - x_8 = orig8; - x_9 = orig9; - x_10 = orig10; - x_11 = orig11; - x_14 = orig14; - x_15 = orig15; - - uint in12 = x[12]; - uint in13 = x[13]; - ulong in1213 = in12 | 
((ulong)in13 << 32); - x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32(); - t12 = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32(); - t13 = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32(); - x_12 = Avx2.UnpackLow(t12, t13); - x_13 = Avx2.UnpackHigh(t12, t13); - t12 = Avx2.UnpackLow(x_12, x_13); - t13 = Avx2.UnpackHigh(x_12, x_13); - x_12 = Avx2.PermuteVar8x32(t12, permute); - x_13 = Avx2.PermuteVar8x32(t13, permute); - - orig12 = x_12; - orig13 = x_13; - - in1213 += 8; - - x[12] = (uint)(in1213 & 0xFFFFFFFF); - x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF); - for (int i = 0; i < 20; i += 2) - { - Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15); - Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14); - } - - Vector256 t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15; - t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0); - // ONEOCTO enter - OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3); - OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); - t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); - t_4 = Avx2.Permute2x128(x_0, x_4, 0x31); - t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); - t_5 = Avx2.Permute2x128(x_1, x_5, 0x31); - t_2 = Avx2.Permute2x128(x_2, x_6, 0x20); - t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); - t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); - t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); - t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); - t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); - t_2 = 
Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); - t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32()); - t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32()); - t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); - t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); - t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); - Avx.Store(c, t_0.AsByte()); - Avx.Store(c + 64, t_1.AsByte()); - Avx.Store(c + 128, t_2.AsByte()); - Avx.Store(c + 192, t_3.AsByte()); - Avx.Store(c + 256, t_4.AsByte()); - Avx.Store(c + 320, t_5.AsByte()); - Avx.Store(c + 384, t_6.AsByte()); - Avx.Store(c + 448, t_7.AsByte()); - // ONEOCTO exit - - m += 32; - c += 32; - - // ONEOCTO enter - OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11); - OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15); - t_8 = Avx2.Permute2x128(x_8, x_12, 0x20); - t_12 = Avx2.Permute2x128(x_8, x_12, 0x31); - t_9 = Avx2.Permute2x128(x_9, x_13, 0x20); - t_13 = Avx2.Permute2x128(x_9, x_13, 0x31); - t_10 = Avx2.Permute2x128(x_10, x_14, 0x20); - t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); - t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); - t_15 = Avx2.Permute2x128(x_11, x_15, 0x31); - t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); - t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); - t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); - t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32()); - t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32()); - t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); - t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); - t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); - Avx.Store(c, t_8.AsByte()); - Avx.Store(c + 64, t_9.AsByte()); - Avx.Store(c + 128, t_10.AsByte()); - Avx.Store(c + 192, t_11.AsByte()); - 
Avx.Store(c + 256, t_12.AsByte()); - Avx.Store(c + 320, t_13.AsByte()); - Avx.Store(c + 384, t_14.AsByte()); - Avx.Store(c + 448, t_15.AsByte()); - // ONEOCTO exit - m -= 32; - c -= 32; - bytes -= 512; - c += 512; - m += 512; - } - } - if (bytes >= 256) - { - Vector128 x_0 = Vector128.Create(x[0]); - Vector128 x_1 = Vector128.Create(x[1]); - Vector128 x_2 = Vector128.Create(x[2]); - Vector128 x_3 = Vector128.Create(x[3]); - Vector128 x_4 = Vector128.Create(x[4]); - Vector128 x_5 = Vector128.Create(x[5]); - Vector128 x_6 = Vector128.Create(x[6]); - Vector128 x_7 = Vector128.Create(x[7]); - Vector128 x_8 = Vector128.Create(x[8]); - Vector128 x_9 = Vector128.Create(x[9]); - Vector128 x_10 = Vector128.Create(x[10]); - Vector128 x_11 = Vector128.Create(x[11]); - Vector128 x_12; - Vector128 x_13; - Vector128 x_14 = Vector128.Create(x[14]); - Vector128 x_15 = Vector128.Create(x[15]); - Vector128 orig0 = x_0; - Vector128 orig1 = x_1; - Vector128 orig2 = x_2; - Vector128 orig3 = x_3; - Vector128 orig4 = x_4; - Vector128 orig5 = x_5; - Vector128 orig6 = x_6; - Vector128 orig7 = x_7; - Vector128 orig8 = x_8; - Vector128 orig9 = x_9; - Vector128 orig10 = x_10; - Vector128 orig11 = x_11; - Vector128 orig12; - Vector128 orig13; - Vector128 orig14 = x_14; - Vector128 orig15 = x_15; - Vector128 t12, t13; - - while (bytes >= 256) - { - Vector128 addv12 = Vector128.Create(0, 1).AsUInt32(); - Vector128 addv13 = Vector128.Create(2, 3).AsUInt32(); - - x_0 = orig0; - x_1 = orig1; - x_2 = orig2; - x_3 = orig3; - x_4 = orig4; - x_5 = orig5; - x_6 = orig6; - x_7 = orig7; - x_8 = orig8; - x_9 = orig9; - x_10 = orig10; - x_11 = orig11; - x_14 = orig14; - x_15 = orig15; - - uint in12 = x[12]; - uint in13 = x[13]; - ulong in1213 = in12 | ((ulong)in13) << 32; - t12 = Vector128.Create(in1213).AsUInt32(); - t13 = Vector128.Create(in1213).AsUInt32(); - - x_12 = Sse2.Add(Vector128.AsUInt64(addv12), Vector128.AsUInt64(t12)).AsUInt32(); - x_13 = Sse2.Add(Vector128.AsUInt64(addv13), 
Vector128.AsUInt64(t13)).AsUInt32(); - - t12 = Sse2.UnpackLow(x_12, x_13); - t13 = Sse2.UnpackHigh(x_12, x_13); - - x_12 = Sse2.UnpackLow(t12, t13); - x_13 = Sse2.UnpackHigh(t12, t13); - - orig12 = x_12; - orig13 = x_13; - - in1213 += 4; - - x[12] = (uint)(in1213 & 0xFFFFFFFF); - x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF); - - for (int i = 0; i < 20; i += 2) - { - Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12); - Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13); - Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14); - Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15); - Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15); - Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12); - Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13); - Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14); - } - OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c); - m += 16; - c += 16; - OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c); - m += 16; - c += 16; - OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c); - m += 16; - c += 16; - OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c); - m -= 48; - c -= 48; - bytes -= 256; - c += 256; - m += 256; - } - } - while (bytes >= 64) - { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); - Vector128 t_1; - - for (int i = 0; i < 20; i += 2) - { - x_0 = Sse2.Add(x_0, x_1); - x_3 = Sse2.Xor(x_3, x_0); - x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); - - x_2 = Sse2.Add(x_2, x_3); - x_1 = Sse2.Xor(x_1, x_2); - - t_1 = x_1; - x_1 = Sse2.ShiftLeftLogical(x_1, 12); - t_1 = Sse2.ShiftRightLogical(t_1, 20); - x_1 = Sse2.Xor(x_1, t_1); - - x_0 = Sse2.Add(x_0, x_1); - x_3 = Sse2.Xor(x_3, x_0); - 
x_0 = Sse2.Shuffle(x_0, 147); - x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); - - x_2 = Sse2.Add(x_2, x_3); - x_3 = Sse2.Shuffle(x_3, 78); - x_1 = Sse2.Xor(x_1, x_2); - x_2 = Sse2.Shuffle(x_2, 57); - - t_1 = x_1; - x_1 = Sse2.ShiftLeftLogical(x_1, 7); - t_1 = Sse2.ShiftRightLogical(t_1, 25); - x_1 = Sse2.Xor(x_1, t_1); - - x_0 = Sse2.Add(x_0, x_1); - x_3 = Sse2.Xor(x_3, x_0); - x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); - - x_2 = Sse2.Add(x_2, x_3); - x_1 = Sse2.Xor(x_1, x_2); - - t_1 = x_1; - x_1 = Sse2.ShiftLeftLogical(x_1, 12); - t_1 = Sse2.ShiftRightLogical(t_1, 20); - x_1 = Sse2.Xor(x_1, t_1); - - x_0 = Sse2.Add(x_0, x_1); - x_3 = Sse2.Xor(x_3, x_0); - x_0 = Sse2.Shuffle(x_0, 57); - x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + if (!Sse3.IsSupported) + throw new Exception("Error this vectorisation is not supported on this CPU"); - x_2 = Sse2.Add(x_2, x_3); - x_3 = Sse2.Shuffle(x_3, 78); - x_1 = Sse2.Xor(x_1, x_2); - x_2 = Sse2.Shuffle(x_2, 147); - - t_1 = x_1; - x_1 = Sse2.ShiftLeftLogical(x_1, 7); - t_1 = Sse2.ShiftRightLogical(t_1, 25); - x_1 = Sse2.Xor(x_1, t_1); - } - x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); - x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); - x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); - x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); - x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); - x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); - x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); - x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); - Sse2.Store(c, x_0.AsByte()); - Sse2.Store(c + 16, x_1.AsByte()); - Sse2.Store(c + 32, x_2.AsByte()); - Sse2.Store(c + 48, x_3.AsByte()); - - uint in12 = x[12]; - uint in13 = x[13]; - in12++; - if (in12 == 0) - { - in13++; - } - x[12] = in12; - x[13] = in13; - - bytes -= 64; - c += 64; - m += 64; - } - if (bytes > 0) - { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = 
Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); - Vector128 t_1; - for (int i = 0; i < 20; i += 2) - { - x_0 = Sse2.Add(x_0, x_1); - x_3 = Sse2.Xor(x_3, x_0); - x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); - - x_2 = Sse2.Add(x_2, x_3); - x_1 = Sse2.Xor(x_1, x_2); - - t_1 = x_1; - x_1 = Sse2.ShiftLeftLogical(x_1, 12); - t_1 = Sse2.ShiftRightLogical(t_1, 20); - x_1 = Sse2.Xor(x_1, t_1); - - x_0 = Sse2.Add(x_0, x_1); - x_3 = Sse2.Xor(x_3, x_0); - x_0 = Sse2.Shuffle(x_0, 0x93); - x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); - - x_2 = Sse2.Add(x_2, x_3); - x_3 = Sse2.Shuffle(x_3, 0x4e); - x_1 = Sse2.Xor(x_1, x_2); - x_2 = Sse2.Shuffle(x_2, 0x39); - - t_1 = x_1; - x_1 = Sse2.ShiftLeftLogical(x_1, 7); - t_1 = Sse2.ShiftRightLogical(t_1, 25); - x_1 = Sse2.Xor(x_1, t_1); - - x_0 = Sse2.Add(x_0, x_1); - x_3 = Sse2.Xor(x_3, x_0); - x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); - - x_2 = Sse2.Add(x_2, x_3); - x_1 = Sse2.Xor(x_1, x_2); - - t_1 = x_1; - x_1 = Sse2.ShiftLeftLogical(x_1, 12); - t_1 = Sse2.ShiftRightLogical(t_1, 20); - x_1 = Sse2.Xor(x_1, t_1); - - x_0 = Sse2.Add(x_0, x_1); - x_3 = Sse2.Xor(x_3, x_0); - x_0 = Sse2.Shuffle(x_0, 0x39); - x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); - - x_2 = Sse2.Add(x_2, x_3); - x_3 = Sse2.Shuffle(x_3, 0x4e); - x_1 = Sse2.Xor(x_1, x_2); - x_2 = Sse2.Shuffle(x_2, 0x93); - - t_1 = x_1; - x_1 = Sse2.ShiftLeftLogical(x_1, 7); - t_1 = Sse2.ShiftRightLogical(t_1, 25); - x_1 = Sse2.Xor(x_1, t_1); - } - x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); - x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); - x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); - x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); - byte* partialblock = stackalloc byte[64]; - Sse2.Store(partialblock, Vector128.AsByte(x_0)); - Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); - Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); - Sse2.Store(partialblock + 
48, Vector128.AsByte(x_3)); - - for (ulong i = 0; i < bytes; i++) - { - c[i] = (byte)(m[i] ^ partialblock[i]); - } - for (int n = 0; n < 64 / sizeof(int); n++) - { - ((int*)partialblock)[n] = 0; - } - } - } - - // 256 byte methods - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) + if (Avx2.IsSupported && bytes >= 512) { - Vector128 t_A, t_B, t_C, t_D, t0, t1, t2, t3; - x_A = Sse2.Add(x_A, origA); - x_B = Sse2.Add(x_B, origB); - x_C = Sse2.Add(x_C, origC); - x_D = Sse2.Add(x_D, origD); - t_A = Sse2.UnpackLow(x_A, x_B); - t_B = Sse2.UnpackLow(x_C, x_D); - t_C = Sse2.UnpackHigh(x_A, x_B); - t_D = Sse2.UnpackHigh(x_C, x_D); - x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); - x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); - x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); - x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); - t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); - Sse2.Store(c, t0.AsByte()); - t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32(); - Sse2.Store(c + 64, t1.AsByte()); - t2 = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128)).AsUInt32(); - Sse2.Store(c + 128, t2.AsByte()); - t3 = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192)).AsUInt32(); - Sse2.Store(c + 192, t3.AsByte()); + ChaCha512.Process(x, ref m, ref c, ref bytes); } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vec128QuarterRound(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D) - { - Vector128 t_A, t_C; - x_A = Sse2.Add(x_A, x_B); - t_A = Sse2.Xor(x_D, x_A); - x_D = Ssse3.Shuffle(t_A.AsByte(), rot16_128).AsUInt32(); - x_C = Sse2.Add(x_C, x_D); - t_C = Sse2.Xor(x_B, x_C); - x_B = Sse2.Or(Sse2.ShiftLeftLogical(t_C, 12), 
Sse2.ShiftRightLogical(t_C, 20)); - x_A = Sse2.Add(x_A, x_B); - t_A = Sse2.Xor(x_D, x_A); - x_D = Ssse3.Shuffle(t_A.AsByte(), rot8_128).AsUInt32(); - x_C = Sse2.Add(x_C, x_D); - t_C = Sse2.Xor(x_B, x_C); - x_B = Sse2.Or(Sse2.ShiftLeftLogical(t_C, 7), Sse2.ShiftRightLogical(t_C, 25)); - } - - // 512 byte methods - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 Vector256Rotate(Vector256 a, byte imm) => Avx2.Or(Avx2.ShiftLeftLogical(a, imm), Avx2.ShiftRightLogical(a, (byte)(32 - imm))); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, ref Vector256 C1, ref Vector256 D1, ref Vector256 A2, ref Vector256 B2, ref Vector256 C2, ref Vector256 D2, ref Vector256 A3, ref Vector256 B3, ref Vector256 C3, ref Vector256 D3, ref Vector256 A4, ref Vector256 B4, ref Vector256 C4, ref Vector256 D4) - { - Vector256Line1(ref A1, ref B1, ref C1, ref D1); - Vector256Line1(ref A2, ref B2, ref C2, ref D2); - Vector256Line1(ref A3, ref B3, ref C3, ref D3); - Vector256Line1(ref A4, ref B4, ref C4, ref D4); - Vector256Line2(ref A1, ref B1, ref C1, ref D1); - Vector256Line2(ref A2, ref B2, ref C2, ref D2); - Vector256Line2(ref A3, ref B3, ref C3, ref D3); - Vector256Line2(ref A4, ref B4, ref C4, ref D4); - Vector256Line3(ref A1, ref B1, ref C1, ref D1); - Vector256Line3(ref A2, ref B2, ref C2, ref D2); - Vector256Line3(ref A3, ref B3, ref C3, ref D3); - Vector256Line3(ref A4, ref B4, ref C4, ref D4); - Vector256Line4(ref A1, ref B1, ref C1, ref D1); - Vector256Line4(ref A2, ref B2, ref C2, ref D2); - Vector256Line4(ref A3, ref B3, ref C3, ref D3); - Vector256Line4(ref A4, ref B4, ref C4, ref D4); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vector256Line1(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + if (bytes >= 256) { - x_A = Avx2.Add(x_A, x_B); - x_D = Avx2.Shuffle(Avx2.Xor(x_D, x_A).AsByte(), rot16_256).AsUInt32(); + 
ChaCha256.Process(x, ref m, ref c, ref bytes); } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vector256Line2(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) - { - x_C = Avx2.Add(x_C, x_D); - x_B = Vector256Rotate(Avx2.Xor(x_B, x_C), 12); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vector256Line3(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + while (bytes >= 64) { - x_A = Avx2.Add(x_A, x_B); - x_D = Avx2.Shuffle(Avx2.Xor(x_D, x_A).AsByte(), rot8_256).AsUInt32(); + ChaCha64.Process64(x, ref m, ref c, ref bytes); } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vector256Line4(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) - { - x_C = Avx2.Add(x_C, x_D); - x_B = Vector256Rotate(Avx2.Xor(x_B, x_C), 7); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D, ref Vector256 t_A, ref Vector256 t_B, ref Vector256 t_C, ref Vector256 t_D, ref Vector256 orig_A, ref Vector256 orig_B, ref Vector256 orig_C, ref Vector256 orig_D) + if (bytes > 0) { - x_A = Avx2.Add(x_A, orig_A); - x_B = Avx2.Add(x_B, orig_B); - x_C = Avx2.Add(x_C, orig_C); - x_D = Avx2.Add(x_D, orig_D); - t_A = Avx2.UnpackLow(x_A, x_B); - t_B = Avx2.UnpackLow(x_C, x_D); - t_C = Avx2.UnpackHigh(x_A, x_B); - t_D = Avx2.UnpackHigh(x_C, x_D); - x_A = Avx2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); - x_B = Avx2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); - x_C = Avx2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); - x_D = Avx2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + ChaCha64.ProcessVarLength(x, ref m, ref c, ref bytes); } } } -#pragma warning restore IDE0007 // Use implicit type #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs 
b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs new file mode 100644 index 0000000..c5d7022 --- /dev/null +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs @@ -0,0 +1,170 @@ +#if INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace NaCl.Core.Base.ChaChaIntrinsics; + +#pragma warning disable IDE0007 // Use implicit type +internal static class ChaCha256 +{ + private static readonly Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static readonly Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong bytes) + { + Vector128 x_0 = Vector128.Create(x[0]); + Vector128 x_1 = Vector128.Create(x[1]); + Vector128 x_2 = Vector128.Create(x[2]); + Vector128 x_3 = Vector128.Create(x[3]); + Vector128 x_4 = Vector128.Create(x[4]); + Vector128 x_5 = Vector128.Create(x[5]); + Vector128 x_6 = Vector128.Create(x[6]); + Vector128 x_7 = Vector128.Create(x[7]); + Vector128 x_8 = Vector128.Create(x[8]); + Vector128 x_9 = Vector128.Create(x[9]); + Vector128 x_10 = Vector128.Create(x[10]); + Vector128 x_11 = Vector128.Create(x[11]); + Vector128 x_12; + Vector128 x_13; + Vector128 x_14 = Vector128.Create(x[14]); + Vector128 x_15 = Vector128.Create(x[15]); + Vector128 orig0 = x_0; + Vector128 orig1 = x_1; + Vector128 orig2 = x_2; + Vector128 orig3 = x_3; + Vector128 orig4 = x_4; + Vector128 orig5 = x_5; + Vector128 orig6 = x_6; + Vector128 orig7 = x_7; + Vector128 orig8 = x_8; + Vector128 orig9 = x_9; + Vector128 orig10 = x_10; + Vector128 orig11 = x_11; + Vector128 orig12; + Vector128 orig13; + Vector128 orig14 = x_14; + Vector128 orig15 = x_15; + Vector128 t12, t13; + + while (bytes >= 256) + { + Vector128 addv12 = Vector128.Create(0, 1).AsUInt32(); + Vector128 
addv13 = Vector128.Create(2, 3).AsUInt32(); + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + uint in12 = x[12]; + uint in13 = x[13]; + ulong in1213 = in12 | ((ulong)in13) << 32; + t12 = Vector128.Create(in1213).AsUInt32(); + t13 = Vector128.Create(in1213).AsUInt32(); + + x_12 = Sse2.Add(Vector128.AsUInt64(addv12), Vector128.AsUInt64(t12)).AsUInt32(); + x_13 = Sse2.Add(Vector128.AsUInt64(addv13), Vector128.AsUInt64(t13)).AsUInt32(); + + t12 = Sse2.UnpackLow(x_12, x_13); + t13 = Sse2.UnpackHigh(x_12, x_13); + + x_12 = Sse2.UnpackLow(t12, t13); + x_13 = Sse2.UnpackHigh(t12, t13); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 4; + + x[12] = (uint)(in1213 & 0xFFFFFFFF); + x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF); + + for (int i = 0; i < 20; i += 2) + { + Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12); + Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13); + Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14); + Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15); + Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15); + Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12); + Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13); + Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14); + } + + OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c); + m += 16; + c += 16; + OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c); + m += 16; + c += 16; + OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c); + m += 16; + c += 16; + OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c); + m -= 48; + c -= 48; + bytes -= 256; + c += 256; + m += 256; + } + } + + // 256 byte methods + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) + { + Vector128 t_A, t_B, t_C, t_D, t0, t1, t2, t3; + x_A = Sse2.Add(x_A, origA); + x_B = Sse2.Add(x_B, origB); + x_C = Sse2.Add(x_C, origC); + x_D = Sse2.Add(x_D, origD); + t_A = Sse2.UnpackLow(x_A, x_B); + t_B = Sse2.UnpackLow(x_C, x_D); + t_C = Sse2.UnpackHigh(x_A, x_B); + t_D = Sse2.UnpackHigh(x_C, x_D); + x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + Sse2.Store(c, t0.AsByte()); + t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32(); + Sse2.Store(c + 64, t1.AsByte()); + t2 = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128)).AsUInt32(); + Sse2.Store(c + 128, t2.AsByte()); + t3 = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192)).AsUInt32(); + Sse2.Store(c + 192, t3.AsByte()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec128QuarterRound(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D) + { + Vector128 t_A, t_C; + x_A = Sse2.Add(x_A, x_B); + t_A = Sse2.Xor(x_D, x_A); + x_D = Ssse3.Shuffle(t_A.AsByte(), rot16_128).AsUInt32(); + x_C = Sse2.Add(x_C, x_D); + t_C = Sse2.Xor(x_B, x_C); + x_B = Sse2.Or(Sse2.ShiftLeftLogical(t_C, 12), Sse2.ShiftRightLogical(t_C, 20)); + x_A = Sse2.Add(x_A, x_B); + t_A = Sse2.Xor(x_D, x_A); + x_D = Ssse3.Shuffle(t_A.AsByte(), rot8_128).AsUInt32(); + x_C = Sse2.Add(x_C, x_D); + t_C = Sse2.Xor(x_B, x_C); + x_B = Sse2.Or(Sse2.ShiftLeftLogical(t_C, 7), Sse2.ShiftRightLogical(t_C, 25)); + } +} +#pragma warning restore 
IDE0007 // Use implicit type +#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs new file mode 100644 index 0000000..f31550b --- /dev/null +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs @@ -0,0 +1,238 @@ +#if INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace NaCl.Core.Base.ChaChaIntrinsics; + +#pragma warning disable IDE0007 // Use implicit type +internal static class ChaCha512 +{ + private static readonly Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static readonly Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong bytes) + { + Vector256 x_0 = Vector256.Create(x[0]); + Vector256 x_1 = Vector256.Create(x[1]); + Vector256 x_2 = Vector256.Create(x[2]); + Vector256 x_3 = Vector256.Create(x[3]); + Vector256 x_4 = Vector256.Create(x[4]); + Vector256 x_5 = Vector256.Create(x[5]); + Vector256 x_6 = Vector256.Create(x[6]); + Vector256 x_7 = Vector256.Create(x[7]); + Vector256 x_8 = Vector256.Create(x[8]); + Vector256 x_9 = Vector256.Create(x[9]); + Vector256 x_10 = Vector256.Create(x[10]); + Vector256 x_11 = Vector256.Create(x[11]); + Vector256 x_12; + Vector256 x_13; + Vector256 x_14 = Vector256.Create(x[14]); + Vector256 x_15 = Vector256.Create(x[15]); + + Vector256 orig0 = x_0; + Vector256 orig1 = x_1; + Vector256 orig2 = x_2; + Vector256 orig3 = x_3; + Vector256 orig4 = x_4; + Vector256 orig5 = x_5; + Vector256 orig6 = x_6; + Vector256 orig7 = x_7; + Vector256 orig8 = x_8; + Vector256 orig9 = x_9; + Vector256 orig10 = x_10; + 
Vector256 orig11 = x_11; + Vector256 orig12; + Vector256 orig13; + Vector256 orig14 = x_14; + Vector256 orig15 = x_15; + + while (bytes >= 512) + { + Vector256 addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32(); + Vector256 addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32(); + Vector256 permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32(); + Vector256 t12, t13; + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + uint in12 = x[12]; + uint in13 = x[13]; + ulong in1213 = in12 | ((ulong)in13 << 32); + x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32(); + t12 = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32(); + t13 = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32(); + x_12 = Avx2.UnpackLow(t12, t13); + x_13 = Avx2.UnpackHigh(t12, t13); + t12 = Avx2.UnpackLow(x_12, x_13); + t13 = Avx2.UnpackHigh(x_12, x_13); + x_12 = Avx2.PermuteVar8x32(t12, permute); + x_13 = Avx2.PermuteVar8x32(t13, permute); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 8; + + x[12] = (uint)(in1213 & 0xFFFFFFFF); + x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF); + for (int i = 0; i < 20; i += 2) + { + Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15); + Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14); + } + + Vector256 t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15; + t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0); + // ONEOCTO enter + OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, 
ref orig0, ref orig1, ref orig2, ref orig3); + OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); + t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); + t_4 = Avx2.Permute2x128(x_0, x_4, 0x31); + t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); + t_5 = Avx2.Permute2x128(x_1, x_5, 0x31); + t_2 = Avx2.Permute2x128(x_2, x_6, 0x20); + t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); + t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); + t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); + t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); + t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); + t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); + t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32()); + t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32()); + t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); + t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); + t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_0.AsByte()); + Avx.Store(c + 64, t_1.AsByte()); + Avx.Store(c + 128, t_2.AsByte()); + Avx.Store(c + 192, t_3.AsByte()); + Avx.Store(c + 256, t_4.AsByte()); + Avx.Store(c + 320, t_5.AsByte()); + Avx.Store(c + 384, t_6.AsByte()); + Avx.Store(c + 448, t_7.AsByte()); + // ONEOCTO exit + + m += 32; + c += 32; + + // ONEOCTO enter + OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11); + OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15); + t_8 = Avx2.Permute2x128(x_8, x_12, 0x20); + t_12 = Avx2.Permute2x128(x_8, x_12, 0x31); + t_9 = Avx2.Permute2x128(x_9, x_13, 0x20); + t_13 = Avx2.Permute2x128(x_9, x_13, 0x31); + t_10 = Avx2.Permute2x128(x_10, x_14, 0x20); + t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); + t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); + t_15 = Avx2.Permute2x128(x_11, x_15, 
0x31); + t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); + t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); + t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); + t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32()); + t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32()); + t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); + t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); + t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_8.AsByte()); + Avx.Store(c + 64, t_9.AsByte()); + Avx.Store(c + 128, t_10.AsByte()); + Avx.Store(c + 192, t_11.AsByte()); + Avx.Store(c + 256, t_12.AsByte()); + Avx.Store(c + 320, t_13.AsByte()); + Avx.Store(c + 384, t_14.AsByte()); + Avx.Store(c + 448, t_15.AsByte()); + // ONEOCTO exit + m -= 32; + c -= 32; + bytes -= 512; + c += 512; + m += 512; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 Vector256Rotate(Vector256 a, byte imm) => Avx2.Or(Avx2.ShiftLeftLogical(a, imm), Avx2.ShiftRightLogical(a, (byte)(32 - imm))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, ref Vector256 C1, ref Vector256 D1, ref Vector256 A2, ref Vector256 B2, ref Vector256 C2, ref Vector256 D2, ref Vector256 A3, ref Vector256 B3, ref Vector256 C3, ref Vector256 D3, ref Vector256 A4, ref Vector256 B4, ref Vector256 C4, ref Vector256 D4) + { + Vector256Line1(ref A1, ref B1, ref C1, ref D1); + Vector256Line1(ref A2, ref B2, ref C2, ref D2); + Vector256Line1(ref A3, ref B3, ref C3, ref D3); + Vector256Line1(ref A4, ref B4, ref C4, ref D4); + Vector256Line2(ref A1, ref B1, ref C1, ref D1); + Vector256Line2(ref A2, ref B2, ref C2, ref D2); + Vector256Line2(ref A3, ref B3, ref C3, ref D3); + Vector256Line2(ref A4, ref B4, ref C4, ref D4); + Vector256Line3(ref A1, ref B1, ref C1, ref D1); + Vector256Line3(ref A2, ref B2, ref C2, ref D2); + Vector256Line3(ref A3, 
ref B3, ref C3, ref D3); + Vector256Line3(ref A4, ref B4, ref C4, ref D4); + Vector256Line4(ref A1, ref B1, ref C1, ref D1); + Vector256Line4(ref A2, ref B2, ref C2, ref D2); + Vector256Line4(ref A3, ref B3, ref C3, ref D3); + Vector256Line4(ref A4, ref B4, ref C4, ref D4); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line1(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_A = Avx2.Add(x_A, x_B); + x_D = Avx2.Shuffle(Avx2.Xor(x_D, x_A).AsByte(), rot16_256).AsUInt32(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line2(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_C = Avx2.Add(x_C, x_D); + x_B = Vector256Rotate(Avx2.Xor(x_B, x_C), 12); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line3(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_A = Avx2.Add(x_A, x_B); + x_D = Avx2.Shuffle(Avx2.Xor(x_D, x_A).AsByte(), rot8_256).AsUInt32(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line4(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_C = Avx2.Add(x_C, x_D); + x_B = Vector256Rotate(Avx2.Xor(x_B, x_C), 7); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D, ref Vector256 t_A, ref Vector256 t_B, ref Vector256 t_C, ref Vector256 t_D, ref Vector256 orig_A, ref Vector256 orig_B, ref Vector256 orig_C, ref Vector256 orig_D) + { + x_A = Avx2.Add(x_A, orig_A); + x_B = Avx2.Add(x_B, orig_B); + x_C = Avx2.Add(x_C, orig_C); + x_D = Avx2.Add(x_D, orig_D); + t_A = Avx2.UnpackLow(x_A, x_B); + t_B = Avx2.UnpackLow(x_C, x_D); + t_C = Avx2.UnpackHigh(x_A, x_B); + t_D = Avx2.UnpackHigh(x_C, x_D); + x_A = Avx2.UnpackLow(t_A.AsUInt64(), 
t_B.AsUInt64()).AsUInt32(); + x_B = Avx2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Avx2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Avx2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + } +} +#pragma warning restore IDE0007 // Use implicit type +#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs new file mode 100644 index 0000000..192dd45 --- /dev/null +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -0,0 +1,158 @@ +#if INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace NaCl.Core.Base.ChaChaIntrinsics; + +#pragma warning disable IDE0007 // Use implicit type +internal static class ChaCha64 +{ + private static readonly Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static readonly Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong bytes) + { + Vector128 x_0 =Sse2.LoadVector128(x); + Vector128 x_1 =Sse2.LoadVector128(x + 4); + Vector128 x_2 =Sse2.LoadVector128(x + 8); + Vector128 x_3 =Sse2.LoadVector128(x + 12); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); + x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); + x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); + Sse2.Store(c, 
x_0.AsByte()); + Sse2.Store(c + 16, x_1.AsByte()); + Sse2.Store(c + 32, x_2.AsByte()); + Sse2.Store(c + 48, x_3.AsByte()); + + uint in12 = x[12]; + uint in13 = x[13]; + in12++; + if (in12 == 0) + { + in13++; + } + x[12] = in12; + x[13] = in13; + + bytes -= 64; + c += 64; + m += 64; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, ref ulong bytes) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + byte* partialblock = stackalloc byte[64]; + Sse2.Store(partialblock, Vector128.AsByte(x_0)); + Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + + for (ulong i = 0; i x_0, ref Vector128 x_1, ref Vector128 x_2, ref Vector128 x_3) + { + Vector128 t_1; + + for (int i = 0; i < 20; i += 2) + { + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 147); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 78); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 57); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, 
t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 57); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 78); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 147); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + } + } +} +#pragma warning restore IDE0007 // Use implicit type +#endif \ No newline at end of file From b2478acd6278c454eead956c499bb314f8496daa Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 12 Oct 2022 12:30:27 +0100 Subject: [PATCH 23/59] Refactor Salsa20BaseIntrinsics --- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 554 +----------------- .../Base/SalsaIntrinsics/Salsa256.cs | 174 ++++++ .../Base/SalsaIntrinsics/Salsa512.cs | 231 ++++++++ src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 154 +++++ 4 files changed, 575 insertions(+), 538 deletions(-) create mode 100644 src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs create mode 100644 src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs create mode 100644 src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 48ce27f..114722a 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -1,558 +1,36 @@ #if INTRINSICS -#pragma warning disable IDE0007 // Use implicit type using System; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using NaCl.Core.Base.SalsaIntrinsics; -namespace NaCl.Core.Base +namespace 
NaCl.Core.Base; + +public static class Salsa20BaseIntrinsics { - public static class Salsa20BaseIntrinsics + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) { - private static readonly Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); - private static readonly Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) - { - if (!Sse3.IsSupported) - throw new Exception("Error this vectorisation is not supported on this CPU"); - - if (Avx2.IsSupported && bytes >= 512) - { - Vector256 x_0 = Vector256.Create(x[0]); - Vector256 x_1 = Vector256.Create(x[1]); - Vector256 x_2 = Vector256.Create(x[2]); - Vector256 x_3 = Vector256.Create(x[3]); - Vector256 x_4 = Vector256.Create(x[4]); - Vector256 x_5 = Vector256.Create(x[5]); - Vector256 x_6 = Vector256.Create(x[6]); - Vector256 x_7 = Vector256.Create(x[7]); - Vector256 x_8; - Vector256 x_9; - Vector256 x_10 = Vector256.Create(x[10]); - Vector256 x_11 = Vector256.Create(x[11]); - Vector256 x_12 = Vector256.Create(x[12]); - Vector256 x_13 = Vector256.Create(x[13]); - Vector256 x_14 = Vector256.Create(x[14]); - Vector256 x_15 = Vector256.Create(x[15]); - - Vector256 orig0 = x_0; - Vector256 orig1 = x_1; - Vector256 orig2 = x_2; - Vector256 orig3 = x_3; - Vector256 orig4 = x_4; - Vector256 orig5 = x_5; - Vector256 orig6 = x_6; - Vector256 orig7 = x_7; - Vector256 orig8; - Vector256 orig9; - Vector256 orig10 = x_10; - Vector256 orig11 = x_11; - Vector256 orig12 = x_12; - Vector256 orig13 = x_13; - Vector256 orig14 = x_14; - Vector256 orig15 = x_15; - - while (bytes >= 512) - { - Vector256 addv8 = Vector256.Create(0, 1, 2, 
3).AsUInt32(); - Vector256 addv9 = Vector256.Create(4, 5, 6, 7).AsUInt32(); - Vector256 permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32(); - Vector256 t8, t9; - x_0 = orig0; - x_1 = orig1; - x_2 = orig2; - x_3 = orig3; - x_4 = orig4; - x_5 = orig5; - x_6 = orig6; - x_7 = orig7; - x_10 = orig10; - x_11 = orig11; - x_12 = orig12; - x_13 = orig13; - x_14 = orig14; - x_15 = orig15; - - uint in8 = x[8]; - uint in9 = x[9]; - ulong in89 = in8 | ((ulong)in9 << 32); - x_8 = x_9 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in89)).AsUInt32(); - t8 = Avx2.Add(addv8.AsUInt64(), x_8.AsUInt64()).AsUInt32(); - t9 = Avx2.Add(addv9.AsUInt64(), x_9.AsUInt64()).AsUInt32(); - x_8 = Avx2.UnpackLow(t8, t9); - x_9 = Avx2.UnpackHigh(t8, t9); - t8 = Avx2.UnpackLow(x_8, x_9); - t9 = Avx2.UnpackHigh(x_8, x_9); - x_8 = Avx2.PermuteVar8x32(t8, permute); - x_9 = Avx2.PermuteVar8x32(t9, permute); - - orig8 = x_8; - orig9 = x_9; - - in89 += 8; - - x[8] = (uint)(in89 & 0xFFFFFFFF); - x[9] = (uint)((in89 >> 32) & 0xFFFFFFFF); - for (int i = 0; i < 20; i += 2) - { - Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_5, ref x_9, ref x_13, ref x_1, ref x_10, ref x_14, ref x_2, ref x_6, ref x_15, ref x_3, ref x_7, ref x_11); - Vec256Round(ref x_0, ref x_1, ref x_2, ref x_3, ref x_5, ref x_6, ref x_7, ref x_4, ref x_10, ref x_11, ref x_8, ref x_9, ref x_15, ref x_12, ref x_13, ref x_14); - } - - Vector256 t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15; - t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0); - // ONEOCTO enter - OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3); - OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); - t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); - t_4 = 
Avx2.Permute2x128(x_0, x_4, 0x31); - t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); - t_5 = Avx2.Permute2x128(x_1, x_5, 0x31); - t_2 = Avx2.Permute2x128(x_2, x_6, 0x20); - t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); - t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); - t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); - t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); - t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); - t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); - t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32()); - t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32()); - t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); - t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); - t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); - Avx.Store(c, t_0.AsByte()); - Avx.Store(c + 64, t_1.AsByte()); - Avx.Store(c + 128, t_2.AsByte()); - Avx.Store(c + 192, t_3.AsByte()); - Avx.Store(c + 256, t_4.AsByte()); - Avx.Store(c + 320, t_5.AsByte()); - Avx.Store(c + 384, t_6.AsByte()); - Avx.Store(c + 448, t_7.AsByte()); - // ONEOCTO exit - - m += 32; - c += 32; - - // ONEOCTO enter - OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11); - OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15); - t_8 = Avx2.Permute2x128(x_8, x_12, 0x20); - t_12 = Avx2.Permute2x128(x_8, x_12, 0x31); - t_9 = Avx2.Permute2x128(x_9, x_13, 0x20); - t_13 = Avx2.Permute2x128(x_9, x_13, 0x31); - t_10 = Avx2.Permute2x128(x_10, x_14, 0x20); - t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); - t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); - t_15 = Avx2.Permute2x128(x_11, x_15, 0x31); - t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); - t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); - t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); - t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 
192).AsUInt32()); - t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32()); - t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); - t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); - t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); - Avx.Store(c, t_8.AsByte()); - Avx.Store(c + 64, t_9.AsByte()); - Avx.Store(c + 128, t_10.AsByte()); - Avx.Store(c + 192, t_11.AsByte()); - Avx.Store(c + 256, t_12.AsByte()); - Avx.Store(c + 320, t_13.AsByte()); - Avx.Store(c + 384, t_14.AsByte()); - Avx.Store(c + 448, t_15.AsByte()); - // ONEOCTO exit - m -= 32; - c -= 32; - bytes -= 512; - c += 512; - m += 512; - } - } - if (bytes >= 256) - { - Vector128 x_0 = Vector128.Create(x[0]); - Vector128 x_1 = Vector128.Create(x[1]); - Vector128 x_2 = Vector128.Create(x[2]); - Vector128 x_3 = Vector128.Create(x[3]); - Vector128 x_4 = Vector128.Create(x[4]); - Vector128 x_5 = Vector128.Create(x[5]); - Vector128 x_6 = Vector128.Create(x[6]); - Vector128 x_7 = Vector128.Create(x[7]); - Vector128 x_8; - Vector128 x_9; - Vector128 x_10 = Vector128.Create(x[10]); - Vector128 x_11 = Vector128.Create(x[11]); - Vector128 x_12 = Vector128.Create(x[12]); - Vector128 x_13 = Vector128.Create(x[13]); - Vector128 x_14 = Vector128.Create(x[14]); - Vector128 x_15 = Vector128.Create(x[15]); - Vector128 orig0 = x_0; - Vector128 orig1 = x_1; - Vector128 orig2 = x_2; - Vector128 orig3 = x_3; - Vector128 orig4 = x_4; - Vector128 orig5 = x_5; - Vector128 orig6 = x_6; - Vector128 orig7 = x_7; - Vector128 orig8; - Vector128 orig9; - Vector128 orig10 = x_10; - Vector128 orig11 = x_11; - Vector128 orig12 = x_12; - Vector128 orig13 = x_13; - Vector128 orig14 = x_14; - Vector128 orig15 = x_15; - Vector128 t8, t9; - - while (bytes >= 256) - { - Vector128 addv8 = Vector128.Create(0, 1).AsUInt32(); - Vector128 addv9 = Vector128.Create(2, 3).AsUInt32(); - - x_0 = orig0; - x_1 = orig1; - x_2 = orig2; - x_3 = orig3; - x_4 = orig4; - x_5 = orig5; - x_6 = orig6; - x_7 = orig7; - x_10 = 
orig10; - x_11 = orig11; - x_12 = orig12; - x_13 = orig13; - x_14 = orig14; - x_15 = orig15; - - uint in8 = x[8]; - uint in9 = x[9]; - ulong in89 = in8 | ((ulong)in9) << 32; - t8 = Vector128.Create(in89).AsUInt32(); - t9 = Vector128.Create(in89).AsUInt32(); - - x_8 = Sse2.Add(Vector128.AsUInt64(addv8), Vector128.AsUInt64(t8)).AsUInt32(); - x_9 = Sse2.Add(Vector128.AsUInt64(addv9), Vector128.AsUInt64(t9)).AsUInt32(); - - t8 = Sse2.UnpackLow(x_8, x_9); - t9 = Sse2.UnpackHigh(x_8, x_9); - - x_8 = Sse2.UnpackLow(t8, t9); - x_9 = Sse2.UnpackHigh(t8, t9); - - orig8 = x_8; - orig9 = x_9; - - in89 += 4; - - x[8] = (uint)(in89 & 0xFFFFFFFF); - x[9] = (uint)(in89 >> 32 & 0xFFFFFFFF); - - for (int i = 0; i < 20; i += 2) - { - Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12); - Vec128QuarterRound(ref x_5, ref x_9, ref x_13, ref x_1); - Vec128QuarterRound(ref x_10, ref x_14, ref x_2, ref x_6); - Vec128QuarterRound(ref x_15, ref x_3, ref x_7, ref x_11); - - Vec128QuarterRound(ref x_0, ref x_1, ref x_2, ref x_3); - Vec128QuarterRound(ref x_5, ref x_6, ref x_7, ref x_4); - Vec128QuarterRound(ref x_10, ref x_11, ref x_8, ref x_9); - Vec128QuarterRound(ref x_15, ref x_12, ref x_13, ref x_14); - } - OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c); - m += 16; - c += 16; - OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c); - m += 16; - c += 16; - OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c); - m += 16; - c += 16; - OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c); - m -= 48; - c -= 48; - bytes -= 256; - c += 256; - m += 256; - } - } - while (bytes >= 64) - { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); - - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - 
// Diagonalize - x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); - - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - Vector128 orig_0 = x_0; - Vector128 orig_1 = x_1; - Vector128 orig_2 = x_2; - Vector128 orig_3 = x_3; - - for (int i = 0; i < 20; i += 2) - { - x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_0, x_3), 7)); - x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_1, x_0), 9)); - x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_2, x_1), 13)); - x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_3, x_2), 18)); - - x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - - x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_0, x_1), 7)); - x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_3, x_0), 9)); - x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_2, x_3), 13)); - x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_1, x_2), 18)); - - x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); - } - - x_0 = Sse2.Add(x_0, orig_0); - x_1 = Sse2.Add(x_1, orig_1); - x_2 = Sse2.Add(x_2, orig_2); - x_3 = Sse2.Add(x_3, orig_3); - - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - // Diagonalize - x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); - x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); - x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); - x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); - Sse2.Store(c, x_0.AsByte()); - Sse2.Store(c + 16, x_1.AsByte()); - Sse2.Store(c + 32, x_2.AsByte()); - Sse2.Store(c + 48, x_3.AsByte()); - - uint in8 = x[8]; - uint in9 = x[9]; - in8++; - if (in8 == 0) - { - in9++; - } - 
x[8] = in8; - x[9] = in9; + if (!Sse3.IsSupported) + throw new Exception("Error this vectorisation is not supported on this CPU"); - bytes -= 64; - c += 64; - m += 64; - } - if (bytes > 0) - { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); - - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - // Diagonalize - x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); - - // Transpose - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - Vector128 orig_0 = x_0; - Vector128 orig_1 = x_1; - Vector128 orig_2 = x_2; - Vector128 orig_3 = x_3; - - for (int i = 0; i < 20; i += 2) - { - x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_0, x_3), 7)); - x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_1, x_0), 9)); - x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_2, x_1), 13)); - x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_3, x_2), 18)); - - x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - - x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_0, x_1), 7)); - x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_3, x_0), 9)); - x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_2, x_3), 13)); - x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_1, x_2), 18)); - - x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); - } - - x_0 = Sse2.Add(x_0, orig_0); - x_1 = Sse2.Add(x_1, orig_1); - x_2 = Sse2.Add(x_2, orig_2); - x_3 = Sse2.Add(x_3, orig_3); - - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - // Diagonalize - x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); - - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - byte* partialblock = stackalloc byte[64]; - 
Sse2.Store(partialblock, Vector128.AsByte(x_0)); - Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); - Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); - Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); - - // TODO use vector - for (ulong i = 0; i < bytes; i++) - { - c[i] = (byte)(m[i] ^ partialblock[i]); - } - for (int n = 0; n < 64 / sizeof(int); n++) - { - ((int*)partialblock)[n] = 0; - } - } - } - - // 256 byte methods - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Transpose(ref Vector128 a, ref Vector128 b, ref Vector128 c, ref Vector128 d) - { - var w_0 = Sse2.UnpackLow(a, b).AsUInt64(); - var w_1 = Sse2.UnpackHigh(a, b).AsUInt64(); - var w_2 = Sse2.UnpackLow(c, d).AsUInt64(); - var w_3 = Sse2.UnpackHigh(c, d).AsUInt64(); - - a = Sse2.UnpackLow(w_0, w_2).AsUInt32(); - b = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - c = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - d = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); - } - - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) - { - Vector128 t_A, t_B, t_C, t_D, t0, t1, t2, t3; - x_A = Sse2.Add(x_A, origA); - x_B = Sse2.Add(x_B, origB); - x_C = Sse2.Add(x_C, origC); - x_D = Sse2.Add(x_D, origD); - t_A = Sse2.UnpackLow(x_A, x_B); - t_B = Sse2.UnpackLow(x_C, x_D); - t_C = Sse2.UnpackHigh(x_A, x_B); - t_D = Sse2.UnpackHigh(x_C, x_D); - x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); - x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); - x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); - x_D = Sse2.UnpackHigh(t_C.AsUInt64(), 
t_D.AsUInt64()).AsUInt32(); - t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); - Sse2.Store(c, t0.AsByte()); - t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32(); - Sse2.Store(c + 64, t1.AsByte()); - t2 = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128)).AsUInt32(); - Sse2.Store(c + 128, t2.AsByte()); - t3 = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192)).AsUInt32(); - Sse2.Store(c + 192, t3.AsByte()); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vec128QuarterRound(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D) + if (Avx2.IsSupported && bytes >= 512) { - x_B = Sse2.Xor(x_B, Vector128Rotate(Sse2.Add(x_A, x_D), 7)); - x_C = Sse2.Xor(x_C, Vector128Rotate(Sse2.Add(x_B, x_A), 9)); - x_D = Sse2.Xor(x_D, Vector128Rotate(Sse2.Add(x_C, x_B), 13)); - x_A = Sse2.Xor(x_A, Vector128Rotate(Sse2.Add(x_D, x_C), 18)); + Salsa512.Process(x, ref m, ref c, ref bytes); } - - // 512 byte methods - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 Vector256Rotate(Vector256 a, byte imm) => Avx2.Or(Avx2.ShiftLeftLogical(a, imm), Avx2.ShiftRightLogical(a, (byte)(32 - imm))); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, ref Vector256 C1, ref Vector256 D1, ref Vector256 A2, ref Vector256 B2, ref Vector256 C2, ref Vector256 D2, ref Vector256 A3, ref Vector256 B3, ref Vector256 C3, ref Vector256 D3, ref Vector256 A4, ref Vector256 B4, ref Vector256 C4, ref Vector256 D4) + if (bytes >= 256) { - Vector256Line1(ref A1, ref B1, ref C1, ref D1); - Vector256Line1(ref A2, ref B2, ref C2, ref D2); - Vector256Line1(ref A3, ref B3, ref C3, ref D3); - Vector256Line1(ref A4, ref B4, ref C4, ref D4); - Vector256Line2(ref A1, ref B1, ref C1, ref D1); - Vector256Line2(ref A2, ref B2, ref C2, ref D2); - Vector256Line2(ref A3, ref B3, ref C3, ref D3); - Vector256Line2(ref A4, ref B4, ref C4, ref D4); - 
Vector256Line3(ref A1, ref B1, ref C1, ref D1); - Vector256Line3(ref A2, ref B2, ref C2, ref D2); - Vector256Line3(ref A3, ref B3, ref C3, ref D3); - Vector256Line3(ref A4, ref B4, ref C4, ref D4); - Vector256Line4(ref A1, ref B1, ref C1, ref D1); - Vector256Line4(ref A2, ref B2, ref C2, ref D2); - Vector256Line4(ref A3, ref B3, ref C3, ref D3); - Vector256Line4(ref A4, ref B4, ref C4, ref D4); + Salsa256.Process(x, ref m, ref c, ref bytes); } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vector256Line1(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + while (bytes >= 64) { - x_B = Avx2.Xor(x_B, Vector256Rotate(Avx2.Add(x_A, x_D), 7)); + Salsa64.Process64(x, ref m, ref c, ref bytes); } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vector256Line2(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) - { - x_C = Avx2.Xor(x_C, Vector256Rotate(Avx2.Add(x_B, x_A), 9)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vector256Line3(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) - { - x_D = Avx2.Xor(x_D, Vector256Rotate(Avx2.Add(x_C, x_B), 13)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Vector256Line4(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) - { - x_A = Avx2.Xor(x_A, Vector256Rotate(Avx2.Add(x_D, x_C), 18)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D, ref Vector256 t_A, ref Vector256 t_B, ref Vector256 t_C, ref Vector256 t_D, ref Vector256 orig_A, ref Vector256 orig_B, ref Vector256 orig_C, ref Vector256 orig_D) + if (bytes > 0) { - x_A = Avx2.Add(x_A, orig_A); - x_B = Avx2.Add(x_B, orig_B); - x_C = Avx2.Add(x_C, orig_C); - x_D = Avx2.Add(x_D, orig_D); - t_A = Avx2.UnpackLow(x_A, x_B); - t_B = 
Avx2.UnpackLow(x_C, x_D); - t_C = Avx2.UnpackHigh(x_A, x_B); - t_D = Avx2.UnpackHigh(x_C, x_D); - x_A = Avx2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); - x_B = Avx2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); - x_C = Avx2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); - x_D = Avx2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + Salsa64.ProcessVarLength(x, ref m, ref c, ref bytes); } } } -#pragma warning restore IDE0007 // Use implicit type #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs new file mode 100644 index 0000000..5ecdfac --- /dev/null +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs @@ -0,0 +1,174 @@ +#if INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace NaCl.Core.Base.SalsaIntrinsics; + +#pragma warning disable IDE0007 // Use implicit type +internal static class Salsa256 +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong bytes) + { + Vector128 x_0 = Vector128.Create(x[0]); + Vector128 x_1 = Vector128.Create(x[1]); + Vector128 x_2 = Vector128.Create(x[2]); + Vector128 x_3 = Vector128.Create(x[3]); + Vector128 x_4 = Vector128.Create(x[4]); + Vector128 x_5 = Vector128.Create(x[5]); + Vector128 x_6 = Vector128.Create(x[6]); + Vector128 x_7 = Vector128.Create(x[7]); + Vector128 x_8; + Vector128 x_9; + Vector128 x_10 = Vector128.Create(x[10]); + Vector128 x_11 = Vector128.Create(x[11]); + Vector128 x_12 = Vector128.Create(x[12]); + Vector128 x_13 = Vector128.Create(x[13]); + Vector128 x_14 = Vector128.Create(x[14]); + Vector128 x_15 = Vector128.Create(x[15]); + Vector128 orig0 = x_0; + Vector128 orig1 = x_1; + Vector128 orig2 = x_2; + Vector128 orig3 = x_3; + Vector128 orig4 = x_4; + Vector128 orig5 = x_5; + Vector128 orig6 = x_6; + Vector128 orig7 = x_7; + 
Vector128 orig8; + Vector128 orig9; + Vector128 orig10 = x_10; + Vector128 orig11 = x_11; + Vector128 orig12 = x_12; + Vector128 orig13 = x_13; + Vector128 orig14 = x_14; + Vector128 orig15 = x_15; + Vector128 t8, t9; + + while (bytes >= 256) + { + Vector128 addv8 = Vector128.Create(0, 1).AsUInt32(); + Vector128 addv9 = Vector128.Create(2, 3).AsUInt32(); + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_10 = orig10; + x_11 = orig11; + x_12 = orig12; + x_13 = orig13; + x_14 = orig14; + x_15 = orig15; + + uint in8 = x[8]; + uint in9 = x[9]; + ulong in89 = in8 | ((ulong)in9) << 32; + t8 = Vector128.Create(in89).AsUInt32(); + t9 = Vector128.Create(in89).AsUInt32(); + + x_8 = Sse2.Add(Vector128.AsUInt64(addv8), Vector128.AsUInt64(t8)).AsUInt32(); + x_9 = Sse2.Add(Vector128.AsUInt64(addv9), Vector128.AsUInt64(t9)).AsUInt32(); + + t8 = Sse2.UnpackLow(x_8, x_9); + t9 = Sse2.UnpackHigh(x_8, x_9); + + x_8 = Sse2.UnpackLow(t8, t9); + x_9 = Sse2.UnpackHigh(t8, t9); + + orig8 = x_8; + orig9 = x_9; + + in89 += 4; + + x[8] = (uint)(in89 & 0xFFFFFFFF); + x[9] = (uint)(in89 >> 32 & 0xFFFFFFFF); + + for (int i = 0; i < 20; i += 2) + { + Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12); + Vec128QuarterRound(ref x_5, ref x_9, ref x_13, ref x_1); + Vec128QuarterRound(ref x_10, ref x_14, ref x_2, ref x_6); + Vec128QuarterRound(ref x_15, ref x_3, ref x_7, ref x_11); + + Vec128QuarterRound(ref x_0, ref x_1, ref x_2, ref x_3); + Vec128QuarterRound(ref x_5, ref x_6, ref x_7, ref x_4); + Vec128QuarterRound(ref x_10, ref x_11, ref x_8, ref x_9); + Vec128QuarterRound(ref x_15, ref x_12, ref x_13, ref x_14); + } + OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c); + m += 16; + c += 16; + OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c); + m += 16; + c += 16; + OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref 
orig9, ref orig10, ref orig11, m, c); + m += 16; + c += 16; + OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c); + m -= 48; + c -= 48; + bytes -= 256; + c += 256; + m += 256; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Transpose(ref Vector128 a, ref Vector128 b, ref Vector128 c, ref Vector128 d) + { + var w_0 = Sse2.UnpackLow(a, b).AsUInt64(); + var w_1 = Sse2.UnpackHigh(a, b).AsUInt64(); + var w_2 = Sse2.UnpackLow(c, d).AsUInt64(); + var w_3 = Sse2.UnpackHigh(c, d).AsUInt64(); + + a = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + b = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + c = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + d = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) + { + Vector128 t_A, t_B, t_C, t_D, t0, t1, t2, t3; + x_A = Sse2.Add(x_A, origA); + x_B = Sse2.Add(x_B, origB); + x_C = Sse2.Add(x_C, origC); + x_D = Sse2.Add(x_D, origD); + t_A = Sse2.UnpackLow(x_A, x_B); + t_B = Sse2.UnpackLow(x_C, x_D); + t_C = Sse2.UnpackHigh(x_A, x_B); + t_D = Sse2.UnpackHigh(x_C, x_D); + x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + Sse2.Store(c, t0.AsByte()); + t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32(); + Sse2.Store(c + 
64, t1.AsByte()); + t2 = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128)).AsUInt32(); + Sse2.Store(c + 128, t2.AsByte()); + t3 = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192)).AsUInt32(); + Sse2.Store(c + 192, t3.AsByte()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec128QuarterRound(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D) + { + x_B = Sse2.Xor(x_B, Vector128Rotate(Sse2.Add(x_A, x_D), 7)); + x_C = Sse2.Xor(x_C, Vector128Rotate(Sse2.Add(x_B, x_A), 9)); + x_D = Sse2.Xor(x_D, Vector128Rotate(Sse2.Add(x_C, x_B), 13)); + x_A = Sse2.Xor(x_A, Vector128Rotate(Sse2.Add(x_D, x_C), 18)); + } +} +#pragma warning restore IDE0007 // Use implicit type +#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs new file mode 100644 index 0000000..2fea411 --- /dev/null +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs @@ -0,0 +1,231 @@ +#if INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace NaCl.Core.Base.SalsaIntrinsics; + +#pragma warning disable IDE0007 // Use implicit type +internal static class Salsa512 +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong bytes) + { + Vector256 x_0 = Vector256.Create(x[0]); + Vector256 x_1 = Vector256.Create(x[1]); + Vector256 x_2 = Vector256.Create(x[2]); + Vector256 x_3 = Vector256.Create(x[3]); + Vector256 x_4 = Vector256.Create(x[4]); + Vector256 x_5 = Vector256.Create(x[5]); + Vector256 x_6 = Vector256.Create(x[6]); + Vector256 x_7 = Vector256.Create(x[7]); + Vector256 x_8; + Vector256 x_9; + Vector256 x_10 = Vector256.Create(x[10]); + Vector256 x_11 = Vector256.Create(x[11]); + Vector256 x_12 = Vector256.Create(x[12]); + Vector256 x_13 = Vector256.Create(x[13]); + Vector256 x_14 = Vector256.Create(x[14]); 
+ Vector256 x_15 = Vector256.Create(x[15]); + + Vector256 orig0 = x_0; + Vector256 orig1 = x_1; + Vector256 orig2 = x_2; + Vector256 orig3 = x_3; + Vector256 orig4 = x_4; + Vector256 orig5 = x_5; + Vector256 orig6 = x_6; + Vector256 orig7 = x_7; + Vector256 orig8; + Vector256 orig9; + Vector256 orig10 = x_10; + Vector256 orig11 = x_11; + Vector256 orig12 = x_12; + Vector256 orig13 = x_13; + Vector256 orig14 = x_14; + Vector256 orig15 = x_15; + + while (bytes >= 512) + { + Vector256 addv8 = Vector256.Create(0, 1, 2, 3).AsUInt32(); + Vector256 addv9 = Vector256.Create(4, 5, 6, 7).AsUInt32(); + Vector256 permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32(); + Vector256 t8, t9; + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_10 = orig10; + x_11 = orig11; + x_12 = orig12; + x_13 = orig13; + x_14 = orig14; + x_15 = orig15; + + uint in8 = x[8]; + uint in9 = x[9]; + ulong in89 = in8 | ((ulong)in9 << 32); + x_8 = x_9 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in89)).AsUInt32(); + t8 = Avx2.Add(addv8.AsUInt64(), x_8.AsUInt64()).AsUInt32(); + t9 = Avx2.Add(addv9.AsUInt64(), x_9.AsUInt64()).AsUInt32(); + x_8 = Avx2.UnpackLow(t8, t9); + x_9 = Avx2.UnpackHigh(t8, t9); + t8 = Avx2.UnpackLow(x_8, x_9); + t9 = Avx2.UnpackHigh(x_8, x_9); + x_8 = Avx2.PermuteVar8x32(t8, permute); + x_9 = Avx2.PermuteVar8x32(t9, permute); + + orig8 = x_8; + orig9 = x_9; + + in89 += 8; + + x[8] = (uint)(in89 & 0xFFFFFFFF); + x[9] = (uint)((in89 >> 32) & 0xFFFFFFFF); + for (int i = 0; i < 20; i += 2) + { + Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_5, ref x_9, ref x_13, ref x_1, ref x_10, ref x_14, ref x_2, ref x_6, ref x_15, ref x_3, ref x_7, ref x_11); + Vec256Round(ref x_0, ref x_1, ref x_2, ref x_3, ref x_5, ref x_6, ref x_7, ref x_4, ref x_10, ref x_11, ref x_8, ref x_9, ref x_15, ref x_12, ref x_13, ref x_14); + } + + Vector256 t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, 
t_9, t_10, t_11, t_12, t_13, t_14, t_15; + t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0); + // ONEOCTO enter + OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3); + OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); + t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); + t_4 = Avx2.Permute2x128(x_0, x_4, 0x31); + t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); + t_5 = Avx2.Permute2x128(x_1, x_5, 0x31); + t_2 = Avx2.Permute2x128(x_2, x_6, 0x20); + t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); + t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); + t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); + t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); + t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); + t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); + t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32()); + t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32()); + t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); + t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); + t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_0.AsByte()); + Avx.Store(c + 64, t_1.AsByte()); + Avx.Store(c + 128, t_2.AsByte()); + Avx.Store(c + 192, t_3.AsByte()); + Avx.Store(c + 256, t_4.AsByte()); + Avx.Store(c + 320, t_5.AsByte()); + Avx.Store(c + 384, t_6.AsByte()); + Avx.Store(c + 448, t_7.AsByte()); + // ONEOCTO exit + + m += 32; + c += 32; + + // ONEOCTO enter + OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11); + OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15); + t_8 = Avx2.Permute2x128(x_8, x_12, 0x20); + t_12 = Avx2.Permute2x128(x_8, x_12, 
0x31); + t_9 = Avx2.Permute2x128(x_9, x_13, 0x20); + t_13 = Avx2.Permute2x128(x_9, x_13, 0x31); + t_10 = Avx2.Permute2x128(x_10, x_14, 0x20); + t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); + t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); + t_15 = Avx2.Permute2x128(x_11, x_15, 0x31); + t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); + t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); + t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); + t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32()); + t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32()); + t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); + t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); + t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_8.AsByte()); + Avx.Store(c + 64, t_9.AsByte()); + Avx.Store(c + 128, t_10.AsByte()); + Avx.Store(c + 192, t_11.AsByte()); + Avx.Store(c + 256, t_12.AsByte()); + Avx.Store(c + 320, t_13.AsByte()); + Avx.Store(c + 384, t_14.AsByte()); + Avx.Store(c + 448, t_15.AsByte()); + // ONEOCTO exit + m -= 32; + c -= 32; + bytes -= 512; + c += 512; + m += 512; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 Vector256Rotate(Vector256 a, byte imm) => Avx2.Or(Avx2.ShiftLeftLogical(a, imm), Avx2.ShiftRightLogical(a, (byte)(32 - imm))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, ref Vector256 C1, ref Vector256 D1, ref Vector256 A2, ref Vector256 B2, ref Vector256 C2, ref Vector256 D2, ref Vector256 A3, ref Vector256 B3, ref Vector256 C3, ref Vector256 D3, ref Vector256 A4, ref Vector256 B4, ref Vector256 C4, ref Vector256 D4) + { + Vector256Line1(ref A1, ref B1, ref C1, ref D1); + Vector256Line1(ref A2, ref B2, ref C2, ref D2); + Vector256Line1(ref A3, ref B3, ref C3, ref D3); + Vector256Line1(ref A4, ref B4, ref C4, ref D4); + Vector256Line2(ref A1, ref B1, ref C1, ref D1); + 
Vector256Line2(ref A2, ref B2, ref C2, ref D2); + Vector256Line2(ref A3, ref B3, ref C3, ref D3); + Vector256Line2(ref A4, ref B4, ref C4, ref D4); + Vector256Line3(ref A1, ref B1, ref C1, ref D1); + Vector256Line3(ref A2, ref B2, ref C2, ref D2); + Vector256Line3(ref A3, ref B3, ref C3, ref D3); + Vector256Line3(ref A4, ref B4, ref C4, ref D4); + Vector256Line4(ref A1, ref B1, ref C1, ref D1); + Vector256Line4(ref A2, ref B2, ref C2, ref D2); + Vector256Line4(ref A3, ref B3, ref C3, ref D3); + Vector256Line4(ref A4, ref B4, ref C4, ref D4); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line1(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_B = Avx2.Xor(x_B, Vector256Rotate(Avx2.Add(x_A, x_D), 7)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line2(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_C = Avx2.Xor(x_C, Vector256Rotate(Avx2.Add(x_B, x_A), 9)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line3(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_D = Avx2.Xor(x_D, Vector256Rotate(Avx2.Add(x_C, x_B), 13)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line4(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_A = Avx2.Xor(x_A, Vector256Rotate(Avx2.Add(x_D, x_C), 18)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D, ref Vector256 t_A, ref Vector256 t_B, ref Vector256 t_C, ref Vector256 t_D, ref Vector256 orig_A, ref Vector256 orig_B, ref Vector256 orig_C, ref Vector256 orig_D) + { + x_A = Avx2.Add(x_A, orig_A); + x_B = Avx2.Add(x_B, orig_B); + x_C = Avx2.Add(x_C, orig_C); + x_D = Avx2.Add(x_D, orig_D); + t_A = Avx2.UnpackLow(x_A, 
x_B); + t_B = Avx2.UnpackLow(x_C, x_D); + t_C = Avx2.UnpackHigh(x_A, x_B); + t_D = Avx2.UnpackHigh(x_C, x_D); + x_A = Avx2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Avx2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Avx2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Avx2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + } +} +#pragma warning restore IDE0007 // Use implicit type +#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs new file mode 100644 index 0000000..101a9cc --- /dev/null +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -0,0 +1,154 @@ +#if INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace NaCl.Core.Base.SalsaIntrinsics; + +#pragma warning disable IDE0007 // Use implicit type +internal static class Salsa64 +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong bytes) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); + x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); + x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); + + Sse2.Store(c, x_0.AsByte()); + Sse2.Store(c + 16, x_1.AsByte()); + Sse2.Store(c + 32, x_2.AsByte()); + Sse2.Store(c + 48, 
x_3.AsByte()); + + uint in8 = x[8]; + uint in9 = x[9]; + in8++; + if (in8 == 0) + { + in9++; + } + x[8] = in8; + x[9] = in9; + + bytes -= 64; + c += 64; + m += 64; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, ref ulong bytes) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + byte* partialblock = stackalloc byte[64]; + Sse2.Store(partialblock, Vector128.AsByte(x_0)); + Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + + // TODO use vector + for (ulong i = 0; i < bytes; i++) + { + c[i] = (byte)(m[i] ^ partialblock[i]); + } + for (int n = 0; n < 64 / sizeof(int); n++) + { + ((int*)partialblock)[n] = 0; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 x_1, ref Vector128 x_2, ref Vector128 x_3) + { + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + + // Diagonalize + x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); + x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); + + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + + for (int i = 0; i < 20; i += 2) + { + x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_0, x_3), 7)); + x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_1, x_0), 9)); + x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_2, x_1), 13)); + x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_3, x_2), 18)); + + x_1 = Sse2.Shuffle(x_1, 
0b_10_01_00_11); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); + x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); + + x_3 = Sse2.Xor(x_3, Vector128Rotate(Sse2.Add(x_0, x_1), 7)); + x_2 = Sse2.Xor(x_2, Vector128Rotate(Sse2.Add(x_3, x_0), 9)); + x_1 = Sse2.Xor(x_1, Vector128Rotate(Sse2.Add(x_2, x_3), 13)); + x_0 = Sse2.Xor(x_0, Vector128Rotate(Sse2.Add(x_1, x_2), 18)); + + x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); + x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); + } + + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + + // Diagonalize + x_1 = Sse2.Shuffle(x_1, 0b_10_01_00_11); + x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); + x_3 = Sse2.Shuffle(x_3, 0b_00_11_10_01); + + Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Transpose(ref Vector128 a, ref Vector128 b, ref Vector128 c, ref Vector128 d) + { + var w_0 = Sse2.UnpackLow(a, b).AsUInt64(); + var w_1 = Sse2.UnpackHigh(a, b).AsUInt64(); + var w_2 = Sse2.UnpackLow(c, d).AsUInt64(); + var w_3 = Sse2.UnpackHigh(c, d).AsUInt64(); + + a = Sse2.UnpackLow(w_0, w_2).AsUInt32(); + b = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); + c = Sse2.UnpackLow(w_1, w_3).AsUInt32(); + d = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); + } +} +#pragma warning restore IDE0007 // Use implicit type +#endif \ No newline at end of file From 27b544d1b5f3a2799e88e60291d46b68fbe1a7c4 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 12 Oct 2022 15:28:52 +0100 Subject: [PATCH 24/59] Added HChaCha and HSalsa --- src/NaCl.Core/Base/ChaCha20Base.cs | 8 ++++++ src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 8 ++++++ .../Base/ChaChaIntrinsics/ChaCha64.cs | 19 +++++++++++++ src/NaCl.Core/Base/Salsa20Base.cs | 8 ++++++ 
src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 8 ++++++ src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 28 +++++++++++++++++++ 6 files changed, 79 insertions(+) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 25a73af..8a85450 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -82,6 +82,14 @@ public void HChaCha20(Span subKey, ReadOnlySpan nonce) // Setting HChaCha20 initial state HChaCha20InitialState(state, nonce); +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported) + { + ChaCha20BaseIntrinsics.HChaCha20(subKey, state); + return; + } +#endif + // Block function ShuffleState(state); diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index 6482d18..89d96bc 100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -32,5 +32,13 @@ public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) ChaCha64.ProcessVarLength(x, ref m, ref c, ref bytes); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void HChaCha20(Span subKey, ReadOnlySpan state) + { + if (!Sse3.IsSupported) + throw new Exception("Error this vectorisation is not supported on this CPU"); + ChaCha64.HChaCha20(subKey, state); + } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index 192dd45..575082b 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -2,6 +2,7 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; +using System; namespace NaCl.Core.Base.ChaChaIntrinsics; @@ -92,6 +93,24 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re } [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static 
unsafe void HChaCha20(Span subKey, ReadOnlySpan state) + { + fixed(uint* x = state) + fixed(byte* sk = subKey) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + Sse2.Store(sk, Vector128.AsByte(x_0)); + Sse2.Store(sk + 16, Vector128.AsByte(x_3)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 x_1, ref Vector128 x_2, ref Vector128 x_3) { Vector128 t_1; diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index dfb091c..e0f3257 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -81,6 +81,14 @@ public void HSalsa20(Span subKey, ReadOnlySpan nonce) // Setting HSalsa20 initial state HSalsa20InitialState(state, nonce); +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported) + { + Salsa20BaseIntrinsics.HSalsa20(subKey, state); + return; + } +#endif + // Block function ShuffleState(state); diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 114722a..c80c05e 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -32,5 +32,13 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) Salsa64.ProcessVarLength(x, ref m, ref c, ref bytes); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void HSalsa20(Span subKey, ReadOnlySpan state) + { + if (!Sse3.IsSupported) + throw new Exception("Error this vectorisation is not supported on this CPU"); + Salsa64.HSalsa20(subKey, state); + } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index 101a9cc..241e6ce 100644 --- 
a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -2,6 +2,7 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; +using System; namespace NaCl.Core.Base.SalsaIntrinsics; @@ -90,6 +91,33 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void HSalsa20(Span subKey, ReadOnlySpan state) + { + fixed (uint* x = state) + fixed (byte* sk = subKey) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + // <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15> + Vector128 t_0 = Avx2.Blend(x_0, x_1, 0b_00_00_00_10); + Vector128 t_1 = Avx2.Blend(x_2, x_3, 0b_00_00_10_00); + Vector128 t_2 = Avx2.Blend(t_0, t_1, 0b_00_00_11_00); + + // Get <8, 9, 6, 7> then shuffle to <6, 7, 8, 9> + Vector128 t_3 = Avx2.Blend(x_1, x_2, 0b_00_00_00_11); + t_3 = Sse2.Shuffle(t_3, 0b_01_00_11_10); + + Sse2.Store(sk, Vector128.AsByte(t_2)); + Sse2.Store(sk + 16, Vector128.AsByte(t_3)); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 x_1, ref Vector128 x_2, ref Vector128 x_3) { From 79ca4ce71f953d9799566d58311dea6118ed4252 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Thu, 13 Oct 2022 23:34:57 +0100 Subject: [PATCH 25/59] Added little endian check --- src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 5 +++-- src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs | 1 + src/NaCl.Core/Base/Salsa20Base.cs | 2 +- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 5 +++-- src/NaCl.Core/Base/Snuffle.cs | 3 +-- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs 
b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index 89d96bc..4def507 100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -12,7 +12,7 @@ public static class ChaCha20BaseIntrinsics [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) { - if (!Sse3.IsSupported) + if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); if (Avx2.IsSupported && bytes >= 512) @@ -36,8 +36,9 @@ public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void HChaCha20(Span subKey, ReadOnlySpan state) { - if (!Sse3.IsSupported) + if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); + ChaCha64.HChaCha20(subKey, state); } } diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs index f31550b..8b9dc66 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs @@ -100,6 +100,7 @@ public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong b // ONEOCTO enter OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3); OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); + t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); t_4 = Avx2.Permute2x128(x_0, x_4, 0x31); t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index e0f3257..1733e3f 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -82,7 +82,7 @@ public void HSalsa20(Span subKey, ReadOnlySpan 
nonce) HSalsa20InitialState(state, nonce); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) { Salsa20BaseIntrinsics.HSalsa20(subKey, state); return; diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index c80c05e..b798fbf 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -12,7 +12,7 @@ public static class Salsa20BaseIntrinsics [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) { - if (!Sse3.IsSupported) + if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); if (Avx2.IsSupported && bytes >= 512) @@ -36,8 +36,9 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void HSalsa20(Span subKey, ReadOnlySpan state) { - if (!Sse3.IsSupported) + if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); + Salsa64.HSalsa20(subKey, state); } } diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index 506efeb..e6542b6 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -8,7 +8,6 @@ using System.Runtime.Intrinsics.X86; #endif - /// /// Abstract base class for XSalsa20, ChaCha20, XChaCha20 and their variants. 
/// @@ -128,7 +127,7 @@ public void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan nonce, Spa private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) { #if INTRINSICS - if (Sse3.IsSupported) + if (Sse3.IsSupported && BitConverter.IsLittleEndian) { ProcessStream(nonce, output, input, InitialCounter, offset); return; From 589912ebcdbfbaea567975d8135a185f50ad21d0 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Fri, 14 Oct 2022 16:08:02 +0100 Subject: [PATCH 26/59] Added intrinsics process stream --- src/NaCl.Core/Base/ChaCha20Base.cs | 15 ++++++++- src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 9 ++++++ .../Base/ChaChaIntrinsics/ChaCha64.cs | 31 ++++++++++++++++++- src/NaCl.Core/Base/Salsa20Base.cs | 12 +++++++ src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 9 ++++++ src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 31 +++++++++++++++++++ 6 files changed, 105 insertions(+), 2 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 8a85450..5d52631 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -2,6 +2,7 @@ { using System; using System.Runtime.CompilerServices; + using System.Runtime.InteropServices; using System.Security.Cryptography; using Internal; @@ -41,6 +42,17 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; SetInitialState(state, nonce, counter); +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + { + Span stateBytes = MemoryMarshal.Cast(state); + ChaCha20BaseIntrinsics.ChaCha20KeyStream(stateBytes); + stateBytes.CopyTo(block); + + return; + } +#endif + // Create a copy of the state and then run 20 rounds on it, // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. 
Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; @@ -59,6 +71,7 @@ public override unsafe void ProcessStream(ReadOnlySpan nonce, Span o { Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; SetInitialState(state, nonce, initialCounter); + fixed(uint* x = state) fixed (byte* m = input, c = output.Slice(offset)) { @@ -83,7 +96,7 @@ public void HChaCha20(Span subKey, ReadOnlySpan nonce) HChaCha20InitialState(state, nonce); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) { ChaCha20BaseIntrinsics.HChaCha20(subKey, state); return; diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index 4def507..3b8326d 100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -41,5 +41,14 @@ public static unsafe void HChaCha20(Span subKey, ReadOnlySpan state) ChaCha64.HChaCha20(subKey, state); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void ChaCha20KeyStream(Span state) + { + if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) + throw new Exception("Error this vectorisation is not supported on this CPU"); + + ChaCha64.KeyStream64(state); + } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index 575082b..9b2f972 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -110,7 +110,36 @@ public static unsafe void HChaCha20(Span subKey, ReadOnlySpan state) } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void KeyStream64(Span state) + { + fixed (byte* x = state) + { + Vector128 x_0 = Sse2.LoadVector128(x).AsUInt32(); + Vector128 x_1 = Sse2.LoadVector128(x + 16).AsUInt32(); + Vector128 x_2 = 
Sse2.LoadVector128(x + 32).AsUInt32(); + Vector128 x_3 = Sse2.LoadVector128(x + 48).AsUInt32(); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + Sse2.Store(x, x_0.AsByte()); + Sse2.Store(x + 16, x_1.AsByte()); + Sse2.Store(x + 32, x_2.AsByte()); + Sse2.Store(x + 48, x_3.AsByte()); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 x_1, ref Vector128 x_2, ref Vector128 x_3) { Vector128 t_1; diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 1733e3f..61eeab6 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -2,6 +2,7 @@ { using System; using System.Runtime.CompilerServices; + using System.Runtime.InteropServices; using System.Security.Cryptography; using Internal; @@ -40,6 +41,17 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; SetInitialState(state, nonce, counter); +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + { + Span stateBytes = MemoryMarshal.Cast(state); + Salsa20BaseIntrinsics.Salsa20KeyStream(stateBytes); + stateBytes.CopyTo(block); + + return; + } +#endif + // Create a copy of the state and then run 20 rounds on it, // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. 
Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index b798fbf..7c406be 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -41,5 +41,14 @@ public static unsafe void HSalsa20(Span subKey, ReadOnlySpan state) Salsa64.HSalsa20(subKey, state); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Salsa20KeyStream(Span state) + { + if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) + throw new Exception("Error this vectorisation is not supported on this CPU"); + + Salsa64.KeyStream64(state); + } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index 241e6ce..42c2649 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -29,6 +29,7 @@ public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); + // Xor the key stream and message to obtain the cipher. x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); @@ -39,6 +40,7 @@ public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong Sse2.Store(c + 32, x_2.AsByte()); Sse2.Store(c + 48, x_3.AsByte()); + // Increment 64 bit counter for the original state. 
uint in8 = x[8]; uint in9 = x[9]; in8++; @@ -118,6 +120,35 @@ public static unsafe void HSalsa20(Span subKey, ReadOnlySpan state) } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void KeyStream64(Span state) + { + fixed (byte* x = state) + { + Vector128 x_0 = Sse2.LoadVector128(x).AsUInt32(); + Vector128 x_1 = Sse2.LoadVector128(x + 16).AsUInt32(); + Vector128 x_2 = Sse2.LoadVector128(x + 32).AsUInt32(); + Vector128 x_3 = Sse2.LoadVector128(x + 48).AsUInt32(); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + Sse2.Store(x, x_0.AsByte()); + Sse2.Store(x + 16, x_1.AsByte()); + Sse2.Store(x + 32, x_2.AsByte()); + Sse2.Store(x + 48, x_3.AsByte()); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 x_1, ref Vector128 x_2, ref Vector128 x_3) { From 5deec51987f10764bd9f25bca1cb8245994c81ea Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Fri, 14 Oct 2022 16:08:25 +0100 Subject: [PATCH 27/59] Added Salsa ProcessKeyStreamBlock test --- test/NaCl.Core.Tests/Salsa20Tests.cs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/NaCl.Core.Tests/Salsa20Tests.cs b/test/NaCl.Core.Tests/Salsa20Tests.cs index db33110..d5b5f65 100644 --- a/test/NaCl.Core.Tests/Salsa20Tests.cs +++ b/test/NaCl.Core.Tests/Salsa20Tests.cs @@ -263,6 +263,30 @@ public void Salsa20BlockWhenLengthIsInvalidFails() act.Should().Throw(); } + [Fact] + public void Salsa20BlockTestVector() + { + // Arrange + var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); + var nonce = 
CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a".Replace(":", string.Empty)); + var counter = 1; + + // Act + var salsa20 = new Salsa20(key, 1); + var output = new byte[Salsa20.BLOCK_SIZE_IN_BYTES]; + salsa20.ProcessKeyStreamBlock(nonce, counter, output); + + // Assert + var expected = new uint[16] + { + 3649387971u, 3432934094u, 2867581180u, 544842727u, + 3442094382u, 3233001746u, 2484653980u, 586338650u, + 3037335121u, 3388889956u, 1351682463u, 2284954070u, + 3021171268u, 2617586057u, 3288245149u, 2763695160u }; + + output.ToUInt16Array().Should().Equal(expected); + } + public static IEnumerable Salsa20TestData => ParseTestVectors(GetTestVector()).Select(d => new object[] { d }); [Theory] From 37b41ab44f0fdd6b1bdeef0a0e412556758ca974 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 15 Oct 2022 00:09:22 +0100 Subject: [PATCH 28/59] Minor HSalsa & KeyStream api changes --- src/NaCl.Core/Base/ChaCha20Base.cs | 7 ++---- src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 8 +++---- .../Base/ChaChaIntrinsics/ChaCha64.cs | 23 ++++++++++--------- src/NaCl.Core/Base/Salsa20Base.cs | 7 ++---- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 8 +++---- src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 23 ++++++++++--------- 6 files changed, 36 insertions(+), 40 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 5d52631..13fd1a2 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -45,10 +45,7 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter #if INTRINSICS if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) { - Span stateBytes = MemoryMarshal.Cast(state); - ChaCha20BaseIntrinsics.ChaCha20KeyStream(stateBytes); - stateBytes.CopyTo(block); - + ChaCha20BaseIntrinsics.ChaCha20KeyStream(state, block); return; } #endif @@ -98,7 +95,7 @@ public void HChaCha20(Span subKey, ReadOnlySpan nonce) #if INTRINSICS if 
(System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) { - ChaCha20BaseIntrinsics.HChaCha20(subKey, state); + ChaCha20BaseIntrinsics.HChaCha20(state, subKey); return; } #endif diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index 3b8326d..3ce192b 100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -34,21 +34,21 @@ public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void HChaCha20(Span subKey, ReadOnlySpan state) + public static unsafe void HChaCha20(ReadOnlySpan state, Span subKey) { if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - ChaCha64.HChaCha20(subKey, state); + ChaCha64.HChaCha20(state, subKey); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void ChaCha20KeyStream(Span state) + public static unsafe void ChaCha20KeyStream(ReadOnlySpan state, Span output) { if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - ChaCha64.KeyStream64(state); + ChaCha64.KeyStream64(state, output); } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index 9b2f972..7121038 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -93,7 +93,7 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void HChaCha20(Span subKey, ReadOnlySpan state) + public static unsafe void HChaCha20(ReadOnlySpan state, Span subKey) { fixed(uint* x = state) fixed(byte* sk = subKey) @@ -111,14 +111,15 @@ public 
static unsafe void HChaCha20(Span subKey, ReadOnlySpan state) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void KeyStream64(Span state) + public static unsafe void KeyStream64(ReadOnlySpan state, Span output) { - fixed (byte* x = state) + fixed (byte* k = output) + fixed (uint* x = state) { - Vector128 x_0 = Sse2.LoadVector128(x).AsUInt32(); - Vector128 x_1 = Sse2.LoadVector128(x + 16).AsUInt32(); - Vector128 x_2 = Sse2.LoadVector128(x + 32).AsUInt32(); - Vector128 x_3 = Sse2.LoadVector128(x + 48).AsUInt32(); + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -132,10 +133,10 @@ public static unsafe void KeyStream64(Span state) x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - Sse2.Store(x, x_0.AsByte()); - Sse2.Store(x + 16, x_1.AsByte()); - Sse2.Store(x + 32, x_2.AsByte()); - Sse2.Store(x + 48, x_3.AsByte()); + Sse2.Store(k, x_0.AsByte()); + Sse2.Store(k + 16, x_1.AsByte()); + Sse2.Store(k + 32, x_2.AsByte()); + Sse2.Store(k + 48, x_3.AsByte()); } } diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 61eeab6..bccd6ef 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -44,10 +44,7 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter #if INTRINSICS if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) { - Span stateBytes = MemoryMarshal.Cast(state); - Salsa20BaseIntrinsics.Salsa20KeyStream(stateBytes); - stateBytes.CopyTo(block); - + Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); return; } #endif @@ -96,7 +93,7 @@ public void HSalsa20(Span subKey, ReadOnlySpan nonce) #if INTRINSICS if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) { - Salsa20BaseIntrinsics.HSalsa20(subKey, state); + 
Salsa20BaseIntrinsics.HSalsa20(state, subKey); return; } #endif diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 7c406be..0ec174a 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -34,21 +34,21 @@ public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void HSalsa20(Span subKey, ReadOnlySpan state) + public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) { if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - Salsa64.HSalsa20(subKey, state); + Salsa64.HSalsa20(state, subKey); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void Salsa20KeyStream(Span state) + public static unsafe void Salsa20KeyStream(ReadOnlySpan state, Span output) { if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - Salsa64.KeyStream64(state); + Salsa64.KeyStream64(state, output); } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index 42c2649..b249c78 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -94,7 +94,7 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void HSalsa20(Span subKey, ReadOnlySpan state) + public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) { fixed (uint* x = state) fixed (byte* sk = subKey) @@ -121,14 +121,15 @@ public static unsafe void HSalsa20(Span subKey, ReadOnlySpan state) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void KeyStream64(Span state) 
+ public static unsafe void KeyStream64(ReadOnlySpan state, Span output) { - fixed (byte* x = state) + fixed (byte* k = output) + fixed (uint* x = state) { - Vector128 x_0 = Sse2.LoadVector128(x).AsUInt32(); - Vector128 x_1 = Sse2.LoadVector128(x + 16).AsUInt32(); - Vector128 x_2 = Sse2.LoadVector128(x + 32).AsUInt32(); - Vector128 x_3 = Sse2.LoadVector128(x + 48).AsUInt32(); + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; @@ -142,10 +143,10 @@ public static unsafe void KeyStream64(Span state) x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - Sse2.Store(x, x_0.AsByte()); - Sse2.Store(x + 16, x_1.AsByte()); - Sse2.Store(x + 32, x_2.AsByte()); - Sse2.Store(x + 48, x_3.AsByte()); + Sse2.Store(k, x_0.AsByte()); + Sse2.Store(k + 16, x_1.AsByte()); + Sse2.Store(k + 32, x_2.AsByte()); + Sse2.Store(k + 48, x_3.AsByte()); } } From e26c7fb7e5cf4a2f376570457f1d15f948d6ab4c Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 15 Oct 2022 10:33:27 +0100 Subject: [PATCH 29/59] Remove pre processor per method system checks. 
Instead using either a normal Core or IntrinsicsCore if supported --- src/NaCl.Core/Base/ChaCha20Base.cs | 1 - src/NaCl.Core/Base/ISalsa20Core.cs | 9 ++ src/NaCl.Core/Base/Salsa20Base.cs | 104 ++++++-------------- src/NaCl.Core/Base/Salsa20Core.cs | 75 ++++++++++++++ src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs | 64 ++++++++++++ src/NaCl.Core/Base/Snuffle.cs | 18 ++-- src/NaCl.Core/Salsa20.cs | 2 +- src/NaCl.Core/XSalsa20.cs | 2 +- 8 files changed, 189 insertions(+), 86 deletions(-) create mode 100644 src/NaCl.Core/Base/ISalsa20Core.cs create mode 100644 src/NaCl.Core/Base/Salsa20Core.cs create mode 100644 src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 13fd1a2..871cec9 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -86,7 +86,6 @@ public override unsafe void ProcessStream(ReadOnlySpan nonce, Span o public void HChaCha20(Span subKey, ReadOnlySpan nonce) { // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. 
- Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; // Setting HChaCha20 initial state diff --git a/src/NaCl.Core/Base/ISalsa20Core.cs b/src/NaCl.Core/Base/ISalsa20Core.cs new file mode 100644 index 0000000..1414275 --- /dev/null +++ b/src/NaCl.Core/Base/ISalsa20Core.cs @@ -0,0 +1,9 @@ +using System; + +namespace NaCl.Core.Base; +internal interface ISalsa20Core +{ + void HSalsa20(Span subKey, ReadOnlySpan nonce); + void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0); + void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); +} \ No newline at end of file diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index bccd6ef..4260a6f 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -4,7 +4,9 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Security.Cryptography; - +#if INTRINSICS + using System.Runtime.Intrinsics.X86; +#endif using Internal; /// @@ -13,12 +15,32 @@ /// public abstract class Salsa20Base : Snuffle { + readonly ISalsa20Core salsa20Core; + /// /// Initializes a new instance of the class. /// /// The key. /// The initial counter. - protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { } + protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) + { +#if INTRINSICS + if (Sse3.IsSupported) + { + salsa20Core = new Salsa20CoreIntrinsics(this); + } + else + { + salsa20Core = new Salsa20Core(this); + } +#else + salsa20Core = new Salsa20Core(this); +#endif + } + +#if INTRINSICS + public override void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) => throw new NotImplementedException(); +#endif /// public override int BlockSizeInBytes => BLOCK_SIZE_IN_BYTES; @@ -30,50 +52,13 @@ protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, /// The state. /// The nonce. 
/// The counter. - protected abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); + internal protected abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); /// - public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - SetInitialState(state, nonce, counter); - -#if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) - { - Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); - return; - } -#endif - - // Create a copy of the state and then run 20 rounds on it, - // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. - Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; - state.CopyTo(workingState); - ShuffleState(workingState); + internal protected override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => salsa20Core.Process(nonce, output, input, offset); - // At the end of the rounds, add the result to the original state. 
- for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) - state[i] += workingState[i]; - - ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); - } - -#if INTRINSICS - public override unsafe void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) - { - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - SetInitialState(state, nonce, initialCounter); - fixed (uint* x = state) - fixed (byte* m = input, c = output.Slice(offset)) - { - Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); - } - } -#endif + /// + public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) => salsa20Core.ProcessKeyStreamBlock(nonce, counter, block); /// /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . @@ -81,36 +66,7 @@ public override unsafe void ProcessStream(ReadOnlySpan nonce, Span o /// The subKey. /// The nonce. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HSalsa20(Span subKey, ReadOnlySpan nonce) - { - // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 - - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; - - // Setting HSalsa20 initial state - HSalsa20InitialState(state, nonce); - -#if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) - { - Salsa20BaseIntrinsics.HSalsa20(state, subKey); - return; - } -#endif - - // Block function - ShuffleState(state); - - state[1] = state[5]; - state[2] = state[10]; - state[3] = state[15]; - state[4] = state[6]; - state[5] = state[7]; - state[6] = state[8]; - state[7] = state[9]; - - ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); - } + public void HSalsa20(Span subKey, ReadOnlySpan nonce) => salsa20Core.HSalsa20(subKey, nonce); /// /// Sets the initial of the HSalsa20 using the key and the . 
@@ -142,7 +98,7 @@ public void HSalsa20InitialState(Span state, ReadOnlySpan nonce) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - protected static void ShuffleState(Span state) + internal protected static void ShuffleState(Span state) { // 10 loops × 2 rounds/loop = 20 rounds for (var i = 0; i < 10; i++) diff --git a/src/NaCl.Core/Base/Salsa20Core.cs b/src/NaCl.Core/Base/Salsa20Core.cs new file mode 100644 index 0000000..1a7bbfe --- /dev/null +++ b/src/NaCl.Core/Base/Salsa20Core.cs @@ -0,0 +1,75 @@ +namespace NaCl.Core.Base +{ + using System; + using System.Runtime.CompilerServices; + using System.Runtime.InteropServices; + using System.Security.Cryptography; + + using Internal; + + internal class Salsa20Core : ISalsa20Core + { + protected const int KEY_SIZE_IN_INTS = 8; + public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 + protected const int BLOCK_SIZE_IN_INTS = 16; + public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 + + protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") + + private readonly Salsa20Base _salsa20; + + public Salsa20Core(Salsa20Base salsa20) => _salsa20 = salsa20; + + public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); + + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _salsa20.SetInitialState(state, nonce, counter); + + // Create a copy of the state and then run 20 rounds on it, + // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. 
+ Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; + state.CopyTo(workingState); + Salsa20Base.ShuffleState(workingState); + + // At the end of the rounds, add the result to the original state. + for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) + state[i] += workingState[i]; + + ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); + } + + public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => _salsa20.Process(nonce, output, input, offset); + + /// + /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . + /// + /// The subKey. + /// The nonce. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void HSalsa20(Span subKey, ReadOnlySpan nonce) + { + // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 + + Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; + + // Setting HSalsa20 initial state + _salsa20.HSalsa20InitialState(state, nonce); + + // Block function + Salsa20Base.ShuffleState(state); + + state[1] = state[5]; + state[2] = state[10]; + state[3] = state[15]; + state[4] = state[6]; + state[5] = state[7]; + state[6] = state[8]; + state[7] = state[9]; + + ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); + } + } +} \ No newline at end of file diff --git a/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs b/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs new file mode 100644 index 0000000..60db3de --- /dev/null +++ b/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs @@ -0,0 +1,64 @@ +#if INTRINSICS +namespace NaCl.Core.Base +{ + using System; + using System.Runtime.CompilerServices; + using System.Runtime.InteropServices; + using System.Security.Cryptography; + + using Internal; + + internal class Salsa20CoreIntrinsics : ISalsa20Core + { + protected const int KEY_SIZE_IN_INTS = 8; + public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 + protected const int BLOCK_SIZE_IN_INTS = 16; + public const 
int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 + + protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") + private readonly Salsa20Base _salsa20; + + public Salsa20CoreIntrinsics(Salsa20Base salsa20) => _salsa20 = salsa20; + + public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); + + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _salsa20.SetInitialState(state, nonce, counter); + + Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); + } + + public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _salsa20.SetInitialState(state, nonce, _salsa20.InitialCounter); + fixed (uint* x = state) + fixed (byte* m = input, c = output.Slice(offset)) + { + Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); + } + } + + /// + /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . + /// + /// The subKey. + /// The nonce. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void HSalsa20(Span subKey, ReadOnlySpan nonce) + { + // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. 
Specification - Definition of HSalsa20 + + Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; + + // Setting HSalsa20 initial state + _salsa20.HSalsa20InitialState(state, nonce); + + Salsa20BaseIntrinsics.HSalsa20(state, subKey); + } + } +} +#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index e6542b6..dcb679e 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -32,7 +32,7 @@ public abstract class Snuffle protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") protected readonly ReadOnlyMemory Key; - protected readonly int InitialCounter; + internal protected readonly int InitialCounter; /// /// Initializes a new instance of the class. @@ -124,15 +124,15 @@ public void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan nonce, Spa /// The output. /// The input. /// The output's starting offset. 
- private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + internal virtual void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) { -#if INTRINSICS - if (Sse3.IsSupported && BitConverter.IsLittleEndian) - { - ProcessStream(nonce, output, input, InitialCounter, offset); - return; - } -#endif +//#if INTRINSICS +// if (Sse3.IsSupported && BitConverter.IsLittleEndian) +// { +// ProcessStream(nonce, output, input, InitialCounter, offset); +// return; +// } +//#endif var length = input.Length; var numBlocks = (length / BlockSizeInBytes) + 1; diff --git a/src/NaCl.Core/Salsa20.cs b/src/NaCl.Core/Salsa20.cs index e5d97f7..0016401 100644 --- a/src/NaCl.Core/Salsa20.cs +++ b/src/NaCl.Core/Salsa20.cs @@ -23,7 +23,7 @@ public class Salsa20 : Salsa20Base public Salsa20(ReadOnlyMemory key, int initialCounter = 0) : base(key, initialCounter) { } /// - protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + internal protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); diff --git a/src/NaCl.Core/XSalsa20.cs b/src/NaCl.Core/XSalsa20.cs index c17e2c7..c2efbcc 100644 --- a/src/NaCl.Core/XSalsa20.cs +++ b/src/NaCl.Core/XSalsa20.cs @@ -23,7 +23,7 @@ public class XSalsa20 : Salsa20Base public XSalsa20(ReadOnlyMemory key, int initialCounter = 0) : base(key, initialCounter) { } /// - protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + internal protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); From 58d6c25c406ae12e246c9ee4312d138c45f6f867 Mon Sep 17 
00:00:00 2001 From: Timothy Makkison Date: Sun, 16 Oct 2022 12:07:03 +0100 Subject: [PATCH 30/59] Use ChaChaCore or ChaChaIntrinsics instead of pre processor checks --- src/NaCl.Core/Base/ChaCha20Base.cs | 97 ++++++-------------- src/NaCl.Core/Base/ChaCha20Core.cs | 67 ++++++++++++++ src/NaCl.Core/Base/ChaCha20CoreIntrinsics.cs | 53 +++++++++++ src/NaCl.Core/Base/IChaCha20Core.cs | 9 ++ src/NaCl.Core/ChaCha20.cs | 2 +- src/NaCl.Core/XChaCha20.cs | 2 +- 6 files changed, 157 insertions(+), 73 deletions(-) create mode 100644 src/NaCl.Core/Base/ChaCha20Core.cs create mode 100644 src/NaCl.Core/Base/ChaCha20CoreIntrinsics.cs create mode 100644 src/NaCl.Core/Base/IChaCha20Core.cs diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 871cec9..0eba223 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -3,6 +3,9 @@ using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if INTRINSICS + using System.Runtime.Intrinsics.X86; +#endif using System.Security.Cryptography; using Internal; @@ -13,12 +16,28 @@ /// public abstract class ChaCha20Base : Snuffle { + readonly IChaCha20Core _chaCha20Core; + /// /// Initializes a new instance of the class. /// /// The key. /// The initial counter. - protected ChaCha20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { } + protected ChaCha20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) + { +#if INTRINSICS + if (Sse3.IsSupported) + { + _chaCha20Core = new ChaCha20CoreIntrinsics(this); + } + else + { + _chaCha20Core = new ChaCha20Core(this); + } +#else + _chaCha20Core = new ChaCha20Core(this); +#endif + } /// public override int BlockSizeInBytes => BLOCK_SIZE_IN_BYTES; @@ -30,52 +49,13 @@ protected ChaCha20Base(ReadOnlyMemory key, int initialCounter) : base(key, /// The state. /// The nonce. /// The counter. 
- protected abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); + protected internal abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); /// - public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - - // Set the initial state based on https://tools.ietf.org/html/rfc8439#section-2.3 - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - SetInitialState(state, nonce, counter); - -#if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) - { - ChaCha20BaseIntrinsics.ChaCha20KeyStream(state, block); - return; - } -#endif - - // Create a copy of the state and then run 20 rounds on it, - // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. - Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; - state.CopyTo(workingState); - ShuffleState(workingState); + internal override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => _chaCha20Core.Process(nonce, output, input, offset); - // At the end of the rounds, add the result to the original state. 
- for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) - state[i] += workingState[i]; - - ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); - } - -#if INTRINSICS - public override unsafe void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) - { - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - SetInitialState(state, nonce, initialCounter); - - fixed(uint* x = state) - fixed (byte* m = input, c = output.Slice(offset)) - { - ChaCha20BaseIntrinsics.ChaCha20(x, m, c, (ulong)input.Length); - } - } -#endif + /// + public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) => _chaCha20Core.ProcessKeyStreamBlock(nonce, counter, block); /// /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . @@ -83,32 +63,7 @@ public override unsafe void ProcessStream(ReadOnlySpan nonce, Span o /// The subKey. /// The nonce. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HChaCha20(Span subKey, ReadOnlySpan nonce) - { - // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - - // Setting HChaCha20 initial state - HChaCha20InitialState(state, nonce); - -#if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) - { - ChaCha20BaseIntrinsics.HChaCha20(state, subKey); - return; - } -#endif - - // Block function - ShuffleState(state); - - state[4] = state[12]; - state[5] = state[13]; - state[6] = state[14]; - state[7] = state[15]; - - ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); - } + public void HChaCha20(Span subKey, ReadOnlySpan nonce) => _chaCha20Core.HChaCha20(subKey, nonce); /// /// Sets the initial of the HChaCha20 using the key and the . 
@@ -254,7 +209,7 @@ protected static void ShuffleState(ref Array16 state) */ [MethodImpl(MethodImplOptions.AggressiveInlining)] - protected static void ShuffleState(Span state) + protected internal static void ShuffleState(Span state) { // 10 loops × 2 rounds/loop = 20 rounds for (var i = 0; i < 10; i++) diff --git a/src/NaCl.Core/Base/ChaCha20Core.cs b/src/NaCl.Core/Base/ChaCha20Core.cs new file mode 100644 index 0000000..301e556 --- /dev/null +++ b/src/NaCl.Core/Base/ChaCha20Core.cs @@ -0,0 +1,67 @@ +namespace NaCl.Core.Base +{ + using System; + using System.Runtime.CompilerServices; + using System.Runtime.InteropServices; + using System.Security.Cryptography; + + using Internal; + + internal class ChaCha20Core : IChaCha20Core + { + public const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; + public const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; + + private readonly ChaCha20Base _chaCha20; + public ChaCha20Core(ChaCha20Base chaCha20) => _chaCha20 = chaCha20; + + public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); + + // Set the initial state based on https://tools.ietf.org/html/rfc8439#section-2.3 + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _chaCha20.SetInitialState(state, nonce, counter); + + // Create a copy of the state and then run 20 rounds on it, + // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. + Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; + state.CopyTo(workingState); + ChaCha20Base.ShuffleState(state); + + // At the end of the rounds, add the result to the original state. 
+ for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) + state[i] += workingState[i]; + + ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); + } + + public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => _chaCha20.Process(nonce, output, input, offset); + + /// + /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . + /// + /// The subKey. + /// The nonce. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void HChaCha20(Span subKey, ReadOnlySpan nonce) + { + // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + + // Setting HChaCha20 initial state + _chaCha20.HChaCha20InitialState(state, nonce); + + // Block function + ChaCha20Base.ShuffleState(state); + + state[4] = state[12]; + state[5] = state[13]; + state[6] = state[14]; + state[7] = state[15]; + + ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); + } + } +} diff --git a/src/NaCl.Core/Base/ChaCha20CoreIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20CoreIntrinsics.cs new file mode 100644 index 0000000..b8be67b --- /dev/null +++ b/src/NaCl.Core/Base/ChaCha20CoreIntrinsics.cs @@ -0,0 +1,53 @@ +#if INTRINSICS +namespace NaCl.Core.Base; + +using System; +using System.Runtime.CompilerServices; +using System.Security.Cryptography; + + +internal class ChaCha20CoreIntrinsics : IChaCha20Core +{ + const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; + const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; + + private readonly ChaCha20Base _chaCha20; + public ChaCha20CoreIntrinsics(ChaCha20Base chaCha20Base) => _chaCha20=chaCha20Base; + + public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. 
The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); + + // Set the initial state based on https://tools.ietf.org/html/rfc8439#section-2.3 + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _chaCha20.SetInitialState(state, nonce, counter); + + ChaCha20BaseIntrinsics.ChaCha20KeyStream(state, block); + } + + public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _chaCha20.SetInitialState(state, nonce, _chaCha20.InitialCounter); + + fixed(uint* x = state) + fixed (byte* m = input, c = output.Slice(offset)) + { + ChaCha20BaseIntrinsics.ChaCha20(x, m, c, (ulong)input.Length); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void HChaCha20(Span subKey, ReadOnlySpan nonce) + { + // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + + // Setting HChaCha20 initial state + _chaCha20.HChaCha20InitialState(state, nonce); + + ChaCha20BaseIntrinsics.HChaCha20(state, subKey); + } +} +#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/IChaCha20Core.cs b/src/NaCl.Core/Base/IChaCha20Core.cs new file mode 100644 index 0000000..e0432e8 --- /dev/null +++ b/src/NaCl.Core/Base/IChaCha20Core.cs @@ -0,0 +1,9 @@ +using System; + +namespace NaCl.Core.Base; +internal interface IChaCha20Core +{ + void HChaCha20(Span subKey, ReadOnlySpan nonce); + void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0); + void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); +} \ No newline at end of file diff --git a/src/NaCl.Core/ChaCha20.cs b/src/NaCl.Core/ChaCha20.cs index 88c8303..1427929 100644 --- a/src/NaCl.Core/ChaCha20.cs +++ b/src/NaCl.Core/ChaCha20.cs @@ -26,7 +26,7 @@ public class ChaCha20 : ChaCha20Base public ChaCha20(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { } /// - protected override void 
SetInitialState(Span state, ReadOnlySpan nonce, int counter) + protected internal override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); diff --git a/src/NaCl.Core/XChaCha20.cs b/src/NaCl.Core/XChaCha20.cs index 4924f7c..49dd29a 100644 --- a/src/NaCl.Core/XChaCha20.cs +++ b/src/NaCl.Core/XChaCha20.cs @@ -24,7 +24,7 @@ public class XChaCha20 : ChaCha20Base public XChaCha20(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { } /// - protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + protected internal override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); From 24b33573a34f35e2d45b54f6a1af5a18b1b56a56 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sun, 16 Oct 2022 12:07:20 +0100 Subject: [PATCH 31/59] Delete Snuffle pre processor checks --- src/NaCl.Core/Base/Salsa20Base.cs | 6 +- src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs | 91 ++++++++++----------- src/NaCl.Core/Base/Snuffle.cs | 30 ++----- 3 files changed, 55 insertions(+), 72 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 4260a6f..7c04504 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -38,10 +38,6 @@ protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, #endif } -#if INTRINSICS - public override void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) => throw new NotImplementedException(); -#endif - /// public override int BlockSizeInBytes => BLOCK_SIZE_IN_BYTES; @@ -55,7 +51,7 @@ protected 
Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, internal protected abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); /// - internal protected override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => salsa20Core.Process(nonce, output, input, offset); + internal override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => salsa20Core.Process(nonce, output, input, offset); /// public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) => salsa20Core.ProcessKeyStreamBlock(nonce, counter, block); diff --git a/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs b/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs index 60db3de..9babd92 100644 --- a/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs @@ -1,64 +1,63 @@ #if INTRINSICS -namespace NaCl.Core.Base -{ - using System; - using System.Runtime.CompilerServices; - using System.Runtime.InteropServices; - using System.Security.Cryptography; +namespace NaCl.Core.Base; - using Internal; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Security.Cryptography; - internal class Salsa20CoreIntrinsics : ISalsa20Core - { - protected const int KEY_SIZE_IN_INTS = 8; - public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 - protected const int BLOCK_SIZE_IN_INTS = 16; - public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 +using Internal; - protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") - private readonly Salsa20Base _salsa20; +internal class Salsa20CoreIntrinsics : ISalsa20Core +{ + protected const int KEY_SIZE_IN_INTS = 8; + public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 + protected const int BLOCK_SIZE_IN_INTS = 16; + public const int 
BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 - public Salsa20CoreIntrinsics(Salsa20Base salsa20) => _salsa20 = salsa20; + protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") + private readonly Salsa20Base _salsa20; - public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); + public Salsa20CoreIntrinsics(Salsa20Base salsa20) => _salsa20 = salsa20; - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _salsa20.SetInitialState(state, nonce, counter); + public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); - } + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _salsa20.SetInitialState(state, nonce, counter); - public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); + } + + public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _salsa20.SetInitialState(state, nonce, _salsa20.InitialCounter); + fixed (uint* x = state) + fixed (byte* m = input, c = output.Slice(offset)) { - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _salsa20.SetInitialState(state, nonce, _salsa20.InitialCounter); - fixed (uint* x = state) - fixed (byte* m = input, c = output.Slice(offset)) - { - Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); - } + Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); } 
+ } - /// - /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . - /// - /// The subKey. - /// The nonce. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HSalsa20(Span subKey, ReadOnlySpan nonce) - { - // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 + /// + /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . + /// + /// The subKey. + /// The nonce. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void HSalsa20(Span subKey, ReadOnlySpan nonce) + { + // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; + Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; - // Setting HSalsa20 initial state - _salsa20.HSalsa20InitialState(state, nonce); + // Setting HSalsa20 initial state + _salsa20.HSalsa20InitialState(state, nonce); - Salsa20BaseIntrinsics.HSalsa20(state, subKey); - } + Salsa20BaseIntrinsics.HSalsa20(state, subKey); } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index dcb679e..7301057 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -26,7 +26,7 @@ public abstract class Snuffle { protected const int KEY_SIZE_IN_INTS = 8; public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 - protected const int BLOCK_SIZE_IN_INTS = 16; + protected internal const int BLOCK_SIZE_IN_INTS = 16; public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") @@ -62,10 +62,6 @@ protected Snuffle(ReadOnlyMemory key, int initialCounter) /// ByteBuffer. 
public abstract void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); -#if INTRINSICS - public abstract void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0); -#endif - /// /// The size of the nonce in bytes. /// Salsa20 uses a 8-byte (64-bit) nonce, ChaCha20 uses a 12-byte (96-bit) nonce, but XSalsa20 and XChaCha20 use a 24-byte (192-bit) nonce. @@ -126,14 +122,6 @@ public void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan nonce, Spa /// The output's starting offset. internal virtual void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) { -//#if INTRINSICS -// if (Sse3.IsSupported && BitConverter.IsLittleEndian) -// { -// ProcessStream(nonce, output, input, InitialCounter, offset); -// return; -// } -//#endif - var length = input.Length; var numBlocks = (length / BlockSizeInBytes) + 1; @@ -168,14 +156,14 @@ internal virtual void Process(ReadOnlySpan nonce, Span output, ReadO } } - /// - /// Formats the nonce length exception message. - /// - /// The crypto primitive name. - /// The actual nonce length. - /// The expected nonce length. - /// System.String. - internal static string FormatNonceLengthExceptionMessage(string name, int actual, int expected) => $"{name} uses {expected * 8}-bit nonces, but got a {actual * 8}-bit nonce. The nonce length in bytes must be {expected}."; + /// + /// Formats the nonce length exception message. + /// + /// The crypto primitive name. + /// The actual nonce length. + /// The expected nonce length. + /// System.String. + internal static string FormatNonceLengthExceptionMessage(string name, int actual, int expected) => $"{name} uses {expected * 8}-bit nonces, but got a {actual * 8}-bit nonce. The nonce length in bytes must be {expected}."; /// /// XOR the specified output. 
From 8c88ea5c3c1c4d77f2ed49f5116685715f0ac6e5 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sun, 16 Oct 2022 12:07:20 +0100 Subject: [PATCH 32/59] Delete Snuffle pre processor checks --- src/NaCl.Core/Base/Salsa20Base.cs | 20 ++--- src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs | 91 ++++++++++----------- src/NaCl.Core/Base/Snuffle.cs | 30 ++----- 3 files changed, 61 insertions(+), 80 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 4260a6f..c3a5957 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -2,8 +2,6 @@ { using System; using System.Runtime.CompilerServices; - using System.Runtime.InteropServices; - using System.Security.Cryptography; #if INTRINSICS using System.Runtime.Intrinsics.X86; #endif @@ -15,7 +13,7 @@ /// public abstract class Salsa20Base : Snuffle { - readonly ISalsa20Core salsa20Core; + readonly ISalsa20Core _salsa20Core; /// /// Initializes a new instance of the class. @@ -27,21 +25,17 @@ protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, #if INTRINSICS if (Sse3.IsSupported) { - salsa20Core = new Salsa20CoreIntrinsics(this); + _salsa20Core = new Salsa20CoreIntrinsics(this); } else { - salsa20Core = new Salsa20Core(this); + _salsa20Core = new Salsa20Core(this); } #else - salsa20Core = new Salsa20Core(this); + _salsa20Core = new Salsa20Core(this); #endif } -#if INTRINSICS - public override void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) => throw new NotImplementedException(); -#endif - /// public override int BlockSizeInBytes => BLOCK_SIZE_IN_BYTES; @@ -55,10 +49,10 @@ protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, internal protected abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); /// - internal protected override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => 
salsa20Core.Process(nonce, output, input, offset); + internal override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => _salsa20Core.Process(nonce, output, input, offset); /// - public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) => salsa20Core.ProcessKeyStreamBlock(nonce, counter, block); + public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) => _salsa20Core.ProcessKeyStreamBlock(nonce, counter, block); /// /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . @@ -66,7 +60,7 @@ protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, /// The subKey. /// The nonce. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HSalsa20(Span subKey, ReadOnlySpan nonce) => salsa20Core.HSalsa20(subKey, nonce); + public void HSalsa20(Span subKey, ReadOnlySpan nonce) => _salsa20Core.HSalsa20(subKey, nonce); /// /// Sets the initial of the HSalsa20 using the key and the . 
diff --git a/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs b/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs index 60db3de..9babd92 100644 --- a/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20CoreIntrinsics.cs @@ -1,64 +1,63 @@ #if INTRINSICS -namespace NaCl.Core.Base -{ - using System; - using System.Runtime.CompilerServices; - using System.Runtime.InteropServices; - using System.Security.Cryptography; +namespace NaCl.Core.Base; - using Internal; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Security.Cryptography; - internal class Salsa20CoreIntrinsics : ISalsa20Core - { - protected const int KEY_SIZE_IN_INTS = 8; - public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 - protected const int BLOCK_SIZE_IN_INTS = 16; - public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 +using Internal; - protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") - private readonly Salsa20Base _salsa20; +internal class Salsa20CoreIntrinsics : ISalsa20Core +{ + protected const int KEY_SIZE_IN_INTS = 8; + public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 + protected const int BLOCK_SIZE_IN_INTS = 16; + public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 - public Salsa20CoreIntrinsics(Salsa20Base salsa20) => _salsa20 = salsa20; + protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") + private readonly Salsa20Base _salsa20; - public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. 
The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); + public Salsa20CoreIntrinsics(Salsa20Base salsa20) => _salsa20 = salsa20; - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _salsa20.SetInitialState(state, nonce, counter); + public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); - } + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _salsa20.SetInitialState(state, nonce, counter); - public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); + } + + public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _salsa20.SetInitialState(state, nonce, _salsa20.InitialCounter); + fixed (uint* x = state) + fixed (byte* m = input, c = output.Slice(offset)) { - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _salsa20.SetInitialState(state, nonce, _salsa20.InitialCounter); - fixed (uint* x = state) - fixed (byte* m = input, c = output.Slice(offset)) - { - Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); - } + Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); } + } - /// - /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . - /// - /// The subKey. - /// The nonce. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HSalsa20(Span subKey, ReadOnlySpan nonce) - { - // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 + /// + /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . + /// + /// The subKey. + /// The nonce. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void HSalsa20(Span subKey, ReadOnlySpan nonce) + { + // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; + Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; - // Setting HSalsa20 initial state - _salsa20.HSalsa20InitialState(state, nonce); + // Setting HSalsa20 initial state + _salsa20.HSalsa20InitialState(state, nonce); - Salsa20BaseIntrinsics.HSalsa20(state, subKey); - } + Salsa20BaseIntrinsics.HSalsa20(state, subKey); } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index dcb679e..7301057 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -26,7 +26,7 @@ public abstract class Snuffle { protected const int KEY_SIZE_IN_INTS = 8; public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 - protected const int BLOCK_SIZE_IN_INTS = 16; + protected internal const int BLOCK_SIZE_IN_INTS = 16; public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") @@ -62,10 +62,6 @@ protected Snuffle(ReadOnlyMemory key, int initialCounter) /// ByteBuffer. public abstract void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); -#if INTRINSICS - public abstract void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0); -#endif - /// /// The size of the nonce in bytes. /// Salsa20 uses a 8-byte (64-bit) nonce, ChaCha20 uses a 12-byte (96-bit) nonce, but XSalsa20 and XChaCha20 use a 24-byte (192-bit) nonce. @@ -126,14 +122,6 @@ public void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan nonce, Spa /// The output's starting offset. 
internal virtual void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) { -//#if INTRINSICS -// if (Sse3.IsSupported && BitConverter.IsLittleEndian) -// { -// ProcessStream(nonce, output, input, InitialCounter, offset); -// return; -// } -//#endif - var length = input.Length; var numBlocks = (length / BlockSizeInBytes) + 1; @@ -168,14 +156,14 @@ internal virtual void Process(ReadOnlySpan nonce, Span output, ReadO } } - /// - /// Formats the nonce length exception message. - /// - /// The crypto primitive name. - /// The actual nonce length. - /// The expected nonce length. - /// System.String. - internal static string FormatNonceLengthExceptionMessage(string name, int actual, int expected) => $"{name} uses {expected * 8}-bit nonces, but got a {actual * 8}-bit nonce. The nonce length in bytes must be {expected}."; + /// + /// Formats the nonce length exception message. + /// + /// The crypto primitive name. + /// The actual nonce length. + /// The expected nonce length. + /// System.String. + internal static string FormatNonceLengthExceptionMessage(string name, int actual, int expected) => $"{name} uses {expected * 8}-bit nonces, but got a {actual * 8}-bit nonce. The nonce length in bytes must be {expected}."; /// /// XOR the specified output. 
From 9270136f5b434e297e16da3fe2501fb054e0eb3f Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sun, 16 Oct 2022 18:44:25 +0100 Subject: [PATCH 33/59] Fix process stackoverflow bug --- src/NaCl.Core/Base/ChaCha20Core.cs | 66 +++++++++++++++++++++++++++++- src/NaCl.Core/Base/Salsa20Core.cs | 50 +++++++++++++++++++++- 2 files changed, 114 insertions(+), 2 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Core.cs b/src/NaCl.Core/Base/ChaCha20Core.cs index 301e556..ebf3308 100644 --- a/src/NaCl.Core/Base/ChaCha20Core.cs +++ b/src/NaCl.Core/Base/ChaCha20Core.cs @@ -1,6 +1,7 @@ namespace NaCl.Core.Base { using System; + using System.Buffers; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Security.Cryptography; @@ -37,8 +38,71 @@ public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span nonce, Span output, ReadOnlySpan input, int offset = 0) => _chaCha20.Process(nonce, output, input, offset); + /// + /// Processes the Encryption/Decryption function. + /// + /// The nonce. + /// The output. + /// The input. + /// The output's starting offset. + public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + { + var blockSizeInBytes = _chaCha20.BlockSizeInBytes; + var length = input.Length; + var numBlocks = (length / blockSizeInBytes) + 1; + + /* + * Allocates 64 bytes more than below impl as per the benchmarks... 
+ * + var block = new byte[BLOCK_SIZE_IN_BYTES]; + for (var i = 0; i < numBlocks; i++) + { + ProcessKeyStreamBlock(nonce, i + InitialCounter, block); + + if (i == numBlocks - 1) + Xor(output, input, block, length % BLOCK_SIZE_IN_BYTES, offset, i); // last block + else + Xor(output, input, block, BLOCK_SIZE_IN_BYTES, offset, i); + + CryptoBytes.Wipe(block); // Array.Clear(block, 0, block.Length); + } + */ + + using var owner = MemoryPool.Shared.Rent(blockSizeInBytes); + for (var i = 0; i < numBlocks; i++) + { + ProcessKeyStreamBlock(nonce, i + _chaCha20.InitialCounter, owner.Memory.Span); + + if (i == numBlocks - 1) + Xor(output, input, owner.Memory.Span, length % blockSizeInBytes, offset, i); // last block + else + Xor(output, input, owner.Memory.Span, blockSizeInBytes, offset, i); + + owner.Memory.Span.Clear(); + } + } + /// + /// XOR the specified output. + /// + /// The output. + /// The input. + /// The key stream block. + /// The length. + /// The output's starting offset. + /// The current block number. + /// The combination of blocks, offsets and length to be XORed is out-of-bonds. + private void Xor(Span output, ReadOnlySpan input, ReadOnlySpan block, int len, int offset, int curBlock) + { + var blockOffset = curBlock * _chaCha20.BlockSizeInBytes; + + // Since is not called directly from outside, there's no need to check + //if (len < 0 || offset < 0 || curBlock < 0 || output.Length < len || (input.Length - blockOffset) < len || block.Length < len) + // throw new CryptographicException("The combination of blocks, offsets and length to be XORed is out-of-bonds."); + + for (var i = 0; i < len; i++) + output[i + offset + blockOffset] = (byte)(input[i + blockOffset] ^ block[i]); + } /// /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . 
/// diff --git a/src/NaCl.Core/Base/Salsa20Core.cs b/src/NaCl.Core/Base/Salsa20Core.cs index 1a7bbfe..17b335c 100644 --- a/src/NaCl.Core/Base/Salsa20Core.cs +++ b/src/NaCl.Core/Base/Salsa20Core.cs @@ -1,6 +1,7 @@ namespace NaCl.Core.Base { using System; + using System.Buffers; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Security.Cryptography; @@ -41,7 +42,54 @@ public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span nonce, Span output, ReadOnlySpan input, int offset = 0) => _salsa20.Process(nonce, output, input, offset); + public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + { + var blockSizeInBytes = _salsa20.BlockSizeInBytes; + var length = input.Length; + var numBlocks = (length / blockSizeInBytes) + 1; + + /* + * Allocates 64 bytes more than below impl as per the benchmarks... + * + var block = new byte[BLOCK_SIZE_IN_BYTES]; + for (var i = 0; i < numBlocks; i++) + { + ProcessKeyStreamBlock(nonce, i + InitialCounter, block); + + if (i == numBlocks - 1) + Xor(output, input, block, length % BLOCK_SIZE_IN_BYTES, offset, i); // last block + else + Xor(output, input, block, BLOCK_SIZE_IN_BYTES, offset, i); + + CryptoBytes.Wipe(block); // Array.Clear(block, 0, block.Length); + } + */ + + using var owner = MemoryPool.Shared.Rent(blockSizeInBytes); + for (var i = 0; i < numBlocks; i++) + { + ProcessKeyStreamBlock(nonce, i + _salsa20.InitialCounter, owner.Memory.Span); + + if (i == numBlocks - 1) + Xor(output, input, owner.Memory.Span, length % blockSizeInBytes, offset, i); // last block + else + Xor(output, input, owner.Memory.Span, blockSizeInBytes, offset, i); + + owner.Memory.Span.Clear(); + } + } + + private void Xor(Span output, ReadOnlySpan input, ReadOnlySpan block, int len, int offset, int curBlock) + { + var blockOffset = curBlock * _salsa20.BlockSizeInBytes; + + // Since is not called directly from outside, there's no need to check + //if (len < 0 || offset < 0 
|| curBlock < 0 || output.Length < len || (input.Length - blockOffset) < len || block.Length < len) + // throw new CryptographicException("The combination of blocks, offsets and length to be XORed is out-of-bonds."); + + for (var i = 0; i < len; i++) + output[i + offset + blockOffset] = (byte)(input[i + blockOffset] ^ block[i]); + } /// /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . From b46ca3445b2f1dd28befe3fb497f6777f4833329 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 17 Oct 2022 01:04:21 +0100 Subject: [PATCH 34/59] Fix incorrect system checks --- src/NaCl.Core/Base/ChaCha20Base.cs | 4 ++-- src/NaCl.Core/Base/Salsa20Base.cs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 13fd1a2..9ff3385 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -43,7 +43,7 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter SetInitialState(state, nonce, counter); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { ChaCha20BaseIntrinsics.ChaCha20KeyStream(state, block); return; @@ -93,7 +93,7 @@ public void HChaCha20(Span subKey, ReadOnlySpan nonce) HChaCha20InitialState(state, nonce); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { ChaCha20BaseIntrinsics.HChaCha20(state, subKey); return; diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index bccd6ef..9dc22db 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -42,7 +42,7 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int 
counter SetInitialState(state, nonce, counter); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); return; @@ -91,7 +91,7 @@ public void HSalsa20(Span subKey, ReadOnlySpan nonce) HSalsa20InitialState(state, nonce); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { Salsa20BaseIntrinsics.HSalsa20(state, subKey); return; From b340bc47ae6e3832678a3a233debb2bb714c3d2a Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 17 Oct 2022 01:04:21 +0100 Subject: [PATCH 35/59] Fix incorrect system checks --- src/NaCl.Core/Base/ChaCha20Base.cs | 4 ++-- src/NaCl.Core/Base/Salsa20Base.cs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 13fd1a2..9ff3385 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -43,7 +43,7 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter SetInitialState(state, nonce, counter); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { ChaCha20BaseIntrinsics.ChaCha20KeyStream(state, block); return; @@ -93,7 +93,7 @@ public void HChaCha20(Span subKey, ReadOnlySpan nonce) HChaCha20InitialState(state, nonce); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { ChaCha20BaseIntrinsics.HChaCha20(state, subKey); return; diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs 
index bccd6ef..8cd2297 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -42,7 +42,7 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter SetInitialState(state, nonce, counter); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); return; @@ -85,13 +85,13 @@ public void HSalsa20(Span subKey, ReadOnlySpan nonce) { // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; // Setting HSalsa20 initial state HSalsa20InitialState(state, nonce); - + #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported || !BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Avx2.IsSupported && BitConverter.IsLittleEndian) { Salsa20BaseIntrinsics.HSalsa20(state, subKey); return; From f3605ba5de5213c3acf3e95397afe97b14bcecaf Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 17 Oct 2022 17:38:18 +0100 Subject: [PATCH 36/59] Rewrite HSalsa to use only Sse2 --- src/NaCl.Core/Base/Salsa20Base.cs | 2 +- src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 38 ++++++++++++++----- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 8cd2297..7916b6f 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -91,7 +91,7 @@ public void HSalsa20(Span subKey, ReadOnlySpan nonce) HSalsa20InitialState(state, nonce); #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Avx2.IsSupported && BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse2.IsSupported && BitConverter.IsLittleEndian) { Salsa20BaseIntrinsics.HSalsa20(state, subKey); return; diff --git 
a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index b249c78..a3263c8 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -1,8 +1,8 @@ #if INTRINSICS +using System; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; -using System; +using System.Runtime.Intrinsics.X86; namespace NaCl.Core.Base.SalsaIntrinsics; @@ -106,17 +106,16 @@ public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // HSalsa returns a 32 byte array of 0,5,10,15,6,7,8,9 + // <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15> - Vector128 t_0 = Avx2.Blend(x_0, x_1, 0b_00_00_00_10); - Vector128 t_1 = Avx2.Blend(x_2, x_3, 0b_00_00_10_00); - Vector128 t_2 = Avx2.Blend(t_0, t_1, 0b_00_00_11_00); + var t_0 = Diagonal(x_0, x_1, x_2, x_3); - // Get <8, 9, 6, 7> then shuffle to <6, 7, 8, 9> - Vector128 t_3 = Avx2.Blend(x_1, x_2, 0b_00_00_00_11); - t_3 = Sse2.Shuffle(t_3, 0b_01_00_11_10); + // Get <4, 5, 6, 7> & <8, 9, 10, 11> then unpack halves for <6, 7, 8, 9> + var t_1 = UnpackHighLow(x_1, x_2); - Sse2.Store(sk, Vector128.AsByte(t_2)); - Sse2.Store(sk + 16, Vector128.AsByte(t_3)); + Sse2.Store(sk, Vector128.AsByte(t_0)); + Sse2.Store(sk + 16, Vector128.AsByte(t_1)); } } @@ -196,6 +195,25 @@ private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 UnpackHighLow(Vector128 a, Vector128 b) + { + var w_0 = Sse2.UnpackHigh(a.AsUInt64(), b.AsUInt64()); + return Sse2.UnpackLow(w_0, b.AsUInt64()).AsUInt32(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 Diagonal(Vector128 a, Vector128 b, Vector128 c, Vector128 d) + { + var w_0 = 
Sse2.UnpackLow(a, b); + var w_1 = Sse2.UnpackHigh(c, d); + + var t_0 = Sse2.Shuffle(w_0, 0b_00_00_11_00).AsUInt64(); + var t_1 = Sse2.Shuffle(w_1, 0b_00_00_11_00).AsUInt64(); + + return Sse2.UnpackLow(t_0, t_1).AsUInt32(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void Transpose(ref Vector128 a, ref Vector128 b, ref Vector128 c, ref Vector128 d) { From 8db68e14376e2025f24f228af54d22dd91258f46 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 17 Oct 2022 17:59:25 +0100 Subject: [PATCH 37/59] Minor refactoring, add comments --- src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs | 4 ++++ src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs | 3 +++ src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs | 1 + src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs | 3 +++ src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs | 12 ++++++++++++ src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 6 +++--- 6 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs index c5d7022..00e389d 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs @@ -97,6 +97,7 @@ public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong b Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13); Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14); Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15); + Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15); Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12); Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13); @@ -130,14 +131,17 @@ private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B = Sse2.Add(x_B, origB); x_C = Sse2.Add(x_C, origC); x_D = Sse2.Add(x_D, origD); + t_A = Sse2.UnpackLow(x_A, x_B); t_B = Sse2.UnpackLow(x_C, x_D); t_C = Sse2.UnpackHigh(x_A, x_B); t_D = Sse2.UnpackHigh(x_C, x_D); + x_A = 
Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); Sse2.Store(c, t0.AsByte()); t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32(); diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs index d4ab373..52d212c 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs @@ -178,14 +178,17 @@ private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, Vector256Line1(ref A2, ref B2, ref C2, ref D2); Vector256Line1(ref A3, ref B3, ref C3, ref D3); Vector256Line1(ref A4, ref B4, ref C4, ref D4); + Vector256Line2(ref A1, ref B1, ref C1, ref D1); Vector256Line2(ref A2, ref B2, ref C2, ref D2); Vector256Line2(ref A3, ref B3, ref C3, ref D3); Vector256Line2(ref A4, ref B4, ref C4, ref D4); + Vector256Line3(ref A1, ref B1, ref C1, ref D1); Vector256Line3(ref A2, ref B2, ref C2, ref D2); Vector256Line3(ref A3, ref B3, ref C3, ref D3); Vector256Line3(ref A4, ref B4, ref C4, ref D4); + Vector256Line4(ref A1, ref B1, ref C1, ref D1); Vector256Line4(ref A2, ref B2, ref C2, ref D2); Vector256Line4(ref A3, ref B3, ref C3, ref D3); diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index 7121038..6458ded 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -41,6 +41,7 @@ public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong Sse2.Store(c + 32, x_2.AsByte()); Sse2.Store(c + 48, x_3.AsByte()); + // Increment 64 bit counter for the original state. 
uint in12 = x[12]; uint in13 = x[13]; in12++; diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs index 5ecdfac..b7da377 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs @@ -143,14 +143,17 @@ private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B = Sse2.Add(x_B, origB); x_C = Sse2.Add(x_C, origC); x_D = Sse2.Add(x_D, origD); + t_A = Sse2.UnpackLow(x_A, x_B); t_B = Sse2.UnpackLow(x_C, x_D); t_C = Sse2.UnpackHigh(x_A, x_B); t_D = Sse2.UnpackHigh(x_C, x_D); + x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); Sse2.Store(c, t0.AsByte()); t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32(); diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs index 2fea411..7813da7 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs @@ -6,6 +6,7 @@ namespace NaCl.Core.Base.SalsaIntrinsics; #pragma warning disable IDE0007 // Use implicit type +#pragma warning disable IDE0022 // Use expression body for methods internal static class Salsa512 { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -66,6 +67,7 @@ public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong b x_14 = orig14; x_15 = orig15; + // Calculate the next 8 counter values. 
uint in8 = x[8]; uint in9 = x[9]; ulong in89 = in8 | ((ulong)in9 << 32); @@ -105,6 +107,7 @@ public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong b t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); + t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); @@ -113,6 +116,7 @@ public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong b t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_0.AsByte()); Avx.Store(c + 64, t_1.AsByte()); Avx.Store(c + 128, t_2.AsByte()); @@ -137,6 +141,7 @@ public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong b t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); t_15 = Avx2.Permute2x128(x_11, x_15, 0x31); + t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); @@ -145,6 +150,7 @@ public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong b t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_8.AsByte()); Avx.Store(c + 64, t_9.AsByte()); Avx.Store(c + 128, t_10.AsByte()); @@ -172,14 +178,17 @@ private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, Vector256Line1(ref A2, ref B2, ref C2, ref D2); Vector256Line1(ref A3, ref B3, ref C3, ref D3); Vector256Line1(ref A4, ref B4, ref C4, ref D4); + Vector256Line2(ref A1, ref B1, ref C1, ref D1); Vector256Line2(ref A2, ref B2, ref C2, ref D2); Vector256Line2(ref A3, 
ref B3, ref C3, ref D3); Vector256Line2(ref A4, ref B4, ref C4, ref D4); + Vector256Line3(ref A1, ref B1, ref C1, ref D1); Vector256Line3(ref A2, ref B2, ref C2, ref D2); Vector256Line3(ref A3, ref B3, ref C3, ref D3); Vector256Line3(ref A4, ref B4, ref C4, ref D4); + Vector256Line4(ref A1, ref B1, ref C1, ref D1); Vector256Line4(ref A2, ref B2, ref C2, ref D2); Vector256Line4(ref A3, ref B3, ref C3, ref D3); @@ -217,10 +226,12 @@ private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x x_B = Avx2.Add(x_B, orig_B); x_C = Avx2.Add(x_C, orig_C); x_D = Avx2.Add(x_D, orig_D); + t_A = Avx2.UnpackLow(x_A, x_B); t_B = Avx2.UnpackLow(x_C, x_D); t_C = Avx2.UnpackHigh(x_A, x_B); t_D = Avx2.UnpackHigh(x_C, x_D); + x_A = Avx2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); x_B = Avx2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); x_C = Avx2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); @@ -228,4 +239,5 @@ private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x } } #pragma warning restore IDE0007 // Use implicit type +#pragma warning restore IDE0022 // Use expression body for methods #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index a3263c8..21d6b23 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -109,7 +109,7 @@ public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) // HSalsa returns a 32 byte array of 0,5,10,15,6,7,8,9 // <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15> - var t_0 = Diagonal(x_0, x_1, x_2, x_3); + var t_0 = GetDiagonal(x_0, x_1, x_2, x_3); // Get <4, 5, 6, 7> & <8, 9, 10, 11> then unpack halves for <6, 7, 8, 9> var t_1 = UnpackHighLow(x_1, x_2); @@ -196,14 +196,14 @@ private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), 
Sse2.ShiftRightLogical(a, (byte)(32 - imm))); [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 UnpackHighLow(Vector128 a, Vector128 b) + private static Vector128 UnpackHighLow(Vector128 a, Vector128 b) { var w_0 = Sse2.UnpackHigh(a.AsUInt64(), b.AsUInt64()); return Sse2.UnpackLow(w_0, b.AsUInt64()).AsUInt32(); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 Diagonal(Vector128 a, Vector128 b, Vector128 c, Vector128 d) + private static Vector128 GetDiagonal(Vector128 a, Vector128 b, Vector128 c, Vector128 d) { var w_0 = Sse2.UnpackLow(a, b); var w_1 = Sse2.UnpackHigh(c, d); From 202ff15f1fc0fa74e842cbe384581e490f1f42c1 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 17 Oct 2022 18:55:14 +0100 Subject: [PATCH 38/59] Refactor BaseIntrinsics to use pointers --- src/NaCl.Core/Base/ChaCha20Base.cs | 9 +-- src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 49 ++++++++---- .../Base/ChaChaIntrinsics/ChaCha64.cs | 68 +++++++--------- src/NaCl.Core/Base/Salsa20Base.cs | 8 +- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 49 ++++++++---- src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 78 +++++++++---------- 6 files changed, 135 insertions(+), 126 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 9ff3385..31f90d0 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -68,12 +68,9 @@ public override unsafe void ProcessStream(ReadOnlySpan nonce, Span o { Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; SetInitialState(state, nonce, initialCounter); - - fixed(uint* x = state) - fixed (byte* m = input, c = output.Slice(offset)) - { - ChaCha20BaseIntrinsics.ChaCha20(x, m, c, (ulong)input.Length); - } + var c = output.Slice(offset); + + ChaCha20BaseIntrinsics.ChaCha20(state, input, c, (ulong)input.Length); } #endif diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index 3ce192b..75f841d 
100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -10,26 +10,33 @@ namespace NaCl.Core.Base; public static class ChaCha20BaseIntrinsics { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) + public static unsafe void ChaCha20(Span state, ReadOnlySpan input, Span output, ulong bytes) { if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - if (Avx2.IsSupported && bytes >= 512) + fixed (uint* x = state) + fixed (byte* m_p = input, c_p = output) { - ChaCha512.Process(x, ref m, ref c, ref bytes); - } - if (bytes >= 256) - { - ChaCha256.Process(x, ref m, ref c, ref bytes); - } - while (bytes >= 64) - { - ChaCha64.Process64(x, ref m, ref c, ref bytes); - } - if (bytes > 0) - { - ChaCha64.ProcessVarLength(x, ref m, ref c, ref bytes); + var m = m_p; + var c = c_p; + + if (Avx2.IsSupported && bytes >= 512) + { + ChaCha512.Process(x, ref m, ref c, ref bytes); + } + if (bytes >= 256) + { + ChaCha256.Process(x, ref m, ref c, ref bytes); + } + while (bytes >= 64) + { + ChaCha64.Process64(x, ref m, ref c, ref bytes); + } + if (bytes > 0) + { + ChaCha64.ProcessVarLength(x, ref m, ref c, ref bytes); + } } } @@ -39,7 +46,11 @@ public static unsafe void HChaCha20(ReadOnlySpan state, Span subKey) if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - ChaCha64.HChaCha20(state, subKey); + fixed (uint* x = state) + fixed (byte* sk = subKey) + { + ChaCha64.HChaCha20(x, sk); + } } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -48,7 +59,11 @@ public static unsafe void ChaCha20KeyStream(ReadOnlySpan state, Span if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - ChaCha64.KeyStream64(state, output); + fixed 
(byte* c = output) + fixed (uint* x = state) + { + ChaCha64.KeyStream64(x, c); + } } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index 6458ded..f565fc1 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -94,51 +94,43 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void HChaCha20(ReadOnlySpan state, Span subKey) + public static unsafe void HChaCha20(uint* x, byte* sk) { - fixed(uint* x = state) - fixed(byte* sk = subKey) - { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); - ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); - Sse2.Store(sk, Vector128.AsByte(x_0)); - Sse2.Store(sk + 16, Vector128.AsByte(x_3)); - } + Sse2.Store(sk, Vector128.AsByte(x_0)); + Sse2.Store(sk + 16, Vector128.AsByte(x_3)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void KeyStream64(ReadOnlySpan state, Span output) + public static unsafe void KeyStream64(uint* x, byte* c) { - fixed (byte* k = output) - fixed (uint* x = state) - { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); - - Vector128 orig_0 = x_0; - Vector128 orig_1 = x_1; - Vector128 orig_2 = x_2; - Vector128 orig_3 = x_3; - - ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); - - x_0 = Sse2.Add(x_0, orig_0); - x_1 = Sse2.Add(x_1, orig_1); - x_2 = 
Sse2.Add(x_2, orig_2); - x_3 = Sse2.Add(x_3, orig_3); - - Sse2.Store(k, x_0.AsByte()); - Sse2.Store(k + 16, x_1.AsByte()); - Sse2.Store(k + 32, x_2.AsByte()); - Sse2.Store(k + 48, x_3.AsByte()); - } + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + Sse2.Store(c, x_0.AsByte()); + Sse2.Store(c + 16, x_1.AsByte()); + Sse2.Store(c + 32, x_2.AsByte()); + Sse2.Store(c + 48, x_3.AsByte()); } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 7916b6f..c1c20b6 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -67,11 +67,9 @@ public override unsafe void ProcessStream(ReadOnlySpan nonce, Span o { Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; SetInitialState(state, nonce, initialCounter); - fixed (uint* x = state) - fixed (byte* m = input, c = output.Slice(offset)) - { - Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); - } + var c = output.Slice(offset); + + Salsa20BaseIntrinsics.Salsa20(state, input, c, (ulong)input.Length); } #endif diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 0ec174a..bbf17a7 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -10,26 +10,33 @@ namespace NaCl.Core.Base; public static class Salsa20BaseIntrinsics { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes) + public static unsafe void Salsa20(Span state, 
ReadOnlySpan input, Span output, ulong bytes) { if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - if (Avx2.IsSupported && bytes >= 512) + fixed (uint* x = state) + fixed (byte* m_p = input, c_p = output) { - Salsa512.Process(x, ref m, ref c, ref bytes); - } - if (bytes >= 256) - { - Salsa256.Process(x, ref m, ref c, ref bytes); - } - while (bytes >= 64) - { - Salsa64.Process64(x, ref m, ref c, ref bytes); - } - if (bytes > 0) - { - Salsa64.ProcessVarLength(x, ref m, ref c, ref bytes); + var m = m_p; + var c = c_p; + + if (Avx2.IsSupported && bytes >= 512) + { + Salsa512.Process(x, ref m, ref c, ref bytes); + } + if (bytes >= 256) + { + Salsa256.Process(x, ref m, ref c, ref bytes); + } + while (bytes >= 64) + { + Salsa64.Process64(x, ref m, ref c, ref bytes); + } + if (bytes > 0) + { + Salsa64.ProcessVarLength(x, ref m, ref c, ref bytes); + } } } @@ -39,7 +46,11 @@ public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - Salsa64.HSalsa20(state, subKey); + fixed (uint* x = state) + fixed (byte* sk = subKey) + { + Salsa64.HSalsa20(x, sk); + } } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -48,7 +59,11 @@ public static unsafe void Salsa20KeyStream(ReadOnlySpan state, Span if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) throw new Exception("Error this vectorisation is not supported on this CPU"); - Salsa64.KeyStream64(state, output); + fixed (byte* c = output) + fixed (uint* x = state) + { + Salsa64.KeyStream64(x, c); + } } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index 21d6b23..2e8b883 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -94,59 +94,51 @@ public 
static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) + public static unsafe void HSalsa20(uint* x, byte* sk) { - fixed (uint* x = state) - fixed (byte* sk = subKey) - { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); - ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); - // HSalsa returns a 32 byte array of 0,5,10,15,6,7,8,9 + // HSalsa returns a 32 byte array of 0,5,10,15,6,7,8,9 - // <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15> - var t_0 = GetDiagonal(x_0, x_1, x_2, x_3); + // <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15> + var t_0 = GetDiagonal(x_0, x_1, x_2, x_3); - // Get <4, 5, 6, 7> & <8, 9, 10, 11> then unpack halves for <6, 7, 8, 9> - var t_1 = UnpackHighLow(x_1, x_2); + // Get <4, 5, 6, 7> & <8, 9, 10, 11> then unpack halves for <6, 7, 8, 9> + var t_1 = UnpackHighLow(x_1, x_2); - Sse2.Store(sk, Vector128.AsByte(t_0)); - Sse2.Store(sk + 16, Vector128.AsByte(t_1)); - } + Sse2.Store(sk, Vector128.AsByte(t_0)); + Sse2.Store(sk + 16, Vector128.AsByte(t_1)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe void KeyStream64(ReadOnlySpan state, Span output) + public static unsafe void KeyStream64(uint* x, byte* c) { - fixed (byte* k = output) - fixed (uint* x = state) - { - Vector128 x_0 = Sse2.LoadVector128(x); - Vector128 x_1 = Sse2.LoadVector128(x + 4); - Vector128 x_2 = Sse2.LoadVector128(x + 8); - Vector128 x_3 = Sse2.LoadVector128(x + 12); - - Vector128 orig_0 = x_0; - Vector128 orig_1 = x_1; - Vector128 orig_2 = x_2; - Vector128 
orig_3 = x_3; - - ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); - - x_0 = Sse2.Add(x_0, orig_0); - x_1 = Sse2.Add(x_1, orig_1); - x_2 = Sse2.Add(x_2, orig_2); - x_3 = Sse2.Add(x_3, orig_3); - - Sse2.Store(k, x_0.AsByte()); - Sse2.Store(k + 16, x_1.AsByte()); - Sse2.Store(k + 32, x_2.AsByte()); - Sse2.Store(k + 48, x_3.AsByte()); - } + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + + Vector128 orig_0 = x_0; + Vector128 orig_1 = x_1; + Vector128 orig_2 = x_2; + Vector128 orig_3 = x_3; + + ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + + x_0 = Sse2.Add(x_0, orig_0); + x_1 = Sse2.Add(x_1, orig_1); + x_2 = Sse2.Add(x_2, orig_2); + x_3 = Sse2.Add(x_3, orig_3); + + Sse2.Store(c, x_0.AsByte()); + Sse2.Store(c + 16, x_1.AsByte()); + Sse2.Store(c + 32, x_2.AsByte()); + Sse2.Store(c + 48, x_3.AsByte()); } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 59797d5d0b290b91f0a2d57928ddf50f84e7759f Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 17 Oct 2022 20:29:27 +0100 Subject: [PATCH 39/59] Move Core files into folders --- src/NaCl.Core/Base/ChaCha20Base.cs | 9 ++------- .../Base/{ => ChaChaCore}/ChaCha20Core.cs | 20 +++++++++---------- .../ChaCha20CoreIntrinsics.cs | 2 +- .../Base/{ => ChaChaCore}/IChaCha20Core.cs | 2 +- src/NaCl.Core/Base/Salsa20Base.cs | 6 ++---- .../Base/{ => SalsaCore}/ISalsa20Core.cs | 2 +- .../Base/{ => SalsaCore}/Salsa20Core.cs | 4 ++-- .../{ => SalsaCore}/Salsa20CoreIntrinsics.cs | 1 + src/NaCl.Core/Base/Snuffle.cs | 4 ---- 9 files changed, 20 insertions(+), 30 deletions(-) rename src/NaCl.Core/Base/{ => ChaChaCore}/ChaCha20Core.cs (92%) rename src/NaCl.Core/Base/{ => ChaChaCore}/ChaCha20CoreIntrinsics.cs (98%) rename src/NaCl.Core/Base/{ => ChaChaCore}/IChaCha20Core.cs (89%) rename src/NaCl.Core/Base/{ => SalsaCore}/ISalsa20Core.cs (89%) rename src/NaCl.Core/Base/{ => SalsaCore}/Salsa20Core.cs 
(98%) rename src/NaCl.Core/Base/{ => SalsaCore}/Salsa20CoreIntrinsics.cs (98%) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 0eba223..c7829a3 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -2,13 +2,8 @@ { using System; using System.Runtime.CompilerServices; - using System.Runtime.InteropServices; -#if INTRINSICS - using System.Runtime.Intrinsics.X86; -#endif - using System.Security.Cryptography; - using Internal; + using NaCl.Core.Base.ChaChaCore; /// /// Base class for and . @@ -26,7 +21,7 @@ public abstract class ChaCha20Base : Snuffle protected ChaCha20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { #if INTRINSICS - if (Sse3.IsSupported) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { _chaCha20Core = new ChaCha20CoreIntrinsics(this); } diff --git a/src/NaCl.Core/Base/ChaCha20Core.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs similarity index 92% rename from src/NaCl.Core/Base/ChaCha20Core.cs rename to src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs index ebf3308..8cbd000 100644 --- a/src/NaCl.Core/Base/ChaCha20Core.cs +++ b/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs @@ -1,4 +1,4 @@ -namespace NaCl.Core.Base +namespace NaCl.Core.Base.ChaChaCore { using System; using System.Buffers; @@ -12,7 +12,7 @@ internal class ChaCha20Core : IChaCha20Core { public const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; public const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; - + private readonly ChaCha20Base _chaCha20; public ChaCha20Core(ChaCha20Base chaCha20) => _chaCha20 = chaCha20; @@ -38,18 +38,18 @@ public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span - /// Processes the Encryption/Decryption function. - /// - /// The nonce. - /// The output. - /// The input. - /// The output's starting offset. + /// + /// Processes the Encryption/Decryption function. + /// + /// The nonce. 
+ /// The output. + /// The input. + /// The output's starting offset. public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) { var blockSizeInBytes = _chaCha20.BlockSizeInBytes; var length = input.Length; - var numBlocks = (length / blockSizeInBytes) + 1; + var numBlocks = length / blockSizeInBytes + 1; /* * Allocates 64 bytes more than below impl as per the benchmarks... diff --git a/src/NaCl.Core/Base/ChaCha20CoreIntrinsics.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs similarity index 98% rename from src/NaCl.Core/Base/ChaCha20CoreIntrinsics.cs rename to src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs index b8be67b..ead523d 100644 --- a/src/NaCl.Core/Base/ChaCha20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs @@ -4,7 +4,7 @@ namespace NaCl.Core.Base; using System; using System.Runtime.CompilerServices; using System.Security.Cryptography; - +using NaCl.Core.Base.ChaChaCore; internal class ChaCha20CoreIntrinsics : IChaCha20Core { diff --git a/src/NaCl.Core/Base/IChaCha20Core.cs b/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs similarity index 89% rename from src/NaCl.Core/Base/IChaCha20Core.cs rename to src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs index e0432e8..3ae9e72 100644 --- a/src/NaCl.Core/Base/IChaCha20Core.cs +++ b/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs @@ -1,6 +1,6 @@ using System; -namespace NaCl.Core.Base; +namespace NaCl.Core.Base.ChaChaCore; internal interface IChaCha20Core { void HChaCha20(Span subKey, ReadOnlySpan nonce); diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index c3a5957..8bc57d7 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -2,10 +2,8 @@ { using System; using System.Runtime.CompilerServices; -#if INTRINSICS - using System.Runtime.Intrinsics.X86; -#endif using Internal; + using NaCl.Core.Base.SalsaCore; /// /// Base class for and . 
@@ -23,7 +21,7 @@ public abstract class Salsa20Base : Snuffle protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { #if INTRINSICS - if (Sse3.IsSupported) + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) { _salsa20Core = new Salsa20CoreIntrinsics(this); } diff --git a/src/NaCl.Core/Base/ISalsa20Core.cs b/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs similarity index 89% rename from src/NaCl.Core/Base/ISalsa20Core.cs rename to src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs index 1414275..1425b5d 100644 --- a/src/NaCl.Core/Base/ISalsa20Core.cs +++ b/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs @@ -1,6 +1,6 @@ using System; -namespace NaCl.Core.Base; +namespace NaCl.Core.Base.SalsaCore; internal interface ISalsa20Core { void HSalsa20(Span subKey, ReadOnlySpan nonce); diff --git a/src/NaCl.Core/Base/Salsa20Core.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs similarity index 98% rename from src/NaCl.Core/Base/Salsa20Core.cs rename to src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs index 17b335c..83bf328 100644 --- a/src/NaCl.Core/Base/Salsa20Core.cs +++ b/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs @@ -1,4 +1,4 @@ -namespace NaCl.Core.Base +namespace NaCl.Core.Base.SalsaCore { using System; using System.Buffers; @@ -46,7 +46,7 @@ public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan /// Abstract base class for XSalsa20, ChaCha20, XChaCha20 and their variants. 
/// From cd7cd48838f437f03ea8f67a50b9f91646b9bd90 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 17 Oct 2022 20:54:19 +0100 Subject: [PATCH 40/59] Fix process methods --- .../Base/ChaChaCore/ChaCha20CoreIntrinsics.cs | 13 +++++++------ .../Base/SalsaCore/Salsa20CoreIntrinsics.cs | 14 ++++---------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs index ead523d..02e4f82 100644 --- a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs @@ -30,14 +30,15 @@ public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnly { Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; _chaCha20.SetInitialState(state, nonce, _chaCha20.InitialCounter); - - fixed(uint* x = state) - fixed (byte* m = input, c = output.Slice(offset)) - { - ChaCha20BaseIntrinsics.ChaCha20(x, m, c, (ulong)input.Length); - } + + ChaCha20BaseIntrinsics.ChaCha20(state, input, output.Slice(offset), (ulong)input.Length); } + /// + /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . + /// + /// The subKey. + /// The nonce. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] public void HChaCha20(Span subKey, ReadOnlySpan nonce) { diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs index 073be9e..32abf16 100644 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs @@ -11,12 +11,9 @@ namespace NaCl.Core.Base; internal class Salsa20CoreIntrinsics : ISalsa20Core { - protected const int KEY_SIZE_IN_INTS = 8; - public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 - protected const int BLOCK_SIZE_IN_INTS = 16; - public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 + const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; + const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; - protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") private readonly Salsa20Base _salsa20; public Salsa20CoreIntrinsics(Salsa20Base salsa20) => _salsa20 = salsa20; @@ -36,11 +33,8 @@ public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnly { Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; _salsa20.SetInitialState(state, nonce, _salsa20.InitialCounter); - fixed (uint* x = state) - fixed (byte* m = input, c = output.Slice(offset)) - { - Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length); - } + + Salsa20BaseIntrinsics.Salsa20(state, input, output.Slice(offset), (ulong)input.Length); } /// From 6b3298deaca0c5803e681afff60097f513e5cd4e Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 18 Oct 2022 12:29:31 +0100 Subject: [PATCH 41/59] Added powershell test file, runs tests with various simd modes enabled/disabled --- TestIntrinsics.ps1 | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 TestIntrinsics.ps1 diff --git a/TestIntrinsics.ps1 b/TestIntrinsics.ps1 
new file mode 100644 index 0000000..05a5e01 --- /dev/null +++ b/TestIntrinsics.ps1 @@ -0,0 +1,41 @@ +$env:COMPlus_EnableAVX2 = 1 +$env:COMPlus_EnableSSE3 = 1 +$env:COMPlus_EnableSSE2 = 1 +Write-Host "Test Environment: Normal" -ForegroundColor "Cyan" +dotnet test $config.TestProject +if ($LastExitCode -ne 0) { + Write-Host "Tests failed, aborting build!" -Foreground "Red" + Exit 1 +} + +$env:COMPlus_EnableAVX2 = 0 +$env:COMPlus_EnableSSE3 = 1 +$env:COMPlus_EnableSSE2 = 1 +Write-Host "Test Environment: AVX2 Disabled" -ForegroundColor "Cyan" +dotnet test $config.TestProject --framework netcoreapp3.1 +if ($LastExitCode -ne 0) { + Write-Host "Tests failed, aborting build!" -Foreground "Red" + Exit 1 +} + +$env:COMPlus_EnableAVX2 = 0 +$env:COMPlus_EnableSSE3 = 0 +$env:COMPlus_EnableSSE2 = 1 +Write-Host "Test Environment: SSE3 Disabled" -ForegroundColor "Cyan" +dotnet test $config.TestProject --framework netcoreapp3.1 +if ($LastExitCode -ne 0) { + Write-Host "Tests failed, aborting build!" -Foreground "Red" + Exit 1 +} + +$env:COMPlus_EnableAVX2 = 0 +$env:COMPlus_EnableSSE3 = 0 +$env:COMPlus_EnableSSE2 = 0 +Write-Host "Test Environment: SSE2 Disabled" -ForegroundColor "Cyan" +dotnet test $config.TestProject --framework netcoreapp3.1 +if ($LastExitCode -ne 0) { + Write-Host "Tests failed, aborting build!" -Foreground "Red" + Exit 1 +} + +Write-Host "Tests passed!" 
-ForegroundColor "Green" \ No newline at end of file From d1ede2a93db7bea42be7e8232e671711f9618c77 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 18 Oct 2022 13:18:18 +0100 Subject: [PATCH 42/59] Update core namespaces --- src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs | 214 +++++++++--------- .../Base/ChaChaCore/ChaCha20CoreIntrinsics.cs | 2 +- .../Base/ChaChaCore/IChaCha20Core.cs | 5 +- src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs | 5 +- src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs | 192 ++++++++-------- .../Base/SalsaCore/Salsa20CoreIntrinsics.cs | 3 +- 6 files changed, 209 insertions(+), 212 deletions(-) diff --git a/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs index 8cbd000..a8220ad 100644 --- a/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs +++ b/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs @@ -1,131 +1,129 @@ -namespace NaCl.Core.Base.ChaChaCore +namespace NaCl.Core.Base.ChaChaCore; + +using System; +using System.Buffers; +using System.Runtime.CompilerServices; +using System.Security.Cryptography; + +using Internal; + +internal class ChaCha20Core : IChaCha20Core { - using System; - using System.Buffers; - using System.Runtime.CompilerServices; - using System.Runtime.InteropServices; - using System.Security.Cryptography; + public const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; + public const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; - using Internal; + private readonly ChaCha20Base _chaCha20; + public ChaCha20Core(ChaCha20Base chaCha20) => _chaCha20 = chaCha20; - internal class ChaCha20Core : IChaCha20Core + public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) { - public const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; - public const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. 
The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - private readonly ChaCha20Base _chaCha20; - public ChaCha20Core(ChaCha20Base chaCha20) => _chaCha20 = chaCha20; + // Set the initial state based on https://tools.ietf.org/html/rfc8439#section-2.3 + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _chaCha20.SetInitialState(state, nonce, counter); - public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); + // Create a copy of the state and then run 20 rounds on it, + // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. + Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; + state.CopyTo(workingState); + ChaCha20Base.ShuffleState(state); - // Set the initial state based on https://tools.ietf.org/html/rfc8439#section-2.3 - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _chaCha20.SetInitialState(state, nonce, counter); + // At the end of the rounds, add the result to the original state. + for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) + state[i] += workingState[i]; - // Create a copy of the state and then run 20 rounds on it, - // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. - Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; - state.CopyTo(workingState); - ChaCha20Base.ShuffleState(state); + ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); + } - // At the end of the rounds, add the result to the original state. - for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) - state[i] += workingState[i]; + /// + /// Processes the Encryption/Decryption function. + /// + /// The nonce. + /// The output. + /// The input. + /// The output's starting offset. 
+ public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + { + var blockSizeInBytes = _chaCha20.BlockSizeInBytes; + var length = input.Length; + var numBlocks = length / blockSizeInBytes + 1; + + /* + * Allocates 64 bytes more than below impl as per the benchmarks... + * + var block = new byte[BLOCK_SIZE_IN_BYTES]; + for (var i = 0; i < numBlocks; i++) + { + ProcessKeyStreamBlock(nonce, i + InitialCounter, block); - ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); - } + if (i == numBlocks - 1) + Xor(output, input, block, length % BLOCK_SIZE_IN_BYTES, offset, i); // last block + else + Xor(output, input, block, BLOCK_SIZE_IN_BYTES, offset, i); - /// - /// Processes the Encryption/Decryption function. - /// - /// The nonce. - /// The output. - /// The input. - /// The output's starting offset. - public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) - { - var blockSizeInBytes = _chaCha20.BlockSizeInBytes; - var length = input.Length; - var numBlocks = length / blockSizeInBytes + 1; - - /* - * Allocates 64 bytes more than below impl as per the benchmarks... 
- * - var block = new byte[BLOCK_SIZE_IN_BYTES]; - for (var i = 0; i < numBlocks; i++) - { - ProcessKeyStreamBlock(nonce, i + InitialCounter, block); - - if (i == numBlocks - 1) - Xor(output, input, block, length % BLOCK_SIZE_IN_BYTES, offset, i); // last block - else - Xor(output, input, block, BLOCK_SIZE_IN_BYTES, offset, i); - - CryptoBytes.Wipe(block); // Array.Clear(block, 0, block.Length); - } - */ - - using var owner = MemoryPool.Shared.Rent(blockSizeInBytes); - for (var i = 0; i < numBlocks; i++) - { - ProcessKeyStreamBlock(nonce, i + _chaCha20.InitialCounter, owner.Memory.Span); - - if (i == numBlocks - 1) - Xor(output, input, owner.Memory.Span, length % blockSizeInBytes, offset, i); // last block - else - Xor(output, input, owner.Memory.Span, blockSizeInBytes, offset, i); - - owner.Memory.Span.Clear(); - } + CryptoBytes.Wipe(block); // Array.Clear(block, 0, block.Length); } + */ - /// - /// XOR the specified output. - /// - /// The output. - /// The input. - /// The key stream block. - /// The length. - /// The output's starting offset. - /// The current block number. - /// The combination of blocks, offsets and length to be XORed is out-of-bonds. 
- private void Xor(Span output, ReadOnlySpan input, ReadOnlySpan block, int len, int offset, int curBlock) + using var owner = MemoryPool.Shared.Rent(blockSizeInBytes); + for (var i = 0; i < numBlocks; i++) { - var blockOffset = curBlock * _chaCha20.BlockSizeInBytes; + ProcessKeyStreamBlock(nonce, i + _chaCha20.InitialCounter, owner.Memory.Span); - // Since is not called directly from outside, there's no need to check - //if (len < 0 || offset < 0 || curBlock < 0 || output.Length < len || (input.Length - blockOffset) < len || block.Length < len) - // throw new CryptographicException("The combination of blocks, offsets and length to be XORed is out-of-bonds."); + if (i == numBlocks - 1) + Xor(output, input, owner.Memory.Span, length % blockSizeInBytes, offset, i); // last block + else + Xor(output, input, owner.Memory.Span, blockSizeInBytes, offset, i); - for (var i = 0; i < len; i++) - output[i + offset + blockOffset] = (byte)(input[i + blockOffset] ^ block[i]); + owner.Memory.Span.Clear(); } - /// - /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . - /// - /// The subKey. - /// The nonce. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HChaCha20(Span subKey, ReadOnlySpan nonce) - { - // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + } - // Setting HChaCha20 initial state - _chaCha20.HChaCha20InitialState(state, nonce); + /// + /// XOR the specified output. + /// + /// The output. + /// The input. + /// The key stream block. + /// The length. + /// The output's starting offset. + /// The current block number. + /// The combination of blocks, offsets and length to be XORed is out-of-bonds. 
+ private void Xor(Span output, ReadOnlySpan input, ReadOnlySpan block, int len, int offset, int curBlock) + { + var blockOffset = curBlock * _chaCha20.BlockSizeInBytes; - // Block function - ChaCha20Base.ShuffleState(state); + // Since is not called directly from outside, there's no need to check + //if (len < 0 || offset < 0 || curBlock < 0 || output.Length < len || (input.Length - blockOffset) < len || block.Length < len) + // throw new CryptographicException("The combination of blocks, offsets and length to be XORed is out-of-bonds."); - state[4] = state[12]; - state[5] = state[13]; - state[6] = state[14]; - state[7] = state[15]; + for (var i = 0; i < len; i++) + output[i + offset + blockOffset] = (byte)(input[i + blockOffset] ^ block[i]); + } + /// + /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . + /// + /// The subKey. + /// The nonce. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void HChaCha20(Span subKey, ReadOnlySpan nonce) + { + // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. 
+ Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); - } + // Setting HChaCha20 initial state + _chaCha20.HChaCha20InitialState(state, nonce); + + // Block function + ChaCha20Base.ShuffleState(state); + + state[4] = state[12]; + state[5] = state[13]; + state[6] = state[14]; + state[7] = state[15]; + + ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); } } diff --git a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs index 02e4f82..9e0f951 100644 --- a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs @@ -1,5 +1,5 @@ #if INTRINSICS -namespace NaCl.Core.Base; +namespace NaCl.Core.Base.ChaChaCore; using System; using System.Runtime.CompilerServices; diff --git a/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs b/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs index 3ae9e72..35270fd 100644 --- a/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs +++ b/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs @@ -1,6 +1,7 @@ -using System; +namespace NaCl.Core.Base.ChaChaCore; + +using System; -namespace NaCl.Core.Base.ChaChaCore; internal interface IChaCha20Core { void HChaCha20(Span subKey, ReadOnlySpan nonce); diff --git a/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs b/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs index 1425b5d..fdc97d0 100644 --- a/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs +++ b/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs @@ -1,6 +1,7 @@ -using System; +namespace NaCl.Core.Base.SalsaCore; + +using System; -namespace NaCl.Core.Base.SalsaCore; internal interface ISalsa20Core { void HSalsa20(Span subKey, ReadOnlySpan nonce); diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs index 83bf328..b05f233 100644 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs +++ b/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs @@ -1,123 
+1,121 @@ -namespace NaCl.Core.Base.SalsaCore -{ - using System; - using System.Buffers; - using System.Runtime.CompilerServices; - using System.Runtime.InteropServices; - using System.Security.Cryptography; +namespace NaCl.Core.Base.SalsaCore; - using Internal; +using System; +using System.Buffers; +using System.Runtime.CompilerServices; +using System.Security.Cryptography; - internal class Salsa20Core : ISalsa20Core - { - protected const int KEY_SIZE_IN_INTS = 8; - public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 - protected const int BLOCK_SIZE_IN_INTS = 16; - public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 +using Internal; - protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") +internal class Salsa20Core : ISalsa20Core +{ + protected const int KEY_SIZE_IN_INTS = 8; + public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 + protected const int BLOCK_SIZE_IN_INTS = 16; + public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 - private readonly Salsa20Base _salsa20; + protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") - public Salsa20Core(Salsa20Base salsa20) => _salsa20 = salsa20; + private readonly Salsa20Base _salsa20; - public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); + public Salsa20Core(Salsa20Base salsa20) => _salsa20 = salsa20; + + public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. 
The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _salsa20.SetInitialState(state, nonce, counter); + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + _salsa20.SetInitialState(state, nonce, counter); - // Create a copy of the state and then run 20 rounds on it, - // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. - Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; - state.CopyTo(workingState); - Salsa20Base.ShuffleState(workingState); + // Create a copy of the state and then run 20 rounds on it, + // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. + Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; + state.CopyTo(workingState); + Salsa20Base.ShuffleState(workingState); - // At the end of the rounds, add the result to the original state. - for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) - state[i] += workingState[i]; + // At the end of the rounds, add the result to the original state. + for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) + state[i] += workingState[i]; - ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); - } + ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); + } - public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + { + var blockSizeInBytes = _salsa20.BlockSizeInBytes; + var length = input.Length; + var numBlocks = length / blockSizeInBytes + 1; + + /* + * Allocates 64 bytes more than below impl as per the benchmarks... + * + var block = new byte[BLOCK_SIZE_IN_BYTES]; + for (var i = 0; i < numBlocks; i++) { - var blockSizeInBytes = _salsa20.BlockSizeInBytes; - var length = input.Length; - var numBlocks = length / blockSizeInBytes + 1; - - /* - * Allocates 64 bytes more than below impl as per the benchmarks... 
- * - var block = new byte[BLOCK_SIZE_IN_BYTES]; - for (var i = 0; i < numBlocks; i++) - { - ProcessKeyStreamBlock(nonce, i + InitialCounter, block); - - if (i == numBlocks - 1) - Xor(output, input, block, length % BLOCK_SIZE_IN_BYTES, offset, i); // last block - else - Xor(output, input, block, BLOCK_SIZE_IN_BYTES, offset, i); - - CryptoBytes.Wipe(block); // Array.Clear(block, 0, block.Length); - } - */ - - using var owner = MemoryPool.Shared.Rent(blockSizeInBytes); - for (var i = 0; i < numBlocks; i++) - { - ProcessKeyStreamBlock(nonce, i + _salsa20.InitialCounter, owner.Memory.Span); - - if (i == numBlocks - 1) - Xor(output, input, owner.Memory.Span, length % blockSizeInBytes, offset, i); // last block - else - Xor(output, input, owner.Memory.Span, blockSizeInBytes, offset, i); - - owner.Memory.Span.Clear(); - } + ProcessKeyStreamBlock(nonce, i + InitialCounter, block); + + if (i == numBlocks - 1) + Xor(output, input, block, length % BLOCK_SIZE_IN_BYTES, offset, i); // last block + else + Xor(output, input, block, BLOCK_SIZE_IN_BYTES, offset, i); + + CryptoBytes.Wipe(block); // Array.Clear(block, 0, block.Length); } + */ - private void Xor(Span output, ReadOnlySpan input, ReadOnlySpan block, int len, int offset, int curBlock) + using var owner = MemoryPool.Shared.Rent(blockSizeInBytes); + for (var i = 0; i < numBlocks; i++) { - var blockOffset = curBlock * _salsa20.BlockSizeInBytes; + ProcessKeyStreamBlock(nonce, i + _salsa20.InitialCounter, owner.Memory.Span); - // Since is not called directly from outside, there's no need to check - //if (len < 0 || offset < 0 || curBlock < 0 || output.Length < len || (input.Length - blockOffset) < len || block.Length < len) - // throw new CryptographicException("The combination of blocks, offsets and length to be XORed is out-of-bonds."); + if (i == numBlocks - 1) + Xor(output, input, owner.Memory.Span, length % blockSizeInBytes, offset, i); // last block + else + Xor(output, input, owner.Memory.Span, blockSizeInBytes, 
offset, i); - for (var i = 0; i < len; i++) - output[i + offset + blockOffset] = (byte)(input[i + blockOffset] ^ block[i]); + owner.Memory.Span.Clear(); } + } - /// - /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . - /// - /// The subKey. - /// The nonce. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HSalsa20(Span subKey, ReadOnlySpan nonce) - { - // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 + private void Xor(Span output, ReadOnlySpan input, ReadOnlySpan block, int len, int offset, int curBlock) + { + var blockOffset = curBlock * _salsa20.BlockSizeInBytes; + + // Since is not called directly from outside, there's no need to check + //if (len < 0 || offset < 0 || curBlock < 0 || output.Length < len || (input.Length - blockOffset) < len || block.Length < len) + // throw new CryptographicException("The combination of blocks, offsets and length to be XORed is out-of-bonds."); - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; + for (var i = 0; i < len; i++) + output[i + offset + blockOffset] = (byte)(input[i + blockOffset] ^ block[i]); + } - // Setting HSalsa20 initial state - _salsa20.HSalsa20InitialState(state, nonce); + /// + /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . + /// + /// The subKey. + /// The nonce. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void HSalsa20(Span subKey, ReadOnlySpan nonce) + { + // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. 
Specification - Definition of HSalsa20 - // Block function - Salsa20Base.ShuffleState(state); + Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; - state[1] = state[5]; - state[2] = state[10]; - state[3] = state[15]; - state[4] = state[6]; - state[5] = state[7]; - state[6] = state[8]; - state[7] = state[9]; + // Setting HSalsa20 initial state + _salsa20.HSalsa20InitialState(state, nonce); - ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); - } + // Block function + Salsa20Base.ShuffleState(state); + + state[1] = state[5]; + state[2] = state[10]; + state[3] = state[15]; + state[4] = state[6]; + state[5] = state[7]; + state[6] = state[8]; + state[7] = state[9]; + + ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); } } \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs index 32abf16..a724a5d 100644 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs @@ -1,5 +1,5 @@ #if INTRINSICS -namespace NaCl.Core.Base; +namespace NaCl.Core.Base.SalsaCore; using System; using System.Runtime.CompilerServices; @@ -7,7 +7,6 @@ namespace NaCl.Core.Base; using System.Security.Cryptography; using Internal; -using NaCl.Core.Base.SalsaCore; internal class Salsa20CoreIntrinsics : ISalsa20Core { From 2ba36afb8c9238b3aafefe3351ce494754633ed9 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 18 Oct 2022 15:14:48 +0100 Subject: [PATCH 43/59] Added Intrinsic/Scalar tests --- NaCl.Core.sln | 10 +- src/NaCl.Core/Properties/AssemblyInfo.cs | 3 +- .../ChaCha20IntrinsicsTests .cs | 347 ++++++++++++++ .../ChaCha20ScalarTests.cs | 347 ++++++++++++++ .../NaCl.Core.SimdTests.csproj | 58 +++ .../Salsa20IntrinsicsTests.cs | 433 ++++++++++++++++++ .../NaCl.Core.SimdTests/Salsa20ScalarTests.cs | 432 +++++++++++++++++ test/NaCl.Core.SimdTests/TestHelpers.cs | 91 ++++ .../Vectors/HChaCha20TestVector.cs | 67 +++ 
.../Vectors/Rfc8439TestVector.cs | 160 +++++++ .../Vectors/Salsa20TestVector.cs | 35 ++ test/NaCl.Core.Tests/NaCl.Core.Tests.csproj | 1 - test/NaCl.Core.Tests/XSalsa20Tests.cs | 9 - 13 files changed, 1980 insertions(+), 13 deletions(-) create mode 100644 test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs create mode 100644 test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs create mode 100644 test/NaCl.Core.SimdTests/NaCl.Core.SimdTests.csproj create mode 100644 test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs create mode 100644 test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs create mode 100644 test/NaCl.Core.SimdTests/TestHelpers.cs create mode 100644 test/NaCl.Core.SimdTests/Vectors/HChaCha20TestVector.cs create mode 100644 test/NaCl.Core.SimdTests/Vectors/Rfc8439TestVector.cs create mode 100644 test/NaCl.Core.SimdTests/Vectors/Salsa20TestVector.cs diff --git a/NaCl.Core.sln b/NaCl.Core.sln index 5eb42a9..fd8ea3e 100644 --- a/NaCl.Core.sln +++ b/NaCl.Core.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.27428.2002 +# Visual Studio Version 17 +VisualStudioVersion = 17.4.32916.344 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NaCl.Core", "src\NaCl.Core\NaCl.Core.csproj", "{5B711EBA-6E41-429F-A2AC-719C4441B663}" EndProject @@ -14,6 +14,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution CodeCoverage.runsettings = CodeCoverage.runsettings EndProjectSection EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NaCl.Core.SimdTests", "test\NaCl.Core.SimdTests\NaCl.Core.SimdTests.csproj", "{BF42937A-028C-4870-AAB4-220667A57457}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -32,6 +34,10 @@ Global {F55397B0-B348-40C6-A6A8-5BECEB74A840}.Debug|Any CPU.Build.0 = Debug|Any CPU {F55397B0-B348-40C6-A6A8-5BECEB74A840}.Release|Any CPU.ActiveCfg = 
Release|Any CPU {F55397B0-B348-40C6-A6A8-5BECEB74A840}.Release|Any CPU.Build.0 = Release|Any CPU + {BF42937A-028C-4870-AAB4-220667A57457}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {BF42937A-028C-4870-AAB4-220667A57457}.Debug|Any CPU.Build.0 = Debug|Any CPU + {BF42937A-028C-4870-AAB4-220667A57457}.Release|Any CPU.ActiveCfg = Release|Any CPU + {BF42937A-028C-4870-AAB4-220667A57457}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/NaCl.Core/Properties/AssemblyInfo.cs b/src/NaCl.Core/Properties/AssemblyInfo.cs index 69cd83c..36635ef 100644 --- a/src/NaCl.Core/Properties/AssemblyInfo.cs +++ b/src/NaCl.Core/Properties/AssemblyInfo.cs @@ -1,2 +1,3 @@ [assembly: System.Runtime.CompilerServices.InternalsVisibleTo("NaCl.Core.Benchmarks, PublicKey=0024000004800000940000000602000000240000525341310004000001000100ad335fda87151ecc6a84cc4b5b349b1edbb51a19dfa9d0cfedfa2d804a39f9fbc0afbb943a1285e27ec5fce25103d333873deba41dd792e8a89c50134ae409dd01021bf9408b7dcb378b3d87a3bd563882721cbd50097146903e61d601a9fa47fedb19cdac1fb1c7ab280a8dd0cccbf95f4278c44529d1ef6c478fa08036a0af")] -[assembly: System.Runtime.CompilerServices.InternalsVisibleTo("NaCl.Core.Tests, PublicKey=0024000004800000940000000602000000240000525341310004000001000100ad335fda87151ecc6a84cc4b5b349b1edbb51a19dfa9d0cfedfa2d804a39f9fbc0afbb943a1285e27ec5fce25103d333873deba41dd792e8a89c50134ae409dd01021bf9408b7dcb378b3d87a3bd563882721cbd50097146903e61d601a9fa47fedb19cdac1fb1c7ab280a8dd0cccbf95f4278c44529d1ef6c478fa08036a0af")] \ No newline at end of file +[assembly: System.Runtime.CompilerServices.InternalsVisibleTo("NaCl.Core.Tests, 
PublicKey=0024000004800000940000000602000000240000525341310004000001000100ad335fda87151ecc6a84cc4b5b349b1edbb51a19dfa9d0cfedfa2d804a39f9fbc0afbb943a1285e27ec5fce25103d333873deba41dd792e8a89c50134ae409dd01021bf9408b7dcb378b3d87a3bd563882721cbd50097146903e61d601a9fa47fedb19cdac1fb1c7ab280a8dd0cccbf95f4278c44529d1ef6c478fa08036a0af")] +[assembly: System.Runtime.CompilerServices.InternalsVisibleTo("NaCl.Core.SimdTests, PublicKey=0024000004800000940000000602000000240000525341310004000001000100ad335fda87151ecc6a84cc4b5b349b1edbb51a19dfa9d0cfedfa2d804a39f9fbc0afbb943a1285e27ec5fce25103d333873deba41dd792e8a89c50134ae409dd01021bf9408b7dcb378b3d87a3bd563882721cbd50097146903e61d601a9fa47fedb19cdac1fb1c7ab280a8dd0cccbf95f4278c44529d1ef6c478fa08036a0af")] \ No newline at end of file diff --git a/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs b/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs new file mode 100644 index 0000000..cc2c7c6 --- /dev/null +++ b/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs @@ -0,0 +1,347 @@ +namespace NaCl.Core.SimdTests +{ + using System; + using System.Security.Cryptography; + using System.Text; + + using FluentAssertions; + using Xunit; + + using Base; + using Internal; + using Vectors; + using NaCl.Core.Base.ChaChaCore; + using NaCl.Core.SimdTests.Vectors; + + public class ChaCha20IntrinsicsTests + { + [Fact] + public void HChaCha20IntrinsicsTestVectors() + { + // Arrange + foreach (var test in HChaCha20TestVector.HChaCha20TestVectors) + { + var xChaCha20 = new XChaCha20(test.Key, 0); + var cipher = new ChaCha20CoreIntrinsics(xChaCha20); + + // Act + var output = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + cipher.HChaCha20(output, test.Input); + + // Assert + output.Should().Equal(test.Output); + } + } + + [Fact] + public void HChaCha20IntrinsicsBlockTestVector() + { + // https://tools.ietf.org/html/draft-irtf-cfrg-xchacha-03#section-2.2.1 + + // Arrange + var key = 
CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); + var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a:00:00:00:00:31:41:59:27".Replace(":", string.Empty)); + + var xChaCha20 = new XChaCha20(key, 0); + var cipher = new ChaCha20CoreIntrinsics(xChaCha20); + + // Act + var subKey = new byte[32]; + cipher.HChaCha20(subKey, nonce); + var state = subKey.ToUInt16Array(); + //var stateHex = CryptoBytes.ToHexStringLower(subKey.ToArray()); + + // Assert + // HChaCha20 returns only the first and last rows + var expectedState = new uint[] + { + 0x423b4182, 0xfe7bb227, 0x50420ed3, 0x737d878a, + //0x0aa76448, 0x7954cdf3, 0x846acd37, 0x7b3c58ad, + //0x77e35583, 0x83e77c12, 0xe0076a2d, 0xbc6cd0e5, + 0xd5e4f9a0, 0x53a8748a, 0x13c42ec1, 0xdcecd326 + }; + + // Same as above but in HEX + //var expectedStateHex = "82413b4" + "227b27bfe" + "d30e4250" + "8a877d73" + // + "a0f9e4d" + "58a74a853" + "c12ec413" + "26d3ecdc"; + + state.Should().BeEquivalentTo(expectedState); + //stateHex.Should().Be(expectedStateHex); + } + + [Fact] + public void IntrinsicsEncryptDecrypt1BlockTest() + { + // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key); + + var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce); + + var expected = Encoding.UTF8.GetBytes("This is a secret content!!"); + + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20CoreIntrinsics(chacha20); + + // Act + var ciphertext = new byte[expected.Length]; + cipher.Process(nonce, ciphertext, expected); + + var plaintext = new byte[expected.Length]; + cipher.Process(nonce, plaintext, ciphertext); + + // Assert + plaintext.Should().Equal(expected); + } + + [Fact] + public void IntrinsicsEncryptDecryptNBlocksTest() + { + // Arrange + var rnd = new Random(); + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + var nonce = new 
byte[ChaCha20.NONCE_SIZE_IN_BYTES]; + + for (var i = 0; i < 64; i++) + { + RandomNumberGenerator.Fill(key); + RandomNumberGenerator.Fill(nonce); + + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20CoreIntrinsics(chacha20); + + for (var j = 0; j < 64; j++) + { + var expected = new byte[rnd.Next(300)]; + rnd.NextBytes(expected); + + var ciphertext = new byte[expected.Length]; + var plaintext = new byte[expected.Length]; + + // Act + cipher.Process(nonce, ciphertext, expected); + cipher.Process(nonce, plaintext, ciphertext); + + // Assert + plaintext.Should().Equal(expected); + } + } + } + + [Fact] + public void IntrinsicsEncryptDecryptLongMessagesTest() + { + var rnd = new Random(); + + var dataSize = 16; + while (dataSize <= 1 << 24) + { + var plaintext = new byte[dataSize]; + rnd.NextBytes(plaintext); + + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key); + + var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce); + + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20CoreIntrinsics(chacha20); + + var ciphertext = new byte[plaintext.Length]; + cipher.Process(nonce, ciphertext, plaintext); + + var decrypted = new byte[plaintext.Length]; + cipher.Process(nonce, decrypted, ciphertext); + + decrypted.Should().Equal(plaintext); + dataSize += 5 * dataSize / 11; + } + } + + + [Fact] + public void ChaCha20IntrinsicsBlockWhenNonceLengthIsEmptyFails() + { + // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var chacha20 = new ChaCha20(key, 0); + var core = new ChaCha20CoreIntrinsics(chacha20); + + var nonce = new byte[0]; + var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + + // Act & Assert + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void ChaCha20IntrinsicsBlockWhenNonceLengthIsInvalidFails() + { + // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var chacha20 = new ChaCha20(key, 0); + 
var core = new ChaCha20CoreIntrinsics(chacha20); + var nonce = new byte[chacha20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; + var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + + // Act & Assert + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void ChaCha20IntrinsicsBlockWhenLengthIsInvalidFails() + { + // Arrange: the nonce has the valid length so that only the empty block can cause the failure + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var chacha20 = new ChaCha20(key, 0); + var core = new ChaCha20CoreIntrinsics(chacha20); + var nonce = new byte[chacha20.NonceSizeInBytes]; + var block = new byte[0]; + + // Act & Assert: a zero-length key stream block must be rejected + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void ChaCha20IntrinsicsBlockTestVector() + { + // https://tools.ietf.org/html/rfc8439#section-2.3.2 + + // Arrange + var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); + var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a:00:00:00:00".Replace(":", string.Empty)); + var counter = 1; + + // Act + var chacha20 = new ChaCha20(key, 1); + var core = new ChaCha20CoreIntrinsics(chacha20); + var output = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + core.ProcessKeyStreamBlock(nonce, counter, output); + + // Assert + var expected = new uint[16] + { + 0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3, + 0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3, + 0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9, + 0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2, + }; + + output.ToUInt16Array().Should().Equal(expected); + } + + [Fact] + public void ChaCha20IntrinsicsTestVector() + { + // https://tools.ietf.org/html/rfc8439#section-2.4.2 + + // Arrange + foreach (var test in Rfc8439TestVector.Rfc8439TestVectors) + { + // Act + var chacha20 = new ChaCha20(test.Key, test.InitialCounter); + var cipher 
= new ChaCha20CoreIntrinsics(chacha20); + + var output = new byte[test.CipherText.Length]; + cipher.Process(test.Nonce, output, test.CipherText); + + // Assert + output.Should().Equal(test.PlainText); + } + } + + [Theory] + [InlineData(33)] + [InlineData(64)] + [InlineData(65)] + [InlineData(255)] + [InlineData(256)] + [InlineData(511)] + [InlineData(512)] + [InlineData(1023)] + [InlineData(1024)] + public void IntrinsicsCreateVariableLengthCiphers(int size) + { + var input = new byte[size]; // all-zero input; the expected value is a prefix of LongKeyStream + var output = new byte[size]; + + var nonce = new byte[12]; + Array.Fill(nonce, (byte)2); + var key = new byte[32]; + Array.Fill(key, (byte)1); + + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20CoreIntrinsics(chacha20); + cipher.Process(nonce, output, input); + var value = Convert.ToHexString(output); + + value.Should().Be(LongKeyStream[..(size*2)]); // two hex characters per output byte + } + + [Fact] + public void ChaCha20IntrinsicsTestVectorTC8() + { + // TC8: key: 'All your base are belong to us!', IV: 'IETF2013' + // Test vector TC8 from RFC draft by J. Strombergson + // https://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-01 + + // Arrange + var key = new byte[32] + { + 0xC4, 0x6E, 0xC1, 0xB1, 0x8C, 0xE8, 0xA8, 0x78, + 0x72, 0x5A, 0x37, 0xE7, 0x80, 0xDF, 0xB7, 0x35, + 0x1F, 0x68, 0xED, 0x2E, 0x19, 0x4C, 0x79, 0xFB, + 0xC6, 0xAE, 0xBE, 0xE1, 0xA6, 0x67, 0x97, 0x5D + }; + + // The first 4 bytes are set to zero and a large counter + // is used; this makes the RFC 8439 version of ChaCha20 + // compatible with the original specification by D. J. Bernstein. 
+ var nonce = new byte[12] { 0x00, 0x00, 0x00, 0x00, + 0x1A, 0xDA, 0x31, 0xD5, 0xCF, 0x68, 0x82, 0x21 + }; + + // Act + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20CoreIntrinsics(chacha20); + var block0 = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + var block1 = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + cipher.ProcessKeyStreamBlock(nonce, 0, block0); + cipher.ProcessKeyStreamBlock(nonce, 1, block1); + + // Assert + var expected = new byte[128] + { + 0xF6, 0x3A, 0x89, 0xB7, 0x5C, 0x22, 0x71, 0xF9, + 0x36, 0x88, 0x16, 0x54, 0x2B, 0xA5, 0x2F, 0x06, + 0xED, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2B, 0x00, + 0xB5, 0xE8, 0xF8, 0x0A, 0xE9, 0xA4, 0x73, 0xAF, + 0xC2, 0x5B, 0x21, 0x8F, 0x51, 0x9A, 0xF0, 0xFD, + 0xD4, 0x06, 0x36, 0x2E, 0x8D, 0x69, 0xDE, 0x7F, + 0x54, 0xC6, 0x04, 0xA6, 0xE0, 0x0F, 0x35, 0x3F, + 0x11, 0x0F, 0x77, 0x1B, 0xDC, 0xA8, 0xAB, 0x92, + + 0xE5, 0xFB, 0xC3, 0x4E, 0x60, 0xA1, 0xD9, 0xA9, + 0xDB, 0x17, 0x34, 0x5B, 0x0A, 0x40, 0x27, 0x36, + 0x85, 0x3B, 0xF9, 0x10, 0xB0, 0x60, 0xBD, 0xF1, + 0xF8, 0x97, 0xB6, 0x29, 0x0F, 0x01, 0xD1, 0x38, + 0xAE, 0x2C, 0x4C, 0x90, 0x22, 0x5B, 0xA9, 0xEA, + 0x14, 0xD5, 0x18, 0xF5, 0x59, 0x29, 0xDE, 0xA0, + 0x98, 0xCA, 0x7A, 0x6C, 0xCF, 0xE6, 0x12, 0x27, + 0x05, 0x3C, 0x84, 0xE4, 0x9A, 0x4A, 0x33, 0x32 + }; + + CryptoBytes.Combine(block0, block1).Should().Equal(expected); + } + + private const string LongKeyStream = 
"06E1F8D66AC5C75181F3E5ED9FA16AA909A1FB57A4A9B0110C84FCDC0D710880072A4342AF88DEC0138DAF141A3F471C01E77C1FDA90999496D601A36A8C0412E61CF22E8DA3E8DA712DE9F9D38BE4298CB36C0D83AA7DD314841BBDF59644DCD313F9F53B0E06B9D6CB3F0788CE2EE78993D9D27A3EDF0A52589CBB698519D583B68F72F3961AD77C1358394F29B08FE9F98A29F98311723013591E698557A04A73FB277E3E247083444A6C139ADE01BDE3C368C3A484D6824B33C024C0285CBD665D4F2E4DE87BF79565F08FE09766C16639279A243DAE8395F3E0E5D96E711B210355605A5A8E7B50CEA4BA25E4CB0E273488E223CD69FB699BD937A30D33488EF6076192E1ED08758F7F4774E4C0B8E70955D3CAFE790EB40F7725EB87B8BE6BBECDE1E140966973B5B05FDBFBE05C4BC599888693D96AC0C429B75591EF228A243A6EFDBEEEE49F09383AF2D4AFB6305DE60C5D195A44ED646B0CAFCEC5E445562FFFBB56D444C650E2D892FA99BCE78F2EBF866B154FDB110DDF8CAFB7BE4BEA46724B3952906F0C6E81BE7A17E3C95DF350BB970D2C97499924BDCC4EA0E1DE33AA4E62B5C1FC65FFD2728D81A79AE218AE1C639108323C3D22BA1B8C746CAB0CD535C8661CCA4B6B047790EF148A1B9A88CD3CDD8D79389E2F0D9AAAE135B361ED6778A6F6E03186651692F8DABEDF8872939F694C41E2CAD064FF4C537B92AFD0951DF77302749DCDBC9560FCE001DACAAFAA703BDA73007174C549B69EB031324E31BC9F60049E39254146AEB39BEE8A52CAEA1DD31C42346E44EBCC0771A2548D55ABD085323BA69625845F34831E7518F129CB1D80B76D3C94634F38A1226B5E212D917D593838F51D6CC35F87EB500030AB1446D87F6FFC4717B51C619DDAFD75DBA4C25A09C8C961CDA12A9E01203D678AD2ABB4B7D1BED7EBF0C2932DCE5F0C97F9488DD01A7891DC18D5EEFF6129B7942726A5B5110877260E2A78075C666F4410A2F8A2909D03DE0FBE2BFCA2B068B438ADAF767D804BA85278FB930945D15380281C215BC664B6627EE76CBBC8C5355E607721AAAC069B16B78C2F282795E7BF9B6509E7DC36FD2D45A227BF9D20C5E9678A040B63E964817F98B5F4828EB5D66740C595304D08A0A3C5A50EE3B3F99D2269992DD400A5B452A213DCD2579F7A193FC7FE33E498E91203DE19FF9D54BEBDE9E124A17E784430C38110FE3552861737DE1F2B7678F63417FE2224ED6571D43A8015F6F81362E7B95CB93C86735787F0980B0A3A65549844768EDF0DDEC75A24FA1EF5A26640932F65FF141CAEE2E14506A34E925C21BC268769CD95328675953E79B4B375912434834018ADD9C1832057EE4386C95B6E9407346B4A1582FB3C095E4B0882087DB48F081B5C0
DE69ADBC447A6BA2ED6A4F90909911CD3B51ECEC2C6BE6EFE"; + } +} diff --git a/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs b/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs new file mode 100644 index 0000000..faa1ea2 --- /dev/null +++ b/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs @@ -0,0 +1,347 @@ +namespace NaCl.Core.SimdTests +{ + using System; + using System.Security.Cryptography; + using System.Text; + + using FluentAssertions; + using Xunit; + + using Base; + using Internal; + using Vectors; + using NaCl.Core.Base.ChaChaCore; + using NaCl.Core.SimdTests.Vectors; + + public class ChaCha20ScalarTests + { + [Fact] + public void HChaCha20ScalarTestVectors() + { + // Arrange: a fresh XChaCha20 and ChaCha20Core per entry of HChaCha20TestVectors + foreach (var test in HChaCha20TestVector.HChaCha20TestVectors) + { + var xChaCha20 = new XChaCha20(test.Key, 0); + var cipher = new ChaCha20Core(xChaCha20); + + // Act: the output buffer is Snuffle.KEY_SIZE_IN_BYTES long + var output = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + cipher.HChaCha20(output, test.Input); + + // Assert: must match the vector's recorded output + output.Should().Equal(test.Output); + } + } + + [Fact] + public void HChaCha20ScalarBlockTestVector() + { + // https://tools.ietf.org/html/draft-irtf-cfrg-xchacha-03#section-2.2.1 + + // Arrange + var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); + var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a:00:00:00:00:31:41:59:27".Replace(":", string.Empty)); + + var xChaCha20 = new XChaCha20(key, 0); + var cipher = new ChaCha20Core(xChaCha20); + + // Act + var subKey = new byte[32]; + cipher.HChaCha20(subKey, nonce); + var state = subKey.ToUInt16Array(); // NOTE(review): despite the helper's name, the result is compared against a uint[] below + //var stateHex = CryptoBytes.ToHexStringLower(subKey.ToArray()); + + // Assert + // HChaCha20 returns only the first and last rows + var expectedState = new uint[] + { + 0x423b4182, 0xfe7bb227, 0x50420ed3, 0x737d878a, + //0x0aa76448, 0x7954cdf3, 0x846acd37, 0x7b3c58ad, + //0x77e35583, 0x83e77c12, 0xe0076a2d, 0xbc6cd0e5, + 0xd5e4f9a0, 0x53a8748a, 
0x13c42ec1, 0xdcecd326 + }; + + // Same as above but in HEX (each 32-bit word written least-significant byte first) + //var expectedStateHex = "82413b42" + "27b27bfe" + "d30e4250" + "8a877d73" + // + "a0f9e4d5" + "8a74a853" + "c12ec413" + "26d3ecdc"; + + state.Should().BeEquivalentTo(expectedState); + //stateHex.Should().Be(expectedStateHex); + } + + [Fact] + public void ScalarEncryptDecrypt1BlockTest() + { + // Arrange: random key and nonce, fixed 26-byte UTF-8 message + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key); + + var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce); + + var expected = Encoding.UTF8.GetBytes("This is a secret content!!"); + + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20Core(chacha20); + + // Act: Process is applied twice with the same nonce, first to the message and then to its own output + var ciphertext = new byte[expected.Length]; + cipher.Process(nonce, ciphertext, expected); + + var plaintext = new byte[expected.Length]; + cipher.Process(nonce, plaintext, ciphertext); + + // Assert: the second pass must give back the original bytes + plaintext.Should().Equal(expected); + } + + [Fact] + public void ScalarEncryptDecryptNBlocksTest() + { + // Arrange: rnd only sizes and fills the messages; key and nonce are refilled by RandomNumberGenerator on each outer iteration + var rnd = new Random(); // NOTE(review): unseeded, so a failing message size cannot be replayed + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; + + for (var i = 0; i < 64; i++) + { + RandomNumberGenerator.Fill(key); + RandomNumberGenerator.Fill(nonce); + + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20Core(chacha20); + + for (var j = 0; j < 64; j++) + { + var expected = new byte[rnd.Next(300)]; // random length from 0 to 299 bytes + rnd.NextBytes(expected); + + var ciphertext = new byte[expected.Length]; + var plaintext = new byte[expected.Length]; + + // Act + cipher.Process(nonce, ciphertext, expected); + cipher.Process(nonce, plaintext, ciphertext); + + // Assert + plaintext.Should().Equal(expected); + } + } + } + + [Fact] + public void ScalarEncryptDecryptLongMessagesTest() + { + var rnd = new Random(); // NOTE(review): unseeded, so failures are not reproducible + + var dataSize = 16; + while (dataSize <= 1 << 24) // message sizes from 16 bytes up to at most 2^24 bytes + { + var plaintext = new byte[dataSize]; + rnd.NextBytes(plaintext); + + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + 
RandomNumberGenerator.Fill(key); + + var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce); + + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20Core(chacha20); + + var ciphertext = new byte[plaintext.Length]; + cipher.Process(nonce, ciphertext, plaintext); + + var decrypted = new byte[plaintext.Length]; + cipher.Process(nonce, decrypted, ciphertext); + + decrypted.Should().Equal(plaintext); + dataSize += 5 * dataSize / 11; + } + } + + + [Fact] + public void ChaCha20ScalarBlockWhenNonceLengthIsEmptyFails() + { + // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var chacha20 = new ChaCha20(key, 0); + var core = new ChaCha20Core(chacha20); + + var nonce = new byte[0]; + var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + + // Act & Assert + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void ChaCha20ScalarBlockWhenNonceLengthIsInvalidFails() + { + // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var chacha20 = new ChaCha20(key, 0); + var core = new ChaCha20Core(chacha20); + var nonce = new byte[chacha20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; + var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + + // Act & Assert + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void ChaCha20ScalarBlockWhenLengthIsInvalidFails() + { + // Arrange: the nonce has the valid length so that only the empty block can cause the failure + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var chacha20 = new ChaCha20(key, 0); + var core = new ChaCha20Core(chacha20); + var nonce = new byte[chacha20.NonceSizeInBytes]; + var block = new byte[0]; + + // Act & Assert: a zero-length key stream block must be rejected + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void ChaCha20ScalarBlockTestVector() + { + // https://tools.ietf.org/html/rfc8439#section-2.3.2 + + // Arrange + var key = 
CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); + var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a:00:00:00:00".Replace(":", string.Empty)); + var counter = 1; + + // Act: the constructor's initial counter (1) is the same value as the counter passed to ProcessKeyStreamBlock + var chacha20 = new ChaCha20(key, 1); + var core = new ChaCha20Core(chacha20); + var output = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + core.ProcessKeyStreamBlock(nonce, counter, output); + + // Assert: the block, converted by ToUInt16Array, must equal these sixteen 32-bit words + var expected = new uint[16] + { + 0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3, + 0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3, + 0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9, + 0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2, + }; + + output.ToUInt16Array().Should().Equal(expected); + } + + [Fact] + public void ChaCha20ScalarTestVector() + { + // https://tools.ietf.org/html/rfc8439#section-2.4.2 + + // Arrange: one cipher per RFC 8439 vector, built from the vector's key and initial counter + foreach (var test in Rfc8439TestVector.Rfc8439TestVectors) + { + // Act: the vector's ciphertext is fed through Process + var chacha20 = new ChaCha20(test.Key, test.InitialCounter); + var cipher = new ChaCha20Core(chacha20); + + var output = new byte[test.CipherText.Length]; + cipher.Process(test.Nonce, output, test.CipherText); + + // Assert: the vector's plaintext is expected back + output.Should().Equal(test.PlainText); + } + } + + [Theory] + [InlineData(33)] + [InlineData(64)] + [InlineData(65)] + [InlineData(255)] + [InlineData(256)] + [InlineData(511)] + [InlineData(512)] + [InlineData(1023)] + [InlineData(1024)] + public void ScalarCreateVariableLengthCiphers(int size) + { + var input = new byte[size]; // all-zero input; the expected value is a prefix of LongKeyStream + var output = new byte[size]; + + var nonce = new byte[12]; + Array.Fill(nonce, (byte)2); + var key = new byte[32]; + Array.Fill(key, (byte)1); + + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20Core(chacha20); + cipher.Process(nonce, output, input); + var value = Convert.ToHexString(output); + + value.Should().Be(LongKeyStream[..(size*2)]); // two hex characters per output byte + } + + [Fact] + public void ChaCha20ScalarTestVectorTC8() + { + // TC8: key: 'All your 
base are belong to us!, IV: 'IETF2013' + // Test vector TC8 from RFC draft by J. Strombergson + // https://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-01 + + // Arrange + var key = new byte[32] + { + 0xC4, 0x6E, 0xC1, 0xB1, 0x8C, 0xE8, 0xA8, 0x78, + 0x72, 0x5A, 0x37, 0xE7, 0x80, 0xDF, 0xB7, 0x35, + 0x1F, 0x68, 0xED, 0x2E, 0x19, 0x4C, 0x79, 0xFB, + 0xC6, 0xAE, 0xBE, 0xE1, 0xA6, 0x67, 0x97, 0x5D + }; + + // The first 4 bytes are set to zero and a large counter + // is used; this makes the RFC 8439 version of ChaCha20 + // compatible with the original specification by D. J. Bernstein. + var nonce = new byte[12] { 0x00, 0x00, 0x00, 0x00, + 0x1A, 0xDA, 0x31, 0xD5, 0xCF, 0x68, 0x82, 0x21 + }; + + // Act + var chacha20 = new ChaCha20(key, 0); + var cipher = new ChaCha20Core(chacha20); + var block0 = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + var block1 = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + cipher.ProcessKeyStreamBlock(nonce, 0, block0); + cipher.ProcessKeyStreamBlock(nonce, 1, block1); + + // Assert + var expected = new byte[128] + { + 0xF6, 0x3A, 0x89, 0xB7, 0x5C, 0x22, 0x71, 0xF9, + 0x36, 0x88, 0x16, 0x54, 0x2B, 0xA5, 0x2F, 0x06, + 0xED, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2B, 0x00, + 0xB5, 0xE8, 0xF8, 0x0A, 0xE9, 0xA4, 0x73, 0xAF, + 0xC2, 0x5B, 0x21, 0x8F, 0x51, 0x9A, 0xF0, 0xFD, + 0xD4, 0x06, 0x36, 0x2E, 0x8D, 0x69, 0xDE, 0x7F, + 0x54, 0xC6, 0x04, 0xA6, 0xE0, 0x0F, 0x35, 0x3F, + 0x11, 0x0F, 0x77, 0x1B, 0xDC, 0xA8, 0xAB, 0x92, + + 0xE5, 0xFB, 0xC3, 0x4E, 0x60, 0xA1, 0xD9, 0xA9, + 0xDB, 0x17, 0x34, 0x5B, 0x0A, 0x40, 0x27, 0x36, + 0x85, 0x3B, 0xF9, 0x10, 0xB0, 0x60, 0xBD, 0xF1, + 0xF8, 0x97, 0xB6, 0x29, 0x0F, 0x01, 0xD1, 0x38, + 0xAE, 0x2C, 0x4C, 0x90, 0x22, 0x5B, 0xA9, 0xEA, + 0x14, 0xD5, 0x18, 0xF5, 0x59, 0x29, 0xDE, 0xA0, + 0x98, 0xCA, 0x7A, 0x6C, 0xCF, 0xE6, 0x12, 0x27, + 0x05, 0x3C, 0x84, 0xE4, 0x9A, 0x4A, 0x33, 0x32 + }; + + CryptoBytes.Combine(block0, block1).Should().Equal(expected); + } + + private const string LongKeyStream = 
"06E1F8D66AC5C75181F3E5ED9FA16AA909A1FB57A4A9B0110C84FCDC0D710880072A4342AF88DEC0138DAF141A3F471C01E77C1FDA90999496D601A36A8C0412E61CF22E8DA3E8DA712DE9F9D38BE4298CB36C0D83AA7DD314841BBDF59644DCD313F9F53B0E06B9D6CB3F0788CE2EE78993D9D27A3EDF0A52589CBB698519D583B68F72F3961AD77C1358394F29B08FE9F98A29F98311723013591E698557A04A73FB277E3E247083444A6C139ADE01BDE3C368C3A484D6824B33C024C0285CBD665D4F2E4DE87BF79565F08FE09766C16639279A243DAE8395F3E0E5D96E711B210355605A5A8E7B50CEA4BA25E4CB0E273488E223CD69FB699BD937A30D33488EF6076192E1ED08758F7F4774E4C0B8E70955D3CAFE790EB40F7725EB87B8BE6BBECDE1E140966973B5B05FDBFBE05C4BC599888693D96AC0C429B75591EF228A243A6EFDBEEEE49F09383AF2D4AFB6305DE60C5D195A44ED646B0CAFCEC5E445562FFFBB56D444C650E2D892FA99BCE78F2EBF866B154FDB110DDF8CAFB7BE4BEA46724B3952906F0C6E81BE7A17E3C95DF350BB970D2C97499924BDCC4EA0E1DE33AA4E62B5C1FC65FFD2728D81A79AE218AE1C639108323C3D22BA1B8C746CAB0CD535C8661CCA4B6B047790EF148A1B9A88CD3CDD8D79389E2F0D9AAAE135B361ED6778A6F6E03186651692F8DABEDF8872939F694C41E2CAD064FF4C537B92AFD0951DF77302749DCDBC9560FCE001DACAAFAA703BDA73007174C549B69EB031324E31BC9F60049E39254146AEB39BEE8A52CAEA1DD31C42346E44EBCC0771A2548D55ABD085323BA69625845F34831E7518F129CB1D80B76D3C94634F38A1226B5E212D917D593838F51D6CC35F87EB500030AB1446D87F6FFC4717B51C619DDAFD75DBA4C25A09C8C961CDA12A9E01203D678AD2ABB4B7D1BED7EBF0C2932DCE5F0C97F9488DD01A7891DC18D5EEFF6129B7942726A5B5110877260E2A78075C666F4410A2F8A2909D03DE0FBE2BFCA2B068B438ADAF767D804BA85278FB930945D15380281C215BC664B6627EE76CBBC8C5355E607721AAAC069B16B78C2F282795E7BF9B6509E7DC36FD2D45A227BF9D20C5E9678A040B63E964817F98B5F4828EB5D66740C595304D08A0A3C5A50EE3B3F99D2269992DD400A5B452A213DCD2579F7A193FC7FE33E498E91203DE19FF9D54BEBDE9E124A17E784430C38110FE3552861737DE1F2B7678F63417FE2224ED6571D43A8015F6F81362E7B95CB93C86735787F0980B0A3A65549844768EDF0DDEC75A24FA1EF5A26640932F65FF141CAEE2E14506A34E925C21BC268769CD95328675953E79B4B375912434834018ADD9C1832057EE4386C95B6E9407346B4A1582FB3C095E4B0882087DB48F081B5C0
DE69ADBC447A6BA2ED6A4F90909911CD3B51ECEC2C6BE6EFE"; + } +} diff --git a/test/NaCl.Core.SimdTests/NaCl.Core.SimdTests.csproj b/test/NaCl.Core.SimdTests/NaCl.Core.SimdTests.csproj new file mode 100644 index 0000000..43be8af --- /dev/null +++ b/test/NaCl.Core.SimdTests/NaCl.Core.SimdTests.csproj @@ -0,0 +1,58 @@ + + + + net6.0 + latest + true + + + + true + ../../Key.snk + + + + + Always + + + Always + + + Always + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + diff --git a/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs b/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs new file mode 100644 index 0000000..fb3ab93 --- /dev/null +++ b/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs @@ -0,0 +1,433 @@ +namespace NaCl.Core.SimdTests +{ + using System; + using System.Collections.Generic; + using System.IO; + using System.Net.Http; + using System.Security.Cryptography; + using System.Text; + + using FluentAssertions; + using Xunit; + using Xunit.Abstractions; + using Xunit.Categories; + + using Base; + using Internal; + using Vectors; + using System.Linq; + using NaCl.Core.Base.SalsaCore; + using NaCl.Core.SimdTests.Vectors; + + public class Salsa20IntrinsicsTests + { + private readonly ITestOutputHelper _output; + + public Salsa20IntrinsicsTests(ITestOutputHelper output) => _output = output; + + [Fact] + public void HSalsa20IntrinsicsTestVector1() + { + // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf + + // Arrange + var shared = new byte[32] + { + 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, + 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, + 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, + 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 + }; + var zero = new byte[32]; // all-zero input for HSalsa20; NOTE(review): 32 bytes here, while the other HSalsa20 tests in this class pass 16-byte inputs + var c = new byte[16] // SIGMA, ASCII "expand 32-byte k"; NOTE(review): never read in this test + { + 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x20, 0x33, + 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20, 0x6b + }; + + var salsa20 = new XSalsa20(shared); + var cipher = new Salsa20CoreIntrinsics(salsa20); + + // Act: HSalsa20 writes its 32-byte result into firstKey + var firstKey = new byte[32]; + cipher.HSalsa20(firstKey, zero); + + // Assert + firstKey.Should().Equal(new byte[] + { + 0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4, + 0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7, + 0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2, + 0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89 + }); + } + + [Fact] + public void HSalsa20IntrinsicsTestVector2() + { + // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf + + // Arrange: firstKey holds the same 32 bytes that HSalsa20IntrinsicsTestVector1 expects as its result + var firstKey = new byte[32] + { + 0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4, + 0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7, + 0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2, + 0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89 + }; + var noncePrefix = new byte[16] + { + 0x69, 0x69, 0x6e, 0xe9, 0x55, 0xb6, 0x2b, 0x73, + 0xcd, 0x62, 0xbd, 0xa8, 0x75, 0xfc, 0x73, 0xd6 + }; + var c = new byte[16] // SIGMA, ASCII "expand 32-byte k"; NOTE(review): never read in this test + { + 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x20, 0x33, + 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20, 0x6b + }; + + var salsa20 = new XSalsa20(firstKey); + var cipher = new Salsa20CoreIntrinsics(salsa20); + + // Act: HSalsa20 writes its 32-byte result into secondKey + var secondKey = new byte[32]; + cipher.HSalsa20(secondKey, noncePrefix); + + // Assert + secondKey.Should().Equal(new byte[] + { + 0xdc, 0x90, 0x8d, 0xda, 0x0b, 0x93, 0x44, 0xa9, + 0x53, 0x62, 0x9b, 0x73, 0x38, 0x20, 0x77, 0x88, + 0x80, 0xf3, 0xce, 0xb4, 0x21, 0xbb, 0x61, 0xb9, + 0x1c, 0xbd, 0x4c, 0x3e, 0x66, 0x25, 0x6c, 0xe4 + }); + } + + [Fact] + public void HSalsa20IntrinsicsTestVector3() + { + // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf + + // Arrange: 32-byte key k and 16-byte input n + var k = new byte[32] + { + 0xee, 0x30, 0x4f, 0xca, 0x27, 0x00, 0x8d, 0x8c, + 0x12, 0x6f, 0x90, 0x02, 0x79, 0x01, 0xd8, 0x0f, + 0x7f, 0x1d, 0x8b, 0x8d, 0xc9, 0x36, 0xcf, 0x3b, + 0x9f, 0x81, 0x96, 0x92, 0x82, 0x7e, 0x57, 0x77 + }; + var n = new byte[16] + { + 0x81, 0x91, 0x8e, 0xf2, 0xa5, 0xe0, 0xda, 0x9b, + 0x3e, 0x90, 0x60, 0x52, 0x1e, 0x4b, 0xb3, 0x52 + }; + + var salsa20 = new XSalsa20(k); + var cipher = new Salsa20CoreIntrinsics(salsa20); + + // Act: HSalsa20 writes its 32-byte result into output + var output = new byte[32]; + cipher.HSalsa20(output, n); + + // Assert + output.Should().Equal(new byte[] + { + 0xbc, 0x1b, 0x30, 0xfc, 0x07, 0x2c, 0xc1, 0x40, + 0x75, 0xe4, 0xba, 0xa7, 0x31, 0xb5, 0xa8, 0x45, + 0xea, 0x9b, 0x11, 0xe9, 0xa5, 0x19, 0x1f, 0x94, + 0xe1, 0x8c, 0xba, 0x8f, 0xd8, 0x21, 0xa7, 0xcd + }); + } + + [Fact] + public void IntrinsicsEncryptDecrypt1BlockTest() + { + // Arrange: random key and nonce, fixed 26-byte UTF-8 message + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key); + + var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce); + + var expected = Encoding.UTF8.GetBytes("This is a secret content!!"); + + var salsa20 = new Salsa20(key, 0); + var cipher = new Salsa20CoreIntrinsics(salsa20); + + // Act: Process is applied twice with the same nonce, first to the message and then to its own output + var ciphertext = new byte[expected.Length]; + cipher.Process(nonce, ciphertext, expected); + + var plaintext = new byte[expected.Length]; + cipher.Process(nonce, plaintext, ciphertext); + + // Assert: the second pass must give back the original bytes + plaintext.Should().Equal(expected); + } + + [Fact] + public void IntrinsicsEncryptDecryptNBlocksTest() + { + // Arrange: rnd only sizes and fills the messages; key and nonce are refilled by RandomNumberGenerator on each outer iteration + var rnd = new Random(); // NOTE(review): unseeded, so a failing message size cannot be replayed + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; + + for (var i = 0; i < 64; i++) + { + RandomNumberGenerator.Fill(key); + RandomNumberGenerator.Fill(nonce); + + var salsa20 = new Salsa20(key, 0); + var cipher = new Salsa20CoreIntrinsics(salsa20); + + for (var j = 0; j < 64; j++) + { + 
var expected = new byte[rnd.Next(300)]; // random length from 0 to 299 bytes + rnd.NextBytes(expected); + + var ciphertext = new byte[expected.Length]; + var plaintext = new byte[expected.Length]; + + // Act: run the message through Process, then run the result through Process again with the same nonce + cipher.Process(nonce, ciphertext, expected); + cipher.Process(nonce, plaintext, ciphertext); + + // Assert: the round trip must reproduce the message + plaintext.Should().Equal(expected); + } + } + } + + [Fact] + public void IntrinsicsEncryptDecryptLongMessagesTest() + { + var rnd = new Random(); // NOTE(review): unseeded, so failures are not reproducible + + var dataSize = 16; + while (dataSize <= 1 << 24) // message sizes from 16 bytes up to at most 2^24 bytes + { + var plaintext = new byte[dataSize]; + rnd.NextBytes(plaintext); + + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key); + + var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce); + + var salsa20 = new Salsa20(key, 0); + var cipher = new Salsa20CoreIntrinsics(salsa20); + + var ciphertext = new byte[plaintext.Length]; + cipher.Process(nonce, ciphertext, plaintext); + + var decrypted = new byte[plaintext.Length]; + cipher.Process(nonce, decrypted, ciphertext); + + decrypted.Should().Equal(plaintext); + dataSize += 5 * dataSize / 11; // grow by about 45% per iteration (integer division) + } + } + + [Fact] + public void Salsa20IntrinsicsBlockWhenNonceLengthIsEmptyFails() + { + // Arrange: correctly sized block, zero-length nonce + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var salsa20 = new Salsa20(key, 0); + var nonce = new byte[0]; + var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + var core = new Salsa20CoreIntrinsics(salsa20); + + // Act & Assert: the call is expected to throw + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void Salsa20IntrinsicsBlockWhenNonceLengthIsInvalidFails() + { + // Arrange: correctly sized block, nonce length offset from the cipher's NonceSizeInBytes by TestHelpers.ReturnRandomPositiveNegative() + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var salsa20 = new Salsa20(key, 0); + var nonce = new byte[salsa20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; + var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + var core = new Salsa20CoreIntrinsics(salsa20); + + // Act & Assert + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + 
act.Should().Throw(); + } + + [Fact] + public void Salsa20IntrinsicsBlockWhenLengthIsInvalidFails() + { + // Arrange: the nonce has the valid length so that only the empty block can cause the failure + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var salsa20 = new Salsa20(key, 0); + var nonce = new byte[salsa20.NonceSizeInBytes]; + var block = new byte[0]; + var core = new Salsa20CoreIntrinsics(salsa20); + + // Act & Assert: a zero-length key stream block must be rejected + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void Salsa20IntrinsicsBlockTestVector() + { + // Arrange + var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); + var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a".Replace(":", string.Empty)); + var counter = 1; + + // Act + var salsa20 = new Salsa20(key, 1); + var output = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + var core = new Salsa20CoreIntrinsics(salsa20); + core.ProcessKeyStreamBlock(nonce, counter, output); + + // Assert + var expected = new uint[16] + { + 3649387971u, 3432934094u, 2867581180u, 544842727u, + 3442094382u, 3233001746u, 2484653980u, 586338650u, + 3037335121u, 3388889956u, 1351682463u, 2284954070u, + 3021171268u, 2617586057u, 3288245149u, 2763695160u }; + + output.ToUInt16Array().Should().Equal(expected); + } + + public static IEnumerable Salsa20TestData => ParseTestVectors(GetTestVector()).Select(d => new object[] { d }); + + [Theory] + [MemberData(nameof(Salsa20TestData))] + public void Salsa20IntrinsicsProcessTestVectors(Salsa20TestVector test) + { + _output.WriteLine($"Salsa20 - {test.Name}"); + + var input = new byte[512]; + var output = new byte[512]; + + var cipher = new Salsa20(test.Key, 0); + var core = new Salsa20CoreIntrinsics(cipher); + core.Process(test.IV, output, input); + + ToBlock1(output).Should().Be(test.ExpectedBlock1); + ToBlock4(output).Should().Be(test.ExpectedBlock4); + 
ToBlock5(output).Should().Be(test.ExpectedBlock5); + ToBlock8(output).Should().Be(test.ExpectedBlock8); + } + + [Theory] + [InlineData(33)] + [InlineData(64)] + [InlineData(65)] + [InlineData(255)] + [InlineData(256)] + [InlineData(511)] + [InlineData(512)] + [InlineData(1023)] + [InlineData(1024)] + public void IntrinsicsCreateVariableLengthCiphers(int size) + { + var input = new byte[size]; // all-zero input; the expected value is a prefix of LongKeyStream + var output = new byte[size]; + + var nonce = new byte[8]; + Array.Fill(nonce, (byte)2); + + var key = new byte[32]; + Array.Fill(key, (byte)1); + + var cipher = new Salsa20CoreIntrinsics(new Salsa20(key, 0)); // drive the intrinsics core directly, as the other tests in this class do + cipher.Process(nonce, output, input); + var value = Convert.ToHexString(output); + + value.Should().Be(LongKeyStream[..(size*2)]); // two hex characters per output byte + } + + private static string GetTestVector() + { + try + { + using var client = new HttpClient(); + return client.GetStringAsync("https://github.com/das-labor/legacy/raw/master/microcontroller-2/arm-crypto-lib/testvectors/salsa20-256.64-verified.test-vectors").Result; + } + catch (Exception) // best effort: any download failure falls back to the local copy + { + return File.ReadAllText(Path.Combine("Vectors", "salsa20-256.64-verified.test-vectors")); // Path.Combine instead of a hard-coded '\' so the fallback also resolves on non-Windows hosts + } + } + + private static IList ParseTestVectors(string raw) + { + var lines = raw.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None); // CRLF counts as one break, so the fixed line offsets used below stay aligned + + var result = new List(); + + string ReadValue(string toFind, int idx, int len) + { + var toFindIdx = lines[idx].IndexOf(toFind, StringComparison.Ordinal) + toFind.Length; + return lines[idx].Substring(toFindIdx, len); + } + + for (var i = 0; i + 19 < lines.Length; i++) // an entry is read from lines i..i+19; stop before one that would run past the end + { + if (!lines[i].StartsWith("Set ", StringComparison.Ordinal)) + continue; + + // We skip Set 6 vector tests for now... 
+ if (!lines[i + 8].Contains("stream[192..255] = ")) + continue; // the line eight below the "Set" header is not the stream[192..255] row, so this entry is skipped + + var name = lines[i].Replace(":", ""); + + var key = ReadValue("key = ", i + 1, 32); // first 32 characters of the key from the labelled line + key += lines[i + 2].Trim(); // the rest of the key is the whole next line + + var iv = ReadValue("IV = ", i + 3, 16); + + var block1 = ReadValue("stream[0..63] = ", i + 4, 32); // each stream block: 32 characters on the labelled line plus three continuation lines + block1 += lines[i + 5].Trim(); + block1 += lines[i + 6].Trim(); + block1 += lines[i + 7].Trim(); + + var block4 = ReadValue("stream[192..255] = ", i + 8, 32); + block4 += lines[i + 9].Trim(); + block4 += lines[i + 10].Trim(); + block4 += lines[i + 11].Trim(); + + var block5 = ReadValue("stream[256..319] = ", i + 12, 32); + block5 += lines[i + 13].Trim(); + block5 += lines[i + 14].Trim(); + block5 += lines[i + 15].Trim(); + + var block8 = ReadValue("stream[448..511] = ", i + 16, 32); + block8 += lines[i + 17].Trim(); + block8 += lines[i + 18].Trim(); + block8 += lines[i + 19].Trim(); + + result.Add(new Salsa20TestVector(name, key, iv, block1, block4, block5, block8)); + i += 20; // jump over the lines just consumed; the loop's own i++ then advances one further + } + + return result; + } + + private static string ToBlock1(byte[] output) => CryptoBytes.ToHexStringUpper(output[0..64]); // upper-case hex of output bytes 0..63 + + private static string ToBlock4(byte[] output) => CryptoBytes.ToHexStringUpper(output[192..256]); // upper-case hex of output bytes 192..255 + + private static string ToBlock5(byte[] output) => CryptoBytes.ToHexStringUpper(output[256..320]); // upper-case hex of output bytes 256..319 + + private static string ToBlock8(byte[] output) => CryptoBytes.ToHexStringUpper(output[448..512]); // upper-case hex of output bytes 448..511 + + private const string LongKeyStream = 
"A3D1F8292CAB0B2096AB2AA26FC59AAF3EE159B39FC6029EF160D82EC80FA110FF958AB802861180EC006F8C8450030024A2D7744BF564C1782F15DB6681144C65A730622A14AE9A4E95F753289A6D2DBBEE47B457B57DB75C009B287BF240EBE02890581E3628BDBCC9B79E93500CA15F6E10D4EBCAAFC2FB936AF2EC05BBCB1610036E840621D7CE53E4A06822D6073EA0FA8943EDFB70E45B4D2525AE4B616BD08B33F23A7E0B6CD501E80B8E80B7423E7C9D5D900AE2194AF0CF4A74D721534063D3F17BC7993B5B3EC20A373F933B43CEB6987934C1456521F098BA0CB1205109F534F80D4EA1767EA9DFC08BED97BE40C539DD37EC24EAE0C68AC1B56DD0189747A4B8278B1E0E5206EAE893C0E45C76751002F38924B8C9A036CFAB9E3D44C1E323BCE43F2C69EB8212994803C1D2AC00C3B8F97DA6D09F29B974E0DF4D6D36C9D2E88C2D7B73AB399C0920A2996A4727272339D991C6BF45CE63C2DEF3FC9C2625F87EA6268C196829BB1F7E659736AF4B0CC2A771FB0962B19005E53DD880879C052556312BA353B51C26D5F5949464EAECE15ACA240E339BF3C581E7D93D220B1C3C0DE87F65B4F340DAB924EB72072211C41B18770230A3A123619006BE5FD4ABAAFD2BFAD0F34D5FB491DEBEBF5CA9EC92D997B5A171482CC6E949C70759A0B8EC64D590B6FFF6500E8425C3AE4178C2EDE996C0003F6FA76A6D90F49D6D3D128C0DE82EA8C7C16415DDD07081940701677C32D5B5E3BB57A93315474C5B648D31AA7AE52FCD63BF22550900077FF5CF6A5F5148B285E34A57A3DA1BEB0662A20C23857CA8D5D1748F654F54F42F30CD413F408A0C7B31F57AD59E9F152DBDEEA3EA9C3DBB3517615735CFF0226E179C4A9149C6477A2903B338AE308300A86D91043E2AA437C5F2A77A49B547B05BD98CEBE49500FF367CE204157BB3EFD182A8A96FCC31025D4C948105F6762F22357446367B87A01FA3F954D52810CBE5C4EEB04C3AE827973E481F3C38EF14A6F0FE3FB2D89969D2CCB0DFB63D7366D91F29DDBF1EB90B136191745B8AC8B8F0AAEF4D3A1C763D63AED1E76CC7B920979CB8163C413273CA1A563C37B925A0251C9AD31363F978437D92437A0D250C7F221C00F2E13CF371554DF191ECDDB46C95659739A1CDC257A067D9251FE89EA328D313C4D7EF8E33614FFC4C615D3195CD6282D82633067C81E1F563DA307B14253CBF0492256A409E3007EB6A4A7BDA694E1FFA9B5106AB9868CC359B976441C7B362C03E501D8B3FBEF98771A41C4DA542DB8DA4761EA3792695288437DEAC50E7B6A62E6D00B7511A5DB0E567090ADDDFCF0521F6DD62F969D5BE89378DB127219C38931A0AEDBCE784C35D4215B09B1F96732615813753B67846E9505DF974F4B
1ECDFBD0C850A9644D720884B80B4FE4CC08508A8A65D1C5F"; + } +} \ No newline at end of file diff --git a/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs b/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs new file mode 100644 index 0000000..56fc03f --- /dev/null +++ b/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs @@ -0,0 +1,432 @@ +namespace NaCl.Core.SimdTests +{ + using System; + using System.Collections.Generic; + using System.IO; + using System.Net.Http; + using System.Security.Cryptography; + using System.Text; + + using FluentAssertions; + using Xunit; + using Xunit.Abstractions; + + using Base; + using Internal; + using Vectors; + using System.Linq; + using NaCl.Core.Base.SalsaCore; + using NaCl.Core.SimdTests.Vectors; + + public class Salsa20ScalarTests + { + private readonly ITestOutputHelper _output; + + public Salsa20ScalarTests(ITestOutputHelper output) => _output = output; + + [Fact] + public void HSalsa20ScalarTestVector1() + { + // 8. Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf + + // Arrange + var shared = new byte[32] + { + 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, + 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, + 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, + 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 + }; + var zero = new byte[32]; + var c = new byte[16] // SIGMA + { + 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x20, 0x33, + 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20, 0x6b + }; + + var salsa20 = new XSalsa20(shared); + var cipher = new Salsa20Core(salsa20); + + // Act + var firstKey = new byte[32]; + cipher.HSalsa20(firstKey, zero); + + // Assert + firstKey.Should().Equal(new byte[] + { + 0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4, + 0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7, + 0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2, + 0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89 + }); + } + + [Fact] + public void HSalsa20ScalarTestVector2() + { + // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf + + // Arrange + var firstKey = new byte[32] + { + 0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4, + 0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7, + 0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2, + 0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89 + }; + var noncePrefix = new byte[16] + { + 0x69, 0x69, 0x6e, 0xe9, 0x55, 0xb6, 0x2b, 0x73, + 0xcd, 0x62, 0xbd, 0xa8, 0x75, 0xfc, 0x73, 0xd6 + }; + var c = new byte[16] // SIGMA + { + 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x20, 0x33, + 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20, 0x6b + }; + + var salsa20 = new XSalsa20(firstKey); + var cipher = new Salsa20Core(salsa20); + + // Act + var secondKey = new byte[32]; + cipher.HSalsa20(secondKey, noncePrefix); + + // Assert + secondKey.Should().Equal(new byte[] + { + 0xdc, 0x90, 0x8d, 0xda, 0x0b, 0x93, 0x44, 0xa9, + 0x53, 0x62, 0x9b, 0x73, 0x38, 0x20, 0x77, 0x88, + 0x80, 0xf3, 0xce, 0xb4, 0x21, 0xbb, 0x61, 0xb9, + 0x1c, 0xbd, 0x4c, 0x3e, 0x66, 0x25, 0x6c, 0xe4 + }); + } + + [Fact] + public void HSalsa20ScalarTestVector3() + { + // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf + + // Arrange + var k = new byte[32] + { + 0xee, 0x30, 0x4f, 0xca, 0x27, 0x00, 0x8d, 0x8c, + 0x12, 0x6f, 0x90, 0x02, 0x79, 0x01, 0xd8, 0x0f, + 0x7f, 0x1d, 0x8b, 0x8d, 0xc9, 0x36, 0xcf, 0x3b, + 0x9f, 0x81, 0x96, 0x92, 0x82, 0x7e, 0x57, 0x77 + }; + var n = new byte[16] + { + 0x81, 0x91, 0x8e, 0xf2, 0xa5, 0xe0, 0xda, 0x9b, + 0x3e, 0x90, 0x60, 0x52, 0x1e, 0x4b, 0xb3, 0x52 + }; + + var salsa20 = new XSalsa20(k); + var cipher = new Salsa20Core(salsa20); + + // Act + var output = new byte[32]; + cipher.HSalsa20(output, n); + + // Assert + output.Should().Equal(new byte[] + { + 0xbc, 0x1b, 0x30, 0xfc, 0x07, 0x2c, 0xc1, 0x40, + 0x75, 0xe4, 0xba, 0xa7, 0x31, 0xb5, 0xa8, 0x45, + 0xea, 0x9b, 0x11, 0xe9, 0xa5, 0x19, 0x1f, 0x94, + 0xe1, 0x8c, 0xba, 0x8f, 0xd8, 0x21, 0xa7, 0xcd + }); + } + + [Fact] + public void ScalarEncryptDecrypt1BlockTest() + { + // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key); + + var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce); + + var expected = Encoding.UTF8.GetBytes("This is a secret content!!"); + + var salsa20 = new Salsa20(key, 0); + var cipher = new Salsa20Core(salsa20); + + // Act + var ciphertext = new byte[expected.Length]; + cipher.Process(nonce, ciphertext, expected); + + var plaintext = new byte[expected.Length]; + cipher.Process(nonce, plaintext, ciphertext); + + // Assert + plaintext.Should().Equal(expected); + } + + [Fact] + public void ScalarEncryptDecryptNBlocksTest() + { + // Arrange + var rnd = new Random(); + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; + + for (var i = 0; i < 64; i++) + { + RandomNumberGenerator.Fill(key); + RandomNumberGenerator.Fill(nonce); + + var salsa20 = new Salsa20(key, 0); + var cipher = new Salsa20Core(salsa20); + + for (var j = 0; j < 64; j++) + { + var expected = new byte[rnd.Next(300)]; 
+ rnd.NextBytes(expected); + + var ciphertext = new byte[expected.Length]; + var plaintext = new byte[expected.Length]; + + // Act + cipher.Process(nonce, ciphertext, expected); + cipher.Process(nonce, plaintext, ciphertext); + + // Assert + plaintext.Should().Equal(expected); + } + } + } + + [Fact] + public void ScalarEncryptDecryptLongMessagesTest() + { + var rnd = new Random(); + + var dataSize = 16; + while (dataSize <= 1 << 24) + { + var plaintext = new byte[dataSize]; + rnd.NextBytes(plaintext); + + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key); + + var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce); + + var salsa20 = new Salsa20(key, 0); + var cipher = new Salsa20Core(salsa20); + + var ciphertext = new byte[plaintext.Length]; + cipher.Process(nonce, ciphertext, plaintext); + + var decrypted = new byte[plaintext.Length]; + cipher.Process(nonce, decrypted, ciphertext); + + decrypted.Should().Equal(plaintext); + dataSize += 5 * dataSize / 11; + } + } + + [Fact] + public void Salsa20ScalarBlockWhenNonceLengthIsEmptyFails() + { + // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var salsa20 = new Salsa20(key, 0); + var nonce = new byte[0]; + var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + var core = new Salsa20Core(salsa20); + + // Act & Assert + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void Salsa20ScalarBlockWhenNonceLengthIsInvalidFails() + { + // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var salsa20 = new Salsa20(key, 0); + var nonce = new byte[salsa20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; + var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + var core = new Salsa20Core(salsa20); + + // Act & Assert + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void Salsa20ScalarBlockWhenLengthIsInvalidFails() + { 
+ // Arrange + var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + + var salsa20 = new Salsa20(key, 0); + var nonce = new byte[salsa20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; + var block = new byte[0]; + var core = new Salsa20Core(salsa20); + + // Act & Assert + var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); + act.Should().Throw(); + } + + [Fact] + public void Salsa20ScalarBlockTestVector() + { + // Arrange + var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); + var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a".Replace(":", string.Empty)); + var counter = 1; + + // Act + var salsa20 = new Salsa20(key, 1); + var output = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + var core = new Salsa20Core(salsa20); + core.ProcessKeyStreamBlock(nonce, counter, output); + + // Assert + var expected = new uint[16] + { + 3649387971u, 3432934094u, 2867581180u, 544842727u, + 3442094382u, 3233001746u, 2484653980u, 586338650u, + 3037335121u, 3388889956u, 1351682463u, 2284954070u, + 3021171268u, 2617586057u, 3288245149u, 2763695160u }; + + output.ToUInt16Array().Should().Equal(expected); + } + + public static IEnumerable Salsa20TestData => ParseTestVectors(GetTestVector()).Select(d => new object[] { d }); + + [Theory] + [MemberData(nameof(Salsa20TestData))] + public void Salsa20ScalarProcessTestVectors(Salsa20TestVector test) + { + _output.WriteLine($"Salsa20 - {test.Name}"); + + var input = new byte[512]; + var output = new byte[512]; + + var cipher = new Salsa20(test.Key, 0); + var core = new Salsa20Core(cipher); + core.Process(test.IV, output, input); + + ToBlock1(output).Should().Be(test.ExpectedBlock1); + ToBlock4(output).Should().Be(test.ExpectedBlock4); + ToBlock5(output).Should().Be(test.ExpectedBlock5); + ToBlock8(output).Should().Be(test.ExpectedBlock8); + } + + [Theory] + [InlineData(33)] + [InlineData(64)] + 
[InlineData(65)] + [InlineData(255)] + [InlineData(256)] + [InlineData(511)] + [InlineData(512)] + [InlineData(1023)] + [InlineData(1024)] + public void ScalarCreateVariableLengthCiphers(int size) + { + var input = new byte[size]; + var output = new byte[size]; + + var nonce = new byte[8]; + Array.Fill(nonce, (byte)2); + + var key = new byte[32]; + Array.Fill(key, (byte)1); + + var cipher = new Salsa20(key, 0); + cipher.Encrypt(input, nonce, output); + var value = Convert.ToHexString(output); + + value.Should().Be(LongKeyStream[..(size*2)]); + } + + private static string GetTestVector() + { + try + { + using var client = new HttpClient(); + return client.GetStringAsync("https://github.com/das-labor/legacy/raw/master/microcontroller-2/arm-crypto-lib/testvectors/salsa20-256.64-verified.test-vectors").Result; + } + catch (Exception) + { + return File.ReadAllText(@"Vectors\salsa20-256.64-verified.test-vectors"); + } + } + + private static IList ParseTestVectors(string raw) + { + var lines = raw.Split(new[] { '\r', '\n' }); + + var result = new List(); + + string ReadValue(string toFind, int idx, int len) + { + var toFindIdx = lines[idx].IndexOf(toFind, StringComparison.Ordinal) + toFind.Length; + return lines[idx].Substring(toFindIdx, len); + } + + for (var i = 0; i < lines.Length; i++) + { + if (!lines[i].StartsWith("Set ")) + continue; + + // We skip Set 6 vector tests for now... 
+ if (!lines[i + 8].Contains("stream[192..255] = ")) + continue; + + var name = lines[i].Replace(":", ""); + + var key = ReadValue("key = ", i + 1, 32); + key += lines[i + 2].Trim(); + + var iv = ReadValue("IV = ", i + 3, 16); + + var block1 = ReadValue("stream[0..63] = ", i + 4, 32); + block1 += lines[i + 5].Trim(); + block1 += lines[i + 6].Trim(); + block1 += lines[i + 7].Trim(); + + var block4 = ReadValue("stream[192..255] = ", i + 8, 32); + block4 += lines[i + 9].Trim(); + block4 += lines[i + 10].Trim(); + block4 += lines[i + 11].Trim(); + + var block5 = ReadValue("stream[256..319] = ", i + 12, 32); + block5 += lines[i + 13].Trim(); + block5 += lines[i + 14].Trim(); + block5 += lines[i + 15].Trim(); + + var block8 = ReadValue("stream[448..511] = ", i + 16, 32); + block8 += lines[i + 17].Trim(); + block8 += lines[i + 18].Trim(); + block8 += lines[i + 19].Trim(); + + result.Add(new Salsa20TestVector(name, key, iv, block1, block4, block5, block8)); + i += 20; + } + + return result; + } + + private static string ToBlock1(byte[] output) => CryptoBytes.ToHexStringUpper(output[0..64]); + + private static string ToBlock4(byte[] output) => CryptoBytes.ToHexStringUpper(output[192..256]); + + private static string ToBlock5(byte[] output) => CryptoBytes.ToHexStringUpper(output[256..320]); + + private static string ToBlock8(byte[] output) => CryptoBytes.ToHexStringUpper(output[448..512]); + + private const string LongKeyStream = 
"A3D1F8292CAB0B2096AB2AA26FC59AAF3EE159B39FC6029EF160D82EC80FA110FF958AB802861180EC006F8C8450030024A2D7744BF564C1782F15DB6681144C65A730622A14AE9A4E95F753289A6D2DBBEE47B457B57DB75C009B287BF240EBE02890581E3628BDBCC9B79E93500CA15F6E10D4EBCAAFC2FB936AF2EC05BBCB1610036E840621D7CE53E4A06822D6073EA0FA8943EDFB70E45B4D2525AE4B616BD08B33F23A7E0B6CD501E80B8E80B7423E7C9D5D900AE2194AF0CF4A74D721534063D3F17BC7993B5B3EC20A373F933B43CEB6987934C1456521F098BA0CB1205109F534F80D4EA1767EA9DFC08BED97BE40C539DD37EC24EAE0C68AC1B56DD0189747A4B8278B1E0E5206EAE893C0E45C76751002F38924B8C9A036CFAB9E3D44C1E323BCE43F2C69EB8212994803C1D2AC00C3B8F97DA6D09F29B974E0DF4D6D36C9D2E88C2D7B73AB399C0920A2996A4727272339D991C6BF45CE63C2DEF3FC9C2625F87EA6268C196829BB1F7E659736AF4B0CC2A771FB0962B19005E53DD880879C052556312BA353B51C26D5F5949464EAECE15ACA240E339BF3C581E7D93D220B1C3C0DE87F65B4F340DAB924EB72072211C41B18770230A3A123619006BE5FD4ABAAFD2BFAD0F34D5FB491DEBEBF5CA9EC92D997B5A171482CC6E949C70759A0B8EC64D590B6FFF6500E8425C3AE4178C2EDE996C0003F6FA76A6D90F49D6D3D128C0DE82EA8C7C16415DDD07081940701677C32D5B5E3BB57A93315474C5B648D31AA7AE52FCD63BF22550900077FF5CF6A5F5148B285E34A57A3DA1BEB0662A20C23857CA8D5D1748F654F54F42F30CD413F408A0C7B31F57AD59E9F152DBDEEA3EA9C3DBB3517615735CFF0226E179C4A9149C6477A2903B338AE308300A86D91043E2AA437C5F2A77A49B547B05BD98CEBE49500FF367CE204157BB3EFD182A8A96FCC31025D4C948105F6762F22357446367B87A01FA3F954D52810CBE5C4EEB04C3AE827973E481F3C38EF14A6F0FE3FB2D89969D2CCB0DFB63D7366D91F29DDBF1EB90B136191745B8AC8B8F0AAEF4D3A1C763D63AED1E76CC7B920979CB8163C413273CA1A563C37B925A0251C9AD31363F978437D92437A0D250C7F221C00F2E13CF371554DF191ECDDB46C95659739A1CDC257A067D9251FE89EA328D313C4D7EF8E33614FFC4C615D3195CD6282D82633067C81E1F563DA307B14253CBF0492256A409E3007EB6A4A7BDA694E1FFA9B5106AB9868CC359B976441C7B362C03E501D8B3FBEF98771A41C4DA542DB8DA4761EA3792695288437DEAC50E7B6A62E6D00B7511A5DB0E567090ADDDFCF0521F6DD62F969D5BE89378DB127219C38931A0AEDBCE784C35D4215B09B1F96732615813753B67846E9505DF974F4B
1ECDFBD0C850A9644D720884B80B4FE4CC08508A8A65D1C5F"; + } +} \ No newline at end of file diff --git a/test/NaCl.Core.SimdTests/TestHelpers.cs b/test/NaCl.Core.SimdTests/TestHelpers.cs new file mode 100644 index 0000000..95a81eb --- /dev/null +++ b/test/NaCl.Core.SimdTests/TestHelpers.cs @@ -0,0 +1,91 @@ +namespace NaCl.Core.SimdTests +{ + using System; + using System.Collections.Generic; + using System.Linq; + + using FluentAssertions; + using NaCl.Core.SimdTests; + + public static class TestHelpers + { + private static readonly Random _random = new Random(); + private static readonly object _sync = new object(); + + private static int Random(int min, int max) + { + lock (_sync) + { + return _random.Next(min, max); + } + } + + public static void AssertEqualBytes(byte[] expected, byte[] actual) + => BitConverter.ToString(actual).Should().Be(BitConverter.ToString(expected)); + + public static ArraySegment Pad(this byte[] array) + => array.Pad(Random(1, 100), Random(0, 50)); + + private static ArraySegment Pad(this byte[] array, int paddingLeft, int paddingRight) + { + byte padByte = 0xE7; + if (array.Length > 0) + { + if (array[0] == padByte) + padByte++; + if (array[array.Length - 1] == padByte) + padByte++; + } + var resultBytes = Enumerable.Repeat(padByte, paddingLeft).Concat(array).Concat(Enumerable.Repeat(padByte, paddingRight)).ToArray(); + return new ArraySegment(resultBytes, paddingLeft, array.Length); + } + + public static byte[] UnPad(this ArraySegment paddedData) + { + var padByte = paddedData.Array[0]; + if (padByte < 0xE7 || padByte > 0xE9) + throw new ArgumentException("Padding invalid"); + + for (var i = 0; i < paddedData.Offset; i++) + { + if (paddedData.Array[i] != padByte) + throw new ArgumentException("Padding invalid"); + } + for (var i = paddedData.Offset + paddedData.Count; i < paddedData.Array.Length; i++) + { + if (paddedData.Array[i] != padByte) + throw new ArgumentException("Padding invalid"); + } + return paddedData.ToArray(); + } + + public 
static int ReturnRandomPositiveNegative() => Random(0, 2) * 2 - 1; + + public static IEnumerable WithChangedBit(this byte[] array) + { + for (var i = 0; i < array.Length; i++) + { + for (var bit = 0; bit < 8; bit++) + { + var result = array.ToArray(); + result[i] ^= (byte)(1 << bit); + yield return result; + } + } + } + + private static byte[] ToArray(this ArraySegment segment) + { + var result = new byte[segment.Count]; + Array.Copy(segment.Array, segment.Offset, result, 0, segment.Count); + return result; + } + + public static uint[] ToUInt16Array(this byte[] source) + { + var decoded = new uint[source.Length / 4]; + Buffer.BlockCopy(source, 0, decoded, 0, source.Length); + return decoded; + } + } +} diff --git a/test/NaCl.Core.SimdTests/Vectors/HChaCha20TestVector.cs b/test/NaCl.Core.SimdTests/Vectors/HChaCha20TestVector.cs new file mode 100644 index 0000000..6d73b61 --- /dev/null +++ b/test/NaCl.Core.SimdTests/Vectors/HChaCha20TestVector.cs @@ -0,0 +1,67 @@ +namespace NaCl.Core.SimdTests.Vectors +{ + using Internal; + + public class HChaCha20TestVector + { + public byte[] Key { get; private set; } + public byte[] Input { get; private set; } + public byte[] Output { get; private set; } + + public HChaCha20TestVector(string key, string input, string output) + { + Key = CryptoBytes.FromHexString(key); + Input = CryptoBytes.FromHexString(input); + Output = CryptoBytes.FromHexString(output); + } + + public static HChaCha20TestVector[] HChaCha20TestVectors = { + // From libsodium's test/default/xchacha20.c (tv_hchacha20). 
+ new HChaCha20TestVector( + "24f11cce8a1b3d61e441561a696c1c1b7e173d084fd4812425435a8896a013dc", + "d9660c5900ae19ddad28d6e06e45fe5e", + "5966b3eec3bff1189f831f06afe4d4e3be97fa9235ec8c20d08acfbbb4e851e3"), + new HChaCha20TestVector( + "80a5f6272031e18bb9bcd84f3385da65e7731b7039f13f5e3d475364cd4d42f7", + "c0eccc384b44c88e92c57eb2d5ca4dfa", + "6ed11741f724009a640a44fce7320954c46e18e0d7ae063bdbc8d7cf372709df"), + new HChaCha20TestVector( + "cb1fc686c0eec11a89438b6f4013bf110e7171dace3297f3a657a309b3199629", + "fcd49b93e5f8f299227e64d40dc864a3", + "84b7e96937a1a0a406bb7162eeaad34308d49de60fd2f7ec9dc6a79cbab2ca34"), + new HChaCha20TestVector( + "6640f4d80af5496ca1bc2cfff1fefbe99638dbceaabd7d0ade118999d45f053d", + "31f59ceeeafdbfe8cae7914caeba90d6", + "9af4697d2f5574a44834a2c2ae1a0505af9f5d869dbe381a994a18eb374c36a0"), + new HChaCha20TestVector( + "0693ff36d971225a44ac92c092c60b399e672e4cc5aafd5e31426f123787ac27", + "3a6293da061da405db45be1731d5fc4d", + "f87b38609142c01095bfc425573bb3c698f9ae866b7e4216840b9c4caf3b0865"), + new HChaCha20TestVector( + "809539bd2639a23bf83578700f055f313561c7785a4a19fc9114086915eee551", + "780c65d6a3318e479c02141d3f0b3918", + "902ea8ce4680c09395ce71874d242f84274243a156938aaa2dd37ac5be382b42"), + new HChaCha20TestVector( + "1a170ddf25a4fd69b648926e6d794e73408805835c64b2c70efddd8cd1c56ce0", + "05dbee10de87eb0c5acb2b66ebbe67d3", + "a4e20b634c77d7db908d387b48ec2b370059db916e8ea7716dc07238532d5981"), + new HChaCha20TestVector( + "3b354e4bb69b5b4a1126f509e84cad49f18c9f5f29f0be0c821316a6986e15a6", + "d8a89af02f4b8b2901d8321796388b6c", + "9816cb1a5b61993735a4b161b51ed2265b696e7ded5309c229a5a99f53534fbc"), + new HChaCha20TestVector( + "4b9a818892e15a530db50dd2832e95ee192e5ed6afffb408bd624a0c4e12a081", + "a9079c551de70501be0286d1bc78b045", + "ebc5224cf41ea97473683b6c2f38a084bf6e1feaaeff62676db59d5b719d999b"), + new HChaCha20TestVector( + "c49758f00003714c38f1d4972bde57ee8271f543b91e07ebce56b554eb7fa6a7", + "31f0204e10cf4f2035f9e62bb5ba7303", + 
"0dd8cc400f702d2c06ed920be52048a287076b86480ae273c6d568a2e9e7518c"), + // From https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2.1. + new HChaCha20TestVector( + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "000000090000004a0000000031415927", + "82413b4227b27bfed30e42508a877d73a0f9e4d58a74a853c12ec41326d3ecdc") + }; + } +} diff --git a/test/NaCl.Core.SimdTests/Vectors/Rfc8439TestVector.cs b/test/NaCl.Core.SimdTests/Vectors/Rfc8439TestVector.cs new file mode 100644 index 0000000..8bdc0d2 --- /dev/null +++ b/test/NaCl.Core.SimdTests/Vectors/Rfc8439TestVector.cs @@ -0,0 +1,160 @@ +namespace NaCl.Core.SimdTests.Vectors +{ + using Internal; + + public class Rfc8439TestVector + { + public byte[] Key { get; private set; } + public byte[] PlainText { get; private set; } + public byte[] Nonce { get; private set; } + public byte[] CipherText { get; private set; } + public int InitialCounter { get; private set; } + public byte[] Aad { get; private set; } + public byte[] Tag { get; private set; } + + public string Id { get; private set; } // used to identify the benchmark test + + public Rfc8439TestVector(string key, string plaintext, string nonce, string ciphertext, int initialCounter, string id) + { + Key = CryptoBytes.FromHexString(key); + PlainText = CryptoBytes.FromHexString(plaintext); + Nonce = CryptoBytes.FromHexString(nonce); + CipherText = CryptoBytes.FromHexString(ciphertext); + InitialCounter = initialCounter; + Id = id; + } + + public Rfc8439TestVector(string plaintext, string aad, string key, string nonce, string ciphertext, string tag, string id) + { + PlainText = CryptoBytes.FromHexString(plaintext); + Aad = CryptoBytes.FromHexString(aad); + Key = CryptoBytes.FromHexString(key); + Nonce = CryptoBytes.FromHexString(nonce); + CipherText = CryptoBytes.FromHexString(ciphertext); + Tag = CryptoBytes.FromHexString(tag); + Id = id; + } + + public override string ToString() => Id; + + public static Rfc8439TestVector[] 
Rfc8439TestVectors = + { + // Tests against the test vectors in Section 2.3.2 of RFC 8439. + // https://tools.ietf.org/html/rfc8439#section-2.3.2 + new Rfc8439TestVector( + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "4c616469657320616e642047656e746c656d656e206f662074686520636c617373206f66202739393a20496620" + + "4920636f756c64206f6666657220796f75206f6e6c79206f6e652074697020666f722074686520667574" + + "7572652c2073756e73637265656e20776f756c642062652069742e", + "000000000000004a00000000", + "6e2e359a2568f98041ba0728dd0d6981e97e7aec1d4360c20a27afccfd9fae0bf91b65c5524733ab8f593dabcd" + + "62b3571639d624e65152ab8f530c359f0861d807ca0dbf500d6a6156a38e088a22b65e52bc514d16ccf8" + + "06818ce91ab77937365af90bbf74a35be6b40b8eedf2785e42874d", + 1, "Test Vector #1"), + new Rfc8439TestVector( + "0000000000000000000000000000000000000000000000000000000000000000", + "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000" + + "00000000000000000000000000000000000000", + "000000000000000000000000", + "76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7da41597c5157488d7724e03fb8" + + "d84a376a43b8f41518a11cc387b669b2ee6586", + 0, "Test Vector #2"), + new Rfc8439TestVector( + "0000000000000000000000000000000000000000000000000000000000000001", + "416e79207375626d697373696f6e20746f20746865204945544620696e74656e6465642062792074686520436f" + + "6e7472696275746f7220666f72207075626c69636174696f6e20617320616c6c206f722070617274206f" + + "6620616e204945544620496e7465726e65742d4472616674206f722052464320616e6420616e79207374" + + "6174656d656e74206d6164652077697468696e2074686520636f6e74657874206f6620616e2049455446" + + "20616374697669747920697320636f6e7369646572656420616e20224945544620436f6e747269627574" + + "696f6e222e20537563682073746174656d656e747320696e636c756465206f72616c2073746174656d65" + + "6e747320696e20494554462073657373696f6e732c2061732077656c6c206173207772697474656e2061" + + 
"6e6420656c656374726f6e696320636f6d6d756e69636174696f6e73206d61646520617420616e792074" + + "696d65206f7220706c6163652c207768696368206172652061646472657373656420746f", + "000000000000000000000002", + "a3fbf07df3fa2fde4f376ca23e82737041605d9f4f4f57bd8cff2c1d4b7955ec2a97948bd3722915c8f3d337f7" + + "d370050e9e96d647b7c39f56e031ca5eb6250d4042e02785ececfa4b4bb5e8ead0440e20b6e8db09d881" + + "a7c6132f420e52795042bdfa7773d8a9051447b3291ce1411c680465552aa6c405b7764d5e87bea85ad0" + + "0f8449ed8f72d0d662ab052691ca66424bc86d2df80ea41f43abf937d3259dc4b2d0dfb48a6c9139ddd7" + + "f76966e928e635553ba76c5c879d7b35d49eb2e62b0871cdac638939e25e8a1e0ef9d5280fa8ca328b35" + + "1c3c765989cbcf3daa8b6ccc3aaf9f3979c92b3720fc88dc95ed84a1be059c6499b9fda236e7e818b04b" + + "0bc39c1e876b193bfe5569753f88128cc08aaa9b63d1a16f80ef2554d7189c411f5869ca52c5b83fa36f" + + "f216b9c1d30062bebcfd2dc5bce0911934fda79a86f6e698ced759c3ff9b6477338f3da4f9cd8514ea99" + + "82ccafb341b2384dd902f3d1ab7ac61dd29c6f21ba5b862f3730e37cfdc4fd806c22f221", + 1, "Test Vector #3"), + new Rfc8439TestVector( + "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0", + "2754776173206272696c6c69672c20616e642074686520736c6974687920746f7665730a446964206779726520616e642067696d626c6520696e2074686520776162653a0a416c6c206d696d737920776572652074686520626f726f676f7665732c0a416e6420746865206d6f6d65207261746873206f757467726162652e", + "000000000000000000000002", + "62e6347f95ed87a45ffae7426f27a1df5fb69110044c0d73118effa95b01e5cf166d3df2d721caf9b21e5fb14c616871fd84c54f9d65b283196c7fe4f60553ebf39c6402c42234e32a356b3e764312a61a5532055716ead6962568f87d3f3f7704c6a8d1bcd1bf4d50d6154b6da731b187b58dfd728afa36757a797ac188d1", + 42, "Test Vector #4"), + // Tests against the test vectors in Section 2.6.2 of RFC 8439. 
+ // https://tools.ietf.org/html/rfc8439#section-2.6.2 + new Rfc8439TestVector( + "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", + "0000000000000000000000000000000000000000000000000000000000000000", + "000000000001020304050607", + "8ad5a08b905f81cc815040274ab29471a833b637e3fd0da508dbb8e2fdd1a646", + 0, "Test Vector #5"), + new Rfc8439TestVector( + "0000000000000000000000000000000000000000000000000000000000000000", + "0000000000000000000000000000000000000000000000000000000000000000", + "000000000000000000000000", + "76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7", + 0, "Test Vector #6"), + new Rfc8439TestVector( + "0000000000000000000000000000000000000000000000000000000000000001", + "0000000000000000000000000000000000000000000000000000000000000000", + "000000000000000000000002", + "ecfa254f845f647473d3cb140da9e87606cb33066c447b87bc2666dde3fbb739", + 0, "Test Vector #7"), + new Rfc8439TestVector( + "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0", + "0000000000000000000000000000000000000000000000000000000000000000", + "000000000000000000000002", + "965e3bc6f9ec7ed9560808f4d229f94b137ff275ca9b3fcbdd59deaad23310ae", + 0, "Test Vector #8"), + }; + + public static Rfc8439TestVector[] Rfc8439AeadTestVectors = + { + // Section 2.8.2 + // Example and Test Vector for AEAD_CHACHA20_POLY1305 + // https://tools.ietf.org/html/rfc8439#section-2.8.2 + new Rfc8439TestVector( + "4c616469657320616e642047656e746c656d656e206f662074686520636c617373206f66202739393a204966204920636f756c64206f6666657220796f75206f6e6c79206f6e652074697020666f7220746865206675747572652c2073756e73637265656e20776f756c642062652069742e", + "50515253c0c1c2c3c4c5c6c7", + "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", + "07000000" + "4041424344454647", + 
"d31a8d34648e60db7b86afbc53ef7ec2a4aded51296e08fea9e2b5a736ee62d63dbea45e8ca9671282fafb69da92728b1a71de0a9e060b2905d6a5b67ecd3b3692ddbd7f2d778b8c9803aee328091b58fab324e4fad675945585808b4831d7bc3ff4def08e4b7a9de576d26586cec64b6116", + "1ae10b594f09e26a7e902ecbd0600691", "Section 2.8.2"), + // Appendix A.5 + new Rfc8439TestVector( + "496e7465726e65742d4472616674732061726520647261667420646f63756d656e74732076616c696420666f722061206d6178696d756d206f6620736978206d6f6e74687320616e64206d617920626520757064617465642c207265706c616365642c206f72206f62736f6c65746564206279206f7468657220646f63756d656e747320617420616e792074696d652e20497420697320696e617070726f70726961746520746f2075736520496e7465726e65742d447261667473206173207265666572656e6365206d6174657269616c206f7220746f2063697465207468656d206f74686572207468616e206173202fe2809c776f726b20696e2070726f67726573732e2fe2809d", + "f33388860000000000004e91", + "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0", + "000000000102030405060708", + "64a0861575861af460f062c79be643bd5e805cfd345cf389f108670ac76c8cb24c6cfc18755d43eea09ee94e382d26b0bdb7b73c321b0100d4f03b7f355894cf332f830e710b97ce98c8a84abd0b948114ad176e008d33bd60f982b1ff37c8559797a06ef4f0ef61c186324e2b3506383606907b6a7c02b0f9f6157b53c867e4b9166c767b804d46a59b5216cde7a4e99040c5a40433225ee282a1b0a06c523eaf4534d7f83fa1155b0047718cbc546a0d072b04b3564eea1b422273f548271a0bb2316053fa76991955ebd63159434ecebb4e466dae5a1073a6727627097a1049e617d91d361094fa68f0ff77987130305beaba2eda04df997b714d6c6f2c29a6ad5cb4022b02709b", + "eead9d67890cbb22392336fea1851f38", "Appendix A.5") + }; + + public static Rfc8439TestVector[] Rfc7634AeadTestVectors = + { + // Appendix A. 
+ new Rfc8439TestVector( + "45000054a6f200004001e778c6336405c000020508005b7a3a080000553bec100007362708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363701020204", + "0102030400000005", + "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", + "a0a1a2a31011121314151617", + "24039428b97f417e3c13753a4f05087b67c352e6a7fab1b982d466ef407ae5c614ee8099d52844eb61aa95dfab4c02f72aa71e7c4c4f64c9befe2facc638e8f3cbec163fac469b502773f6fb94e664da9165b82829f641e0", + "76aaa8266b7fb0f7b11b369907e1ad43", "Appendix A"), + // Appendix B. + new Rfc8439TestVector( + "0000000c000040010000000a00", + "c0c1c2c3c4c5c6c7d0d1d2d3d4d5d6d72e202500000000090000004529000029", + "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", + "a0a1a2a31011121314151617", + "610394701f8d017f7c12924889", + "6b71bfe25236efd7cdc67066906315b2", "Appendix B") + }; + } +} diff --git a/test/NaCl.Core.SimdTests/Vectors/Salsa20TestVector.cs b/test/NaCl.Core.SimdTests/Vectors/Salsa20TestVector.cs new file mode 100644 index 0000000..f89a5da --- /dev/null +++ b/test/NaCl.Core.SimdTests/Vectors/Salsa20TestVector.cs @@ -0,0 +1,35 @@ +namespace NaCl.Core.SimdTests.Vectors +{ + using NaCl.Core.Internal; + + public class Salsa20TestVector + { + public Salsa20TestVector(string name, string key, string iv, string block1, string block4, string block5, string block8) + { + Name = name; + Key = CryptoBytes.FromHexString(key); + IV = CryptoBytes.FromHexString(iv); + + ExpectedBlock1 = block1; + ExpectedBlock4 = block4; + ExpectedBlock5 = block5; + ExpectedBlock8 = block8; + } + + public string Name { get; } + + public byte[] Key { get; } + + public byte[] IV { get; } + + public string ExpectedBlock1 { get; } + + public string ExpectedBlock4 { get; } + + public string ExpectedBlock5 { get; } + + public string ExpectedBlock8 { get; } + + public override string ToString() => Name; + } +} \ No newline at end of file diff --git 
a/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj b/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj index 265836f..bfb25e8 100644 --- a/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj +++ b/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj @@ -54,5 +54,4 @@ - diff --git a/test/NaCl.Core.Tests/XSalsa20Tests.cs b/test/NaCl.Core.Tests/XSalsa20Tests.cs index f1298d2..884ac61 100644 --- a/test/NaCl.Core.Tests/XSalsa20Tests.cs +++ b/test/NaCl.Core.Tests/XSalsa20Tests.cs @@ -12,9 +12,6 @@ using Xunit.Categories; using Base; - using Internal; - using Vectors; - using System.Runtime.Intrinsics.X86; [Category("CI")] public class XSalsa20Tests @@ -262,12 +259,6 @@ public void XSalsa20BlockWhenLengthIsInvalidFails() act.Should().Throw(); } - //[Fact] - //public void AssertSupport() - //{ - // Assert.True(Avx2.IsSupported); - //} - [Fact] public void HSalsa20TestVector1() { From d7d04d4ef083f880d22cbea6323a4461f366fc3b Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 18 Oct 2022 15:17:13 +0100 Subject: [PATCH 44/59] Updated Intrinsics test powershell script --- NaCl.Core.sln | 1 + TestIntrinsics.ps1 | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/NaCl.Core.sln b/NaCl.Core.sln index fd8ea3e..aa1f59a 100644 --- a/NaCl.Core.sln +++ b/NaCl.Core.sln @@ -12,6 +12,7 @@ EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{28EF1FB3-A057-4C17-A3B2-B9370B234F81}" ProjectSection(SolutionItems) = preProject CodeCoverage.runsettings = CodeCoverage.runsettings + TestIntrinsics.ps1= TestIntrinsics.ps1 EndProjectSection EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NaCl.Core.SimdTests", "test\NaCl.Core.SimdTests\NaCl.Core.SimdTests.csproj", "{BF42937A-028C-4870-AAB4-220667A57457}" diff --git a/TestIntrinsics.ps1 b/TestIntrinsics.ps1 index 05a5e01..2fded01 100644 --- a/TestIntrinsics.ps1 +++ b/TestIntrinsics.ps1 @@ -2,7 +2,7 @@ $env:COMPlus_EnableAVX2 = 1 $env:COMPlus_EnableSSE3 = 1 $env:COMPlus_EnableSSE2 = 1 
Write-Host "Test Environment: Normal" -ForegroundColor "Cyan" -dotnet test $config.TestProject +dotnet test .\test\NaCl.Core.Tests\NaCl.Core.Tests.csproj if ($LastExitCode -ne 0) { Write-Host "Tests failed, aborting build!" -Foreground "Red" Exit 1 @@ -12,7 +12,7 @@ $env:COMPlus_EnableAVX2 = 0 $env:COMPlus_EnableSSE3 = 1 $env:COMPlus_EnableSSE2 = 1 Write-Host "Test Environment: AVX2 Disabled" -ForegroundColor "Cyan" -dotnet test $config.TestProject --framework netcoreapp3.1 +dotnet test .\test\NaCl.Core.Tests\NaCl.Core.Tests.csproj if ($LastExitCode -ne 0) { Write-Host "Tests failed, aborting build!" -Foreground "Red" Exit 1 @@ -22,7 +22,7 @@ $env:COMPlus_EnableAVX2 = 0 $env:COMPlus_EnableSSE3 = 0 $env:COMPlus_EnableSSE2 = 1 Write-Host "Test Environment: SSE3 Disabled" -ForegroundColor "Cyan" -dotnet test $config.TestProject --framework netcoreapp3.1 +dotnet test .\test\NaCl.Core.Tests\NaCl.Core.Tests.csproj if ($LastExitCode -ne 0) { Write-Host "Tests failed, aborting build!" -Foreground "Red" Exit 1 @@ -32,7 +32,7 @@ $env:COMPlus_EnableAVX2 = 0 $env:COMPlus_EnableSSE3 = 0 $env:COMPlus_EnableSSE2 = 0 Write-Host "Test Environment: SSE2 Disabled" -ForegroundColor "Cyan" -dotnet test $config.TestProject --framework netcoreapp3.1 +dotnet test .\test\NaCl.Core.Tests\NaCl.Core.Tests.csproj if ($LastExitCode -ne 0) { Write-Host "Tests failed, aborting build!" 
-Foreground "Red" Exit 1 From e5596777870eb1f040045719398f920b1ae2badd Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 18 Oct 2022 15:41:51 +0100 Subject: [PATCH 45/59] Code cleanup and formatting changes --- .../Base/ChaChaIntrinsics/ChaCha256.cs | 4 +- .../Base/ChaChaIntrinsics/ChaCha512.cs | 4 +- .../Base/ChaChaIntrinsics/ChaCha64.cs | 4 +- src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs | 8 +- .../Base/SalsaIntrinsics/Salsa256.cs | 4 +- .../Base/SalsaIntrinsics/Salsa512.cs | 4 +- src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 4 +- .../NaCl.Core.Benchmarks/ChaCha20Benchmark.cs | 167 +++++++++--------- .../ChaCha20Poly1305Benchmark.cs | 161 +++++++++-------- .../NaCl.Core.Benchmarks/Poly1305Benchmark.cs | 85 +++++---- test/NaCl.Core.Benchmarks/Program.cs | 43 ++--- test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs | 133 +++++++------- .../XChaCha20Benchmark.cs | 145 ++++++++------- .../XChaCha20Poly1305Benchmark.cs | 159 +++++++++-------- .../XSalsa20Benchmark .cs | 131 +++++++------- 15 files changed, 516 insertions(+), 540 deletions(-) diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs index 00e389d..51fd65b 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs @@ -1,10 +1,10 @@ #if INTRINSICS +namespace NaCl.Core.Base.ChaChaIntrinsics; + using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; -namespace NaCl.Core.Base.ChaChaIntrinsics; - #pragma warning disable IDE0007 // Use implicit type internal static class ChaCha256 { diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs index 52d212c..3cdb418 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs @@ -1,10 +1,10 @@ #if INTRINSICS +namespace NaCl.Core.Base.ChaChaIntrinsics; + using 
System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; -namespace NaCl.Core.Base.ChaChaIntrinsics; - #pragma warning disable IDE0007 // Use implicit type internal static class ChaCha512 { diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index f565fc1..13cf7e2 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -1,11 +1,11 @@ #if INTRINSICS +namespace NaCl.Core.Base.ChaChaIntrinsics; + using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; using System; -namespace NaCl.Core.Base.ChaChaIntrinsics; - #pragma warning disable IDE0007 // Use implicit type internal static class ChaCha64 { diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs index b05f233..ae45a4b 100644 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs +++ b/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs @@ -9,12 +9,8 @@ internal class Salsa20Core : ISalsa20Core { - protected const int KEY_SIZE_IN_INTS = 8; - public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 - protected const int BLOCK_SIZE_IN_INTS = 16; - public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 - - protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") + public const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; + public const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; private readonly Salsa20Base _salsa20; diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs index b7da377..9de0de7 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs @@ -1,10 +1,10 @@ #if INTRINSICS +namespace 
NaCl.Core.Base.SalsaIntrinsics; + using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; -namespace NaCl.Core.Base.SalsaIntrinsics; - #pragma warning disable IDE0007 // Use implicit type internal static class Salsa256 { diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs index 7813da7..281de50 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs @@ -1,10 +1,10 @@ #if INTRINSICS +namespace NaCl.Core.Base.SalsaIntrinsics; + using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; -namespace NaCl.Core.Base.SalsaIntrinsics; - #pragma warning disable IDE0007 // Use implicit type #pragma warning disable IDE0022 // Use expression body for methods internal static class Salsa512 diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index 2e8b883..08a53dd 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -1,11 +1,11 @@ #if INTRINSICS +namespace NaCl.Core.Base.SalsaIntrinsics; + using System; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; -namespace NaCl.Core.Base.SalsaIntrinsics; - #pragma warning disable IDE0007 // Use implicit type internal static class Salsa64 { diff --git a/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs b/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs index bafd11a..e271a75 100644 --- a/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs @@ -1,90 +1,91 @@ -namespace NaCl.Core.Benchmarks +namespace NaCl.Core.Benchmarks; + +using System; +using System.Collections.Generic; + +using Base; + +using BenchmarkDotNet.Attributes; + +[BenchmarkCategory("Stream Cipher")] +[MemoryDiagnoser] +[RPlotExporter, RankColumn] 
+public class ChaCha20Benchmark { - using System; - using System.Collections.Generic; + private static readonly Random rnd = new Random(42); - using Base; + private Memory key; + private Memory nonce; + private Memory message; + private Memory cipherText; + private ChaCha20 cipher; - using BenchmarkDotNet.Attributes; + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB + public int Size { get; set; } - [BenchmarkCategory("Stream Cipher")] - [MemoryDiagnoser] - [RPlotExporter, RankColumn] - public class ChaCha20Benchmark + [GlobalSetup] + public void Setup() { - private static readonly Random rnd = new Random(42); - - private Memory key; - private Memory nonce; - private Memory message; - private Memory cipherText; - private ChaCha20 cipher; - - [Params( - (int)1E+2, // 100 bytes - (int)1E+3, // 1 000 bytes = 1 KB - (int)1E+5 // 100 000 bytes = 100 KB - )] // 10 000 000 bytes = 10 MB - public int Size { get; set; } - - [GlobalSetup] - public void Setup() - { - key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - rnd.NextBytes(key.Span); - - nonce = new byte[12]; - rnd.NextBytes(nonce.Span); - - message = new byte[Size]; - rnd.NextBytes(message.Span); - - cipherText = new byte[Size]; - var c = new ChaCha20(key, 0); - c.Encrypt(message.Span, nonce.Span, cipherText.Span); - - cipher = new ChaCha20(key, 0); - } - - [Benchmark] - [BenchmarkCategory("Encryption")] - public void Encrypt() - { - var ciphertext = new byte[message.Length]; - cipher.Encrypt(message.Span, nonce.Span, ciphertext); - } - - [Benchmark] - [BenchmarkCategory("Decryption")] - public void Decrypt() - { - var plaintext = new byte[cipherText.Length]; - cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); - } - - //[Benchmark] - //[BenchmarkCategory("Decryption")] - //[ArgumentsSource(nameof(TestVectors))] - //public void 
Decrypt(Tests.Vectors.Rfc8439TestVector test) - //{ - // var plaintext = new byte[test.CipherText.Length]; - // var cipher = new ChaCha20(test.Key, test.InitialCounter); - // cipher.Decrypt(test.CipherText, test.Nonce, plaintext); - //} - - //public IEnumerable TestVectors() - //{ - // //foreach (var test in Tests.Rfc8439TestVector.Rfc8439TestVectors) - // // yield return test; - - // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[0]; - // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[1]; - // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[2]; - // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[3]; - // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[4]; - // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[5]; - // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[6]; - // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[7]; - //} + key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + rnd.NextBytes(key.Span); + + nonce = new byte[12]; + rnd.NextBytes(nonce.Span); + + message = new byte[Size]; + rnd.NextBytes(message.Span); + + cipherText = new byte[Size]; + var c = new ChaCha20(key, 0); + c.Encrypt(message.Span, nonce.Span, cipherText.Span); + + cipher = new ChaCha20(key, 0); } + + [Benchmark] + [BenchmarkCategory("Encryption")] + public void Encrypt() + { + var ciphertext = new byte[message.Length]; + cipher.Encrypt(message.Span, nonce.Span, ciphertext); + } + + [Benchmark] + [BenchmarkCategory("Decryption")] + public void Decrypt() + { + var plaintext = new byte[cipherText.Length]; + cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); + } + + //[Benchmark] + //[BenchmarkCategory("Decryption")] + //[ArgumentsSource(nameof(TestVectors))] + //public void Decrypt(Tests.Vectors.Rfc8439TestVector test) + //{ + // var plaintext = new byte[test.CipherText.Length]; + // var cipher = new ChaCha20(test.Key, test.InitialCounter); + // 
cipher.Decrypt(test.CipherText, test.Nonce, plaintext); + //} + + //public IEnumerable TestVectors() + //{ + // //foreach (var test in Tests.Rfc8439TestVector.Rfc8439TestVectors) + // // yield return test; + + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[0]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[1]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[2]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[3]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[4]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[5]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[6]; + // yield return Tests.Vectors.Rfc8439TestVector.Rfc8439TestVectors[7]; + //} } diff --git a/test/NaCl.Core.Benchmarks/ChaCha20Poly1305Benchmark.cs b/test/NaCl.Core.Benchmarks/ChaCha20Poly1305Benchmark.cs index d740b43..07ad32b 100644 --- a/test/NaCl.Core.Benchmarks/ChaCha20Poly1305Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/ChaCha20Poly1305Benchmark.cs @@ -1,88 +1,87 @@ -namespace NaCl.Core.Benchmarks +namespace NaCl.Core.Benchmarks; + +using System; +using System.Collections.Generic; +using System.Security.Cryptography; + +using Base; + +using BenchmarkDotNet.Attributes; + +[BenchmarkCategory("AEAD")] +[MemoryDiagnoser] +[RPlotExporter, RankColumn] +public class ChaCha20Poly1305Benchmark { - using System; - using System.Collections.Generic; - using System.Security.Cryptography; + private static readonly Random rnd = new Random(42); + + private Memory key; + private Memory nonce; + private Memory message; + private Memory tag; + private Memory aad; + private Memory ciphertext; + + private ChaCha20Poly1305 aead; + + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB + public int Size { 
get; set; } + + [GlobalSetup] + public void Setup() + { + key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key.Span); + + nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce.Span); + + tag = new byte[Poly1305.MAC_TAG_SIZE_IN_BYTES]; + + message = new byte[Size]; + rnd.NextBytes(message.Span); - using Base; + aad = new byte[16]; + rnd.NextBytes(aad.Span); - using BenchmarkDotNet.Attributes; + ciphertext = new byte[message.Length]; - [BenchmarkCategory("AEAD")] - [MemoryDiagnoser] - [RPlotExporter, RankColumn] - public class ChaCha20Poly1305Benchmark + aead = new ChaCha20Poly1305(key.Span); + } + + [Benchmark] + [BenchmarkCategory("Encryption")] + public void Encrypt() => aead.Encrypt(nonce.Span, message.Span, ciphertext.Span, tag.Span, aad.Span); + + [Benchmark] + [BenchmarkCategory("Decryption")] + [ArgumentsSource(nameof(TestVectors))] + public void Decrypt(Tests.Vectors.Rfc8439TestVector test) + { + var aead = new ChaCha20Poly1305(test.Key); + var plaintext = new byte[test.CipherText.Length]; + aead.Decrypt(test.Nonce, test.CipherText, test.Tag, plaintext, test.Aad); + } + + public IEnumerable TestVectors() { - private static readonly Random rnd = new Random(42); - - private Memory key; - private Memory nonce; - private Memory message; - private Memory tag; - private Memory aad; - private Memory ciphertext; - - private ChaCha20Poly1305 aead; - - [Params( - (int)1E+2, // 100 bytes - (int)1E+3, // 1 000 bytes = 1 KB - (int)1E+4, // 10 000 bytes = 10 KB - (int)1E+5, // 100 000 bytes = 100 KB - (int)1E+6, // 1 000 000 bytes = 1 MB - (int)1E+7)] // 10 000 000 bytes = 10 MB - public int Size { get; set; } - - [GlobalSetup] - public void Setup() - { - key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key.Span); - - nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce.Span); - - tag = new byte[Poly1305.MAC_TAG_SIZE_IN_BYTES]; - - message = new byte[Size]; - 
rnd.NextBytes(message.Span); - - aad = new byte[16]; - rnd.NextBytes(aad.Span); - - ciphertext = new byte[message.Length]; - - aead = new ChaCha20Poly1305(key.Span); - } - - [Benchmark] - [BenchmarkCategory("Encryption")] - public void Encrypt() => aead.Encrypt(nonce.Span, message.Span, ciphertext.Span, tag.Span, aad.Span); - - [Benchmark] - [BenchmarkCategory("Decryption")] - [ArgumentsSource(nameof(TestVectors))] - public void Decrypt(Tests.Vectors.Rfc8439TestVector test) - { - var aead = new ChaCha20Poly1305(test.Key); - var plaintext = new byte[test.CipherText.Length]; - aead.Decrypt(test.Nonce, test.CipherText, test.Tag, plaintext, test.Aad); - } - - public IEnumerable TestVectors() - { - //foreach (var test in Tests.Rfc8439TestVector.Rfc7634AeadTestVectors) - // yield return test; - - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439AeadTestVectors[0]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc8439AeadTestVectors[1]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc7634AeadTestVectors[0]; - yield return Tests.Vectors.Rfc8439TestVector.Rfc7634AeadTestVectors[1]; - } - - // TODO: Use the encrypt value (from Encrypt method) to benchmark decryption - //[Benchmark] - //[BenchmarkCategory("Decryption")] - //public byte[] Decrypt(byte[] ciphertext) => aead.Decrypt(ciphertext, aad); + //foreach (var test in Tests.Rfc8439TestVector.Rfc7634AeadTestVectors) + // yield return test; + + yield return Tests.Vectors.Rfc8439TestVector.Rfc8439AeadTestVectors[0]; + yield return Tests.Vectors.Rfc8439TestVector.Rfc8439AeadTestVectors[1]; + yield return Tests.Vectors.Rfc8439TestVector.Rfc7634AeadTestVectors[0]; + yield return Tests.Vectors.Rfc8439TestVector.Rfc7634AeadTestVectors[1]; } + + // TODO: Use the encrypt value (from Encrypt method) to benchmark decryption + //[Benchmark] + //[BenchmarkCategory("Decryption")] + //public byte[] Decrypt(byte[] ciphertext) => aead.Decrypt(ciphertext, aad); } diff --git a/test/NaCl.Core.Benchmarks/Poly1305Benchmark.cs 
b/test/NaCl.Core.Benchmarks/Poly1305Benchmark.cs index 030fea7..6816f16 100644 --- a/test/NaCl.Core.Benchmarks/Poly1305Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/Poly1305Benchmark.cs @@ -1,49 +1,48 @@ -namespace NaCl.Core.Benchmarks +namespace NaCl.Core.Benchmarks; + +using System; + +using BenchmarkDotNet.Attributes; + +[BenchmarkCategory("MAC")] +[MemoryDiagnoser] +[RPlotExporter, RankColumn] +public class Poly1305Benchmark { - using System; + //private const int KB = 1024; + private static readonly Random rnd = new Random(42); + + private Memory key; + private Memory data; + + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB + //[Params(1, 4, 16, 64, 256, 1 * KB, 4 * KB, 16 * KB, 64 * KB)] + public int Size { get; set; } + + [GlobalSetup] + public void Setup() + { + key = new byte[Poly1305.MAC_KEY_SIZE_IN_BYTES]; + rnd.NextBytes(key.Span); - using BenchmarkDotNet.Attributes; + data = new byte[Size]; + rnd.NextBytes(data.Span); + } - [BenchmarkCategory("MAC")] - [MemoryDiagnoser] - [RPlotExporter, RankColumn] - public class Poly1305Benchmark + [Benchmark(Description = "ComputeMac")] + public void Compute() { - //private const int KB = 1024; - private static readonly Random rnd = new Random(42); - - private Memory key; - private Memory data; - - [Params( - (int)1E+2, // 100 bytes - (int)1E+3, // 1 000 bytes = 1 KB - (int)1E+4, // 10 000 bytes = 10 KB - (int)1E+5, // 100 000 bytes = 100 KB - (int)1E+6, // 1 000 000 bytes = 1 MB - (int)1E+7)] // 10 000 000 bytes = 10 MB - //[Params(1, 4, 16, 64, 256, 1 * KB, 4 * KB, 16 * KB, 64 * KB)] - public int Size { get; set; } - - [GlobalSetup] - public void Setup() - { - key = new byte[Poly1305.MAC_KEY_SIZE_IN_BYTES]; - rnd.NextBytes(key.Span); - - data = new byte[Size]; - rnd.NextBytes(data.Span); - } - - [Benchmark(Description = "ComputeMac")] - 
public void Compute() - { - var mac = new byte[Poly1305.MAC_TAG_SIZE_IN_BYTES]; - Poly1305.ComputeMac(key.Span, data.Span, mac); - } - - // TODO: Use the mac value (from Compute method) to benchmark verification - //[Benchmark(Description = "VerifyMac")] - //public byte[] Verify(byte[] mac) => Poly1305.VerifyMac(key, data, mac); + var mac = new byte[Poly1305.MAC_TAG_SIZE_IN_BYTES]; + Poly1305.ComputeMac(key.Span, data.Span, mac); } + + // TODO: Use the mac value (from Compute method) to benchmark verification + //[Benchmark(Description = "VerifyMac")] + //public byte[] Verify(byte[] mac) => Poly1305.VerifyMac(key, data, mac); } diff --git a/test/NaCl.Core.Benchmarks/Program.cs b/test/NaCl.Core.Benchmarks/Program.cs index 773be09..f59f8f5 100644 --- a/test/NaCl.Core.Benchmarks/Program.cs +++ b/test/NaCl.Core.Benchmarks/Program.cs @@ -1,31 +1,16 @@ -namespace NaCl.Core.Benchmarks -{ - using System; +using System; +using BenchmarkDotNet.Running; - using BenchmarkDotNet.Running; +// Execute following code: +// $ dotnet run -c release --framework netcoreapp3.1 +// $ dotnet run -c release --framework netcoreapp3.1 --filter *XChaCha20Poly1305Benchmark* +BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); +//BenchmarkRunner.Run(); +//BenchmarkRunner.Run(); +//BenchmarkRunner.Run(); +//BenchmarkRunner.Run(); +//BenchmarkRunner.Run(); +//BenchmarkRunner.Run(); +//BenchmarkRunner.Run(); - class Program - { - static void Main(string[] args) - { - var c = new ChaCha20Benchmark(); - c.Size = 10_000; - c.Setup(); - c.Encrypt(); - - // Execute following code: - // $ dotnet run -c release --framework netcoreapp3.1 - // $ dotnet run -c release --framework netcoreapp3.1 --filter *XChaCha20Poly1305Benchmark* - //BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); - //BenchmarkRunner.Run(); - BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - - 
Console.ReadLine(); - } - } -} +Console.ReadLine(); diff --git a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs index 2f02ab2..497cebd 100644 --- a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs @@ -1,83 +1,82 @@ -namespace NaCl.Core.Benchmarks -{ - using System; - using System.Collections.Generic; +namespace NaCl.Core.Benchmarks; - using Base; +using System; +using System.Collections.Generic; - using BenchmarkDotNet.Attributes; +using Base; - [BenchmarkCategory("Stream Cipher")] - [MemoryDiagnoser] - [RPlotExporter, RankColumn] - public class Salsa20Benchmark - { - private static readonly Random rnd = new Random(42); +using BenchmarkDotNet.Attributes; - private Memory key; - private Memory nonce; - private Memory message; - private Memory cipherText; - private Salsa20 cipher; +[BenchmarkCategory("Stream Cipher")] +[MemoryDiagnoser] +[RPlotExporter, RankColumn] +public class Salsa20Benchmark +{ + private static readonly Random rnd = new Random(42); - [Params( - (int)1E+2, // 100 bytes - (int)1E+3, // 1 000 bytes = 1 KB - - (int)1E+5 // 100 000 bytes = 100 KB + private Memory key; + private Memory nonce; + private Memory message; + private Memory cipherText; + private Salsa20 cipher; - )] // 10 000 000 bytes = 10 MB - public int Size { get; set; } + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB + public int Size { get; set; } - [GlobalSetup] - public void Setup() - { - key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - rnd.NextBytes(key.Span); + [GlobalSetup] + public void Setup() + { + key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + rnd.NextBytes(key.Span); - nonce = new byte[8]; - rnd.NextBytes(nonce.Span); + nonce = new byte[8]; + rnd.NextBytes(nonce.Span); - message = new byte[Size]; - 
rnd.NextBytes(message.Span); + message = new byte[Size]; + rnd.NextBytes(message.Span); - cipherText = new byte[Size]; - var c = new Salsa20(key, 0); - c.Encrypt(message.Span, nonce.Span, cipherText.Span); + cipherText = new byte[Size]; + var c = new Salsa20(key, 0); + c.Encrypt(message.Span, nonce.Span, cipherText.Span); - cipher = new Salsa20(key, 0); - } + cipher = new Salsa20(key, 0); + } - [Benchmark] - [BenchmarkCategory("Encryption")] - public void Encrypt() - { - var ciphertext = new byte[message.Length]; - cipher.Encrypt(message.Span, nonce.Span, ciphertext); - } + [Benchmark] + [BenchmarkCategory("Encryption")] + public void Encrypt() + { + var ciphertext = new byte[message.Length]; + cipher.Encrypt(message.Span, nonce.Span, ciphertext); + } - [Benchmark] - [BenchmarkCategory("Decryption")] - public void Decrypt() - { - var plaintext = new byte[cipherText.Length]; - cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); - } + [Benchmark] + [BenchmarkCategory("Decryption")] + public void Decrypt() + { + var plaintext = new byte[cipherText.Length]; + cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); + } - //[Benchmark] - //[BenchmarkCategory("Decryption")] - //[ArgumentsSource(nameof(TestVectors))] - //public void Decrypt(Tests.Vectors.Salsa20TestVector test) - //{ - // var plaintext = new byte[test.CipherText.Length]; - // var cipher = new Salsa20(test.Key, test.InitialCounter); - // cipher.Decrypt(test.CipherText, test.Nonce, plaintext); - //} + //[Benchmark] + //[BenchmarkCategory("Decryption")] + //[ArgumentsSource(nameof(TestVectors))] + //public void Decrypt(Tests.Vectors.Salsa20TestVector test) + //{ + // var plaintext = new byte[test.CipherText.Length]; + // var cipher = new Salsa20(test.Key, test.InitialCounter); + // cipher.Decrypt(test.CipherText, test.Nonce, plaintext); + //} - //public IEnumerable TestVectors() - //{ - // //foreach (var test in ParseTestVectors(GetTestVector());) - // // yield return test; - //} - } + //public 
IEnumerable TestVectors() + //{ + // //foreach (var test in ParseTestVectors(GetTestVector());) + // // yield return test; + //} } diff --git a/test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs b/test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs index 46f64d8..bed0972 100644 --- a/test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs @@ -1,89 +1,88 @@ -namespace NaCl.Core.Benchmarks -{ - using System; - using System.Collections.Generic; - using System.Linq; +namespace NaCl.Core.Benchmarks; - using Base; - using Internal; +using System; +using System.Collections.Generic; +using System.Linq; - using BenchmarkDotNet.Attributes; +using Base; +using Internal; - [BenchmarkCategory("Stream Cipher")] - [MemoryDiagnoser] - [RPlotExporter, RankColumn] - public class XChaCha20Benchmark - { - private static readonly Random rnd = new Random(42); +using BenchmarkDotNet.Attributes; - private Memory key; - private Memory nonce; - private Memory message; - private XChaCha20 cipher; +[BenchmarkCategory("Stream Cipher")] +[MemoryDiagnoser] +[RPlotExporter, RankColumn] +public class XChaCha20Benchmark +{ + private static readonly Random rnd = new Random(42); - [Params( - (int)1E+2, // 100 bytes - (int)1E+3, // 1 000 bytes = 1 KB - (int)1E+4, // 10 000 bytes = 10 KB - (int)1E+5, // 100 000 bytes = 100 KB - (int)1E+6, // 1 000 000 bytes = 1 MB - (int)1E+7)] // 10 000 000 bytes = 10 MB - public int Size { get; set; } + private Memory key; + private Memory nonce; + private Memory message; + private XChaCha20 cipher; - [GlobalSetup] - public void Setup() - { - key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - rnd.NextBytes(key.Span); + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB + public int Size { get; set; } - nonce = new byte[24]; - rnd.NextBytes(nonce.Span); + 
[GlobalSetup] + public void Setup() + { + key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + rnd.NextBytes(key.Span); - message = new byte[Size]; - rnd.NextBytes(message.Span); + nonce = new byte[24]; + rnd.NextBytes(nonce.Span); - cipher = new XChaCha20(key, 0); - } + message = new byte[Size]; + rnd.NextBytes(message.Span); - [Benchmark] - [BenchmarkCategory("Encryption")] - public void Encrypt() - { - var ciphertext = new byte[message.Length]; - cipher.Encrypt(message.Span, nonce.Span, ciphertext); - } + cipher = new XChaCha20(key, 0); + } - [Benchmark] - [BenchmarkCategory("Decryption")] - [ArgumentsSource(nameof(TestVectors))] - public void Decrypt(Tests.Vectors.XChaCha20TestVector test) - { - var plaintext = new byte[test.CipherText.Length]; - var cipher = new XChaCha20(test.Key, 0); - cipher.Decrypt(test.CipherText, test.Nonce, plaintext); - } + [Benchmark] + [BenchmarkCategory("Encryption")] + public void Encrypt() + { + var ciphertext = new byte[message.Length]; + cipher.Encrypt(message.Span, nonce.Span, ciphertext); + } - public IEnumerable TestVectors() - { - //foreach (var test in Tests.XChaCha20TestVector.XChaCha20TestVectors) - // yield return test; + [Benchmark] + [BenchmarkCategory("Decryption")] + [ArgumentsSource(nameof(TestVectors))] + public void Decrypt(Tests.Vectors.XChaCha20TestVector test) + { + var plaintext = new byte[test.CipherText.Length]; + var cipher = new XChaCha20(test.Key, 0); + cipher.Decrypt(test.CipherText, test.Nonce, plaintext); + } - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[0]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[1]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[2]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[3]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[4]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[5]; - yield return 
Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[6]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[7]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[8]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[9]; - yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[10]; - } + public IEnumerable TestVectors() + { + //foreach (var test in Tests.XChaCha20TestVector.XChaCha20TestVectors) + // yield return test; - // TODO: Use the encrypt value (from Encrypt method) to benchmark decryption - //[Benchmark] - //[BenchmarkCategory("Decryption")] - //public byte[] Decrypt(byte[] ciphertext) => cipher.Decrypt(ciphertext); + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[0]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[1]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[2]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[3]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[4]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[5]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[6]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[7]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[8]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[9]; + yield return Tests.Vectors.XChaCha20TestVector.XChaCha20TestVectors[10]; } + + // TODO: Use the encrypt value (from Encrypt method) to benchmark decryption + //[Benchmark] + //[BenchmarkCategory("Decryption")] + //public byte[] Decrypt(byte[] ciphertext) => cipher.Decrypt(ciphertext); } diff --git a/test/NaCl.Core.Benchmarks/XChaCha20Poly1305Benchmark.cs b/test/NaCl.Core.Benchmarks/XChaCha20Poly1305Benchmark.cs index 926c45c..4b6057f 100644 --- a/test/NaCl.Core.Benchmarks/XChaCha20Poly1305Benchmark.cs +++ 
b/test/NaCl.Core.Benchmarks/XChaCha20Poly1305Benchmark.cs @@ -1,87 +1,86 @@ -namespace NaCl.Core.Benchmarks +namespace NaCl.Core.Benchmarks; + +using System; +using System.Collections.Generic; +using System.Security.Cryptography; + +using NaCl.Core.Base; + +using BenchmarkDotNet.Attributes; + +[BenchmarkCategory("AEAD")] +[MemoryDiagnoser] +[RPlotExporter, RankColumn] +public class XChaCha20Poly1305Benchmark { - using System; - using System.Collections.Generic; - using System.Security.Cryptography; + private static readonly Random rnd = new Random(42); + + private Memory key; + private Memory nonce; + private Memory message; + private Memory tag; + private Memory aad; + private Memory ciphertext; + + private XChaCha20Poly1305 aead; + + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+4, // 10 000 bytes = 10 KB + (int)1E+5, // 100 000 bytes = 100 KB + (int)1E+6, // 1 000 000 bytes = 1 MB + (int)1E+7)] // 10 000 000 bytes = 10 MB + public int Size { get; set; } + + [GlobalSetup] + public void Setup() + { + key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(key.Span); + + nonce = new byte[XChaCha20.NONCE_SIZE_IN_BYTES]; + RandomNumberGenerator.Fill(nonce.Span); + + tag = new byte[Poly1305.MAC_TAG_SIZE_IN_BYTES]; + + message = new byte[Size]; + rnd.NextBytes(message.Span); - using NaCl.Core.Base; + aad = new byte[24]; + rnd.NextBytes(aad.Span); - using BenchmarkDotNet.Attributes; + ciphertext = new byte[message.Length]; - [BenchmarkCategory("AEAD")] - [MemoryDiagnoser] - [RPlotExporter, RankColumn] - public class XChaCha20Poly1305Benchmark + aead = new XChaCha20Poly1305(key); + } + + [Benchmark] + [BenchmarkCategory("Encryption")] + public void Encrypt() => aead.Encrypt(nonce.Span, message.Span, ciphertext.Span, tag.Span, aad.Span); + + [Benchmark] + [BenchmarkCategory("Decryption")] + [ArgumentsSource(nameof(TestVectors))] + public void Decrypt(Tests.Vectors.XChaCha20Poly1305TestVector test) + { + var aead = new 
XChaCha20Poly1305(test.Key); + var plaintext = new byte[test.CipherText.Length]; + aead.Decrypt(test.Nonce, test.CipherText, test.Tag, plaintext, test.Aad); + } + + public static IEnumerable TestVectors() { - private static readonly Random rnd = new Random(42); - - private Memory key; - private Memory nonce; - private Memory message; - private Memory tag; - private Memory aad; - private Memory ciphertext; - - private XChaCha20Poly1305 aead; - - [Params( - (int)1E+2, // 100 bytes - (int)1E+3, // 1 000 bytes = 1 KB - (int)1E+4, // 10 000 bytes = 10 KB - (int)1E+5, // 100 000 bytes = 100 KB - (int)1E+6, // 1 000 000 bytes = 1 MB - (int)1E+7)] // 10 000 000 bytes = 10 MB - public int Size { get; set; } - - [GlobalSetup] - public void Setup() - { - key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key.Span); - - nonce = new byte[XChaCha20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce.Span); - - tag = new byte[Poly1305.MAC_TAG_SIZE_IN_BYTES]; - - message = new byte[Size]; - rnd.NextBytes(message.Span); - - aad = new byte[24]; - rnd.NextBytes(aad.Span); - - ciphertext = new byte[message.Length]; - - aead = new XChaCha20Poly1305(key); - } - - [Benchmark] - [BenchmarkCategory("Encryption")] - public void Encrypt() => aead.Encrypt(nonce.Span, message.Span, ciphertext.Span, tag.Span, aad.Span); - - [Benchmark] - [BenchmarkCategory("Decryption")] - [ArgumentsSource(nameof(TestVectors))] - public void Decrypt(Tests.Vectors.XChaCha20Poly1305TestVector test) - { - var aead = new XChaCha20Poly1305(test.Key); - var plaintext = new byte[test.CipherText.Length]; - aead.Decrypt(test.Nonce, test.CipherText, test.Tag, plaintext, test.Aad); - } - - public static IEnumerable TestVectors() - { - //foreach (var test in Tests.Rfc8439TestVector.Rfc7634AeadTestVectors) - // yield return test; - - yield return Tests.Vectors.XChaCha20Poly1305TestVector.TestVectors[0]; - yield return Tests.Vectors.XChaCha20Poly1305TestVector.TestVectors[1]; - yield return 
Tests.Vectors.XChaCha20Poly1305TestVector.TestVectors[2]; - } - - // TODO: Use the encrypt value (from Encrypt method) to benchmark decryption - //[Benchmark] - //[BenchmarkCategory("Decryption")] - //public byte[] Decrypt(byte[] ciphertext) => aead.Decrypt(ciphertext, aad); + //foreach (var test in Tests.Rfc8439TestVector.Rfc7634AeadTestVectors) + // yield return test; + + yield return Tests.Vectors.XChaCha20Poly1305TestVector.TestVectors[0]; + yield return Tests.Vectors.XChaCha20Poly1305TestVector.TestVectors[1]; + yield return Tests.Vectors.XChaCha20Poly1305TestVector.TestVectors[2]; } + + // TODO: Use the encrypt value (from Encrypt method) to benchmark decryption + //[Benchmark] + //[BenchmarkCategory("Decryption")] + //public byte[] Decrypt(byte[] ciphertext) => aead.Decrypt(ciphertext, aad); } diff --git a/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs b/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs index 9d7856d..d6e020c 100644 --- a/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs +++ b/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs @@ -1,81 +1,80 @@ -namespace NaCl.Core.Benchmarks -{ - using System; - using System.Collections.Generic; - - using Base; +namespace NaCl.Core.Benchmarks; - using BenchmarkDotNet.Attributes; +using System; +using System.Collections.Generic; - [BenchmarkCategory("Stream Cipher")] - [MemoryDiagnoser] - [RPlotExporter, RankColumn] - public class XSalsa20Benchmark - { - private static readonly Random rnd = new Random(42); +using Base; - private Memory key; - private Memory nonce; - private Memory message; - private Memory cipherText; - private XSalsa20 cipher; +using BenchmarkDotNet.Attributes; - [Params( - (int)1E+2, // 100 bytes - (int)1E+3, // 1 000 bytes = 1 KB - (int)1E+5 // 100 000 bytes = 100 KB - )] // 10 000 000 bytes = 10 MB - public int Size { get; set; } +[BenchmarkCategory("Stream Cipher")] +[MemoryDiagnoser] +[RPlotExporter, RankColumn] +public class XSalsa20Benchmark +{ + private static readonly Random rnd = new 
Random(42); - [GlobalSetup] - public void Setup() - { - key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - rnd.NextBytes(key.Span); + private Memory key; + private Memory nonce; + private Memory message; + private Memory cipherText; + private XSalsa20 cipher; - nonce = new byte[24]; - rnd.NextBytes(nonce.Span); + [Params( + (int)1E+2, // 100 bytes + (int)1E+3, // 1 000 bytes = 1 KB + (int)1E+5 // 100 000 bytes = 100 KB + )] // 10 000 000 bytes = 10 MB + public int Size { get; set; } - message = new byte[Size]; - rnd.NextBytes(message.Span); + [GlobalSetup] + public void Setup() + { + key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; + rnd.NextBytes(key.Span); - cipherText = new byte[Size]; - var c = new XSalsa20(key, 0); - c.Encrypt(message.Span, nonce.Span, cipherText.Span); + nonce = new byte[24]; + rnd.NextBytes(nonce.Span); - cipher = new XSalsa20(key, 0); - } + message = new byte[Size]; + rnd.NextBytes(message.Span); - [Benchmark] - [BenchmarkCategory("Encryption")] - public void Encrypt() - { - var ciphertext = new byte[message.Length]; - cipher.Encrypt(message.Span, nonce.Span, ciphertext); - } + cipherText = new byte[Size]; + var c = new XSalsa20(key, 0); + c.Encrypt(message.Span, nonce.Span, cipherText.Span); - [Benchmark] - [BenchmarkCategory("Decryption")] - public void Decrypt() - { - var plaintext = new byte[cipherText.Length]; - cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); - } + cipher = new XSalsa20(key, 0); + } - //[Benchmark] - //[BenchmarkCategory("Decryption")] - //[ArgumentsSource(nameof(TestVectors))] - //public void Decrypt(Tests.Vectors.Rfc8439TestVector test) - //{ - // var plaintext = new byte[test.CipherText.Length]; - // var cipher = new Salsa20(test.Key, test.InitialCounter); - // cipher.Decrypt(test.CipherText, test.Nonce, plaintext); - //} + [Benchmark] + [BenchmarkCategory("Encryption")] + public void Encrypt() + { + var ciphertext = new byte[message.Length]; + cipher.Encrypt(message.Span, nonce.Span, ciphertext); + } - //public 
IEnumerable TestVectors() - //{ - // //foreach (var test in ParseTestVectors(GetTestVector());) - // // yield return test; - //} + [Benchmark] + [BenchmarkCategory("Decryption")] + public void Decrypt() + { + var plaintext = new byte[cipherText.Length]; + cipher.Decrypt(cipherText.Span, nonce.Span, plaintext); } + + //[Benchmark] + //[BenchmarkCategory("Decryption")] + //[ArgumentsSource(nameof(TestVectors))] + //public void Decrypt(Tests.Vectors.Rfc8439TestVector test) + //{ + // var plaintext = new byte[test.CipherText.Length]; + // var cipher = new Salsa20(test.Key, test.InitialCounter); + // cipher.Decrypt(test.CipherText, test.Nonce, plaintext); + //} + + //public IEnumerable TestVectors() + //{ + // //foreach (var test in ParseTestVectors(GetTestVector());) + // // yield return test; + //} } From 1613592046a55c2e1403cf31065368f276aea6fc Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 8 Nov 2022 20:08:37 +0000 Subject: [PATCH 46/59] Remove unused code --- src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs index 9de0de7..8fdbd4a 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs @@ -121,20 +121,6 @@ public static unsafe void Process(uint* x, ref byte* m, ref byte* c, ref ulong b [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 Vector128Rotate(Vector128 a, byte imm) => Sse2.Or(Sse2.ShiftLeftLogical(a, imm), Sse2.ShiftRightLogical(a, (byte)(32 - imm))); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Transpose(ref Vector128 a, ref Vector128 b, ref Vector128 c, ref Vector128 d) - { - var w_0 = Sse2.UnpackLow(a, b).AsUInt64(); - var w_1 = Sse2.UnpackHigh(a, b).AsUInt64(); - var w_2 = Sse2.UnpackLow(c, d).AsUInt64(); - var w_3 = Sse2.UnpackHigh(c, d).AsUInt64(); - - a = 
Sse2.UnpackLow(w_0, w_2).AsUInt32(); - b = Sse2.UnpackHigh(w_0, w_2).AsUInt32(); - c = Sse2.UnpackLow(w_1, w_3).AsUInt32(); - d = Sse2.UnpackHigh(w_1, w_3).AsUInt32(); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) { From 6ca8d65d0d6bc3426e5fdf002b9a21959268f2f1 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 8 Nov 2022 20:09:02 +0000 Subject: [PATCH 47/59] Rename internal protected to protected internal --- src/NaCl.Core/Base/Salsa20Base.cs | 4 ++-- src/NaCl.Core/Base/Snuffle.cs | 2 +- src/NaCl.Core/Salsa20.cs | 2 +- src/NaCl.Core/XSalsa20.cs | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 8bc57d7..8592266 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -44,7 +44,7 @@ protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, /// The state. /// The nonce. /// The counter. 
- internal protected abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); + protected internal abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); /// internal override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => _salsa20Core.Process(nonce, output, input, offset); @@ -90,7 +90,7 @@ public void HSalsa20InitialState(Span state, ReadOnlySpan nonce) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal protected static void ShuffleState(Span state) + protected internal static void ShuffleState(Span state) { // 10 loops × 2 rounds/loop = 20 rounds for (var i = 0; i < 10; i++) diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index c79293e..69f84a0 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -28,7 +28,7 @@ public abstract class Snuffle protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") protected readonly ReadOnlyMemory Key; - internal protected readonly int InitialCounter; + protected internal readonly int InitialCounter; /// /// Initializes a new instance of the class. 
diff --git a/src/NaCl.Core/Salsa20.cs b/src/NaCl.Core/Salsa20.cs index 0016401..be149b6 100644 --- a/src/NaCl.Core/Salsa20.cs +++ b/src/NaCl.Core/Salsa20.cs @@ -23,7 +23,7 @@ public class Salsa20 : Salsa20Base public Salsa20(ReadOnlyMemory key, int initialCounter = 0) : base(key, initialCounter) { } /// - internal protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + protected internal override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); diff --git a/src/NaCl.Core/XSalsa20.cs b/src/NaCl.Core/XSalsa20.cs index c2efbcc..a3e475b 100644 --- a/src/NaCl.Core/XSalsa20.cs +++ b/src/NaCl.Core/XSalsa20.cs @@ -23,7 +23,7 @@ public class XSalsa20 : Salsa20Base public XSalsa20(ReadOnlyMemory key, int initialCounter = 0) : base(key, initialCounter) { } /// - internal protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + protected internal override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); From 6c2b54b151d5c1782548711abe9e9bee226d7584 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 8 Nov 2022 20:09:41 +0000 Subject: [PATCH 48/59] Remove unnecessary usings --- test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs | 1 - test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs | 1 - test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs | 2 -- test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs | 1 - test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs | 1 - test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs | 2 -- test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs | 1 - 7 files changed, 9 deletions(-) diff --git 
a/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs b/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs index e271a75..7e7a312 100644 --- a/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/ChaCha20Benchmark.cs @@ -1,7 +1,6 @@ namespace NaCl.Core.Benchmarks; using System; -using System.Collections.Generic; using Base; diff --git a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs index 497cebd..ff4d09a 100644 --- a/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/Salsa20Benchmark.cs @@ -1,7 +1,6 @@ namespace NaCl.Core.Benchmarks; using System; -using System.Collections.Generic; using Base; diff --git a/test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs b/test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs index bed0972..4a2def4 100644 --- a/test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/XChaCha20Benchmark.cs @@ -2,10 +2,8 @@ using System; using System.Collections.Generic; -using System.Linq; using Base; -using Internal; using BenchmarkDotNet.Attributes; diff --git a/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs b/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs index cc2c7c6..32e820f 100644 --- a/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs +++ b/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs @@ -11,7 +11,6 @@ using Internal; using Vectors; using NaCl.Core.Base.ChaChaCore; - using NaCl.Core.SimdTests.Vectors; public class ChaCha20IntrinsicsTests { diff --git a/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs b/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs index faa1ea2..2d83d98 100644 --- a/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs +++ b/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs @@ -11,7 +11,6 @@ using Internal; using Vectors; using NaCl.Core.Base.ChaChaCore; - using NaCl.Core.SimdTests.Vectors; public class ChaCha20ScalarTests { diff --git a/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs 
b/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs index fb3ab93..32c0b96 100644 --- a/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs +++ b/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs @@ -10,14 +10,12 @@ using FluentAssertions; using Xunit; using Xunit.Abstractions; - using Xunit.Categories; using Base; using Internal; using Vectors; using System.Linq; using NaCl.Core.Base.SalsaCore; - using NaCl.Core.SimdTests.Vectors; public class Salsa20IntrinsicsTests { diff --git a/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs b/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs index 56fc03f..167c989 100644 --- a/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs +++ b/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs @@ -16,7 +16,6 @@ using Vectors; using System.Linq; using NaCl.Core.Base.SalsaCore; - using NaCl.Core.SimdTests.Vectors; public class Salsa20ScalarTests { From f54e5c0db24a1b6a29c4440e3d83db29230276ff Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 8 Nov 2022 20:35:54 +0000 Subject: [PATCH 49/59] Simplified slices --- src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs | 2 +- src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs index 9e0f951..0512d30 100644 --- a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs @@ -31,7 +31,7 @@ public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnly Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; _chaCha20.SetInitialState(state, nonce, _chaCha20.InitialCounter); - ChaCha20BaseIntrinsics.ChaCha20(state, input, output.Slice(offset), (ulong)input.Length); + ChaCha20BaseIntrinsics.ChaCha20(state, input, output[offset..], (ulong)input.Length); } /// diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs 
b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs index a724a5d..73da87a 100644 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs @@ -33,7 +33,7 @@ public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnly Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; _salsa20.SetInitialState(state, nonce, _salsa20.InitialCounter); - Salsa20BaseIntrinsics.Salsa20(state, input, output.Slice(offset), (ulong)input.Length); + Salsa20BaseIntrinsics.Salsa20(state, input, output[offset..], (ulong)input.Length); } /// From 6319ec164c7f3a28d97ffd7c1aeca56797b96d14 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 8 Nov 2022 22:47:53 +0000 Subject: [PATCH 50/59] Removed unnecessary using statements --- src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs | 1 - src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs | 1 - src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs | 3 --- src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 1 - test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs | 3 --- test/NaCl.Core.Tests/ChaCha20Tests.cs | 2 -- test/NaCl.Core.Tests/Poly1305Test.cs | 2 -- test/NaCl.Core.Tests/Vectors/WycheproofVector.cs | 1 - test/NaCl.Core.Tests/XChaCha20Tests.cs | 2 -- test/NaCl.Core.Tests/XSalsa20Tests.cs | 2 -- 10 files changed, 18 deletions(-) diff --git a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs index 0512d30..74cf1c4 100644 --- a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs @@ -4,7 +4,6 @@ namespace NaCl.Core.Base.ChaChaCore; using System; using System.Runtime.CompilerServices; using System.Security.Cryptography; -using NaCl.Core.Base.ChaChaCore; internal class ChaCha20CoreIntrinsics : IChaCha20Core { diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index 13cf7e2..cad266d 100644 --- 
a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -4,7 +4,6 @@ namespace NaCl.Core.Base.ChaChaIntrinsics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; -using System; #pragma warning disable IDE0007 // Use implicit type internal static class ChaCha64 diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs index 73da87a..61532b7 100644 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs @@ -3,11 +3,8 @@ namespace NaCl.Core.Base.SalsaCore; using System; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using System.Security.Cryptography; -using Internal; - internal class Salsa20CoreIntrinsics : ISalsa20Core { const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index 08a53dd..e4223fc 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -1,7 +1,6 @@ #if INTRINSICS namespace NaCl.Core.Base.SalsaIntrinsics; -using System; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; diff --git a/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs b/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs index d6e020c..02f9f6d 100644 --- a/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs +++ b/test/NaCl.Core.Benchmarks/XSalsa20Benchmark .cs @@ -1,10 +1,7 @@ namespace NaCl.Core.Benchmarks; using System; -using System.Collections.Generic; - using Base; - using BenchmarkDotNet.Attributes; [BenchmarkCategory("Stream Cipher")] diff --git a/test/NaCl.Core.Tests/ChaCha20Tests.cs b/test/NaCl.Core.Tests/ChaCha20Tests.cs index 1396e09..dda9673 100644 --- a/test/NaCl.Core.Tests/ChaCha20Tests.cs +++ 
b/test/NaCl.Core.Tests/ChaCha20Tests.cs @@ -1,8 +1,6 @@ namespace NaCl.Core.Tests { using System; - using System.Collections.Generic; - using System.Linq; using System.Security.Cryptography; using System.Text; diff --git a/test/NaCl.Core.Tests/Poly1305Test.cs b/test/NaCl.Core.Tests/Poly1305Test.cs index 3e3758b..585823e 100644 --- a/test/NaCl.Core.Tests/Poly1305Test.cs +++ b/test/NaCl.Core.Tests/Poly1305Test.cs @@ -1,8 +1,6 @@ namespace NaCl.Core.Tests { using System; - using System.Collections.Generic; - using System.Linq; using System.Security.Cryptography; using System.Text; diff --git a/test/NaCl.Core.Tests/Vectors/WycheproofVector.cs b/test/NaCl.Core.Tests/Vectors/WycheproofVector.cs index aee149c..f58e53e 100644 --- a/test/NaCl.Core.Tests/Vectors/WycheproofVector.cs +++ b/test/NaCl.Core.Tests/Vectors/WycheproofVector.cs @@ -1,6 +1,5 @@ namespace NaCl.Core.Tests.Vectors { - using System; using System.Collections.Generic; public class WycheproofVector diff --git a/test/NaCl.Core.Tests/XChaCha20Tests.cs b/test/NaCl.Core.Tests/XChaCha20Tests.cs index 39a483a..dbfaa5f 100644 --- a/test/NaCl.Core.Tests/XChaCha20Tests.cs +++ b/test/NaCl.Core.Tests/XChaCha20Tests.cs @@ -1,8 +1,6 @@ namespace NaCl.Core.Tests { using System; - using System.Collections.Generic; - using System.Linq; using System.Security.Cryptography; using FluentAssertions; diff --git a/test/NaCl.Core.Tests/XSalsa20Tests.cs b/test/NaCl.Core.Tests/XSalsa20Tests.cs index 884ac61..092f442 100644 --- a/test/NaCl.Core.Tests/XSalsa20Tests.cs +++ b/test/NaCl.Core.Tests/XSalsa20Tests.cs @@ -1,8 +1,6 @@ namespace NaCl.Core.Tests { using System; - using System.Collections.Generic; - using System.Linq; using System.Security.Cryptography; using System.Text; From 8d0d5964bb901b2ad8bf00900cbf47a276266b79 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 8 Nov 2022 22:52:32 +0000 Subject: [PATCH 51/59] Add explcit access modifiers --- src/NaCl.Core/Base/ChaCha20Base.cs | 2 +- 
src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs | 4 ++-- src/NaCl.Core/Base/Salsa20Base.cs | 2 +- src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index c7829a3..6f42211 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -11,7 +11,7 @@ /// public abstract class ChaCha20Base : Snuffle { - readonly IChaCha20Core _chaCha20Core; + private readonly IChaCha20Core _chaCha20Core; /// /// Initializes a new instance of the class. diff --git a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs index 74cf1c4..c521059 100644 --- a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs @@ -7,8 +7,8 @@ namespace NaCl.Core.Base.ChaChaCore; internal class ChaCha20CoreIntrinsics : IChaCha20Core { - const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; - const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; + private const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; + private const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; private readonly ChaCha20Base _chaCha20; public ChaCha20CoreIntrinsics(ChaCha20Base chaCha20Base) => _chaCha20=chaCha20Base; diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 8592266..b42f33f 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -11,7 +11,7 @@ /// public abstract class Salsa20Base : Snuffle { - readonly ISalsa20Core _salsa20Core; + private readonly ISalsa20Core _salsa20Core; /// /// Initializes a new instance of the class. 
diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs index 61532b7..06f1eca 100644 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs +++ b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs @@ -7,8 +7,8 @@ namespace NaCl.Core.Base.SalsaCore; internal class Salsa20CoreIntrinsics : ISalsa20Core { - const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; - const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; + private const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; + private const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; private readonly Salsa20Base _salsa20; From 3a5051b59afb1c317e1367fb2e72a7438ee6ba3b Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Tue, 8 Nov 2022 23:13:04 +0000 Subject: [PATCH 52/59] Edit error message, move namespace and change visibility to internal --- src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 21 +++++++++++--------- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 21 +++++++++++--------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs index 75f841d..0a68bba 100644 --- a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -1,19 +1,17 @@ #if INTRINSICS +namespace NaCl.Core.Base; using System; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using NaCl.Core.Base.ChaChaIntrinsics; -namespace NaCl.Core.Base; - -public static class ChaCha20BaseIntrinsics +internal static class ChaCha20BaseIntrinsics { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void ChaCha20(Span state, ReadOnlySpan input, Span output, ulong bytes) { - if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) - throw new Exception("Error this vectorisation is not supported on this CPU"); + ValidateDeviceSupport(); fixed (uint* x = state) fixed (byte* m_p 
= input, c_p = output) @@ -43,8 +41,7 @@ public static unsafe void ChaCha20(Span state, ReadOnlySpan input, S [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void HChaCha20(ReadOnlySpan state, Span subKey) { - if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) - throw new Exception("Error this vectorisation is not supported on this CPU"); + ValidateDeviceSupport(); fixed (uint* x = state) fixed (byte* sk = subKey) @@ -56,8 +53,7 @@ public static unsafe void HChaCha20(ReadOnlySpan state, Span subKey) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void ChaCha20KeyStream(ReadOnlySpan state, Span output) { - if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) - throw new Exception("Error this vectorisation is not supported on this CPU"); + ValidateDeviceSupport(); fixed (byte* c = output) fixed (uint* x = state) @@ -65,5 +61,12 @@ public static unsafe void ChaCha20KeyStream(ReadOnlySpan state, Span ChaCha64.KeyStream64(x, c); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ValidateDeviceSupport() + { + if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) + throw new NotSupportedException($"{nameof(Sse3)} vectorisation is not supported on this device."); + } } #endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index bbf17a7..423f502 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -1,19 +1,17 @@ #if INTRINSICS +namespace NaCl.Core.Base; using System; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using NaCl.Core.Base.SalsaIntrinsics; -namespace NaCl.Core.Base; - -public static class Salsa20BaseIntrinsics +internal static class Salsa20BaseIntrinsics { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void Salsa20(Span state, ReadOnlySpan input, Span output, ulong bytes) { - if 
(!Sse3.IsSupported || !BitConverter.IsLittleEndian) - throw new Exception("Error this vectorisation is not supported on this CPU"); + ValidateDeviceSupport(); fixed (uint* x = state) fixed (byte* m_p = input, c_p = output) @@ -43,8 +41,7 @@ public static unsafe void Salsa20(Span state, ReadOnlySpan input, Sp [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) { - if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) - throw new Exception("Error this vectorisation is not supported on this CPU"); + ValidateDeviceSupport(); fixed (uint* x = state) fixed (byte* sk = subKey) @@ -56,8 +53,7 @@ public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void Salsa20KeyStream(ReadOnlySpan state, Span output) { - if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) - throw new Exception("Error this vectorisation is not supported on this CPU"); + ValidateDeviceSupport(); fixed (byte* c = output) fixed (uint* x = state) @@ -65,5 +61,12 @@ public static unsafe void Salsa20KeyStream(ReadOnlySpan state, Span Salsa64.KeyStream64(x, c); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ValidateDeviceSupport() + { + if (!Sse3.IsSupported || !BitConverter.IsLittleEndian) + throw new NotSupportedException($"{nameof(Sse3)} vectorisation is not supported on this device."); + } } #endif \ No newline at end of file From c8ff6fccebc1d4c3d9a124167636780b72e2d53a Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 9 Nov 2022 12:15:20 +0000 Subject: [PATCH 53/59] Remove internal core code, replaced with pre processor functions --- src/NaCl.Core/Base/ChaCha20Base.cs | 97 +++++++++---- src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs | 129 ------------------ .../Base/ChaChaCore/ChaCha20CoreIntrinsics.cs | 53 ------- .../Base/ChaChaCore/IChaCha20Core.cs | 10 -- src/NaCl.Core/Base/Salsa20Base.cs | 96 +++++++++---- 
src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs | 10 -- src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs | 117 ---------------- .../Base/SalsaCore/Salsa20CoreIntrinsics.cs | 54 -------- src/NaCl.Core/Base/Snuffle.cs | 18 ++- src/NaCl.Core/ChaCha20.cs | 2 +- src/NaCl.Core/Salsa20.cs | 2 +- src/NaCl.Core/XChaCha20.cs | 2 +- src/NaCl.Core/XSalsa20.cs | 2 +- 13 files changed, 159 insertions(+), 433 deletions(-) delete mode 100644 src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs delete mode 100644 src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs delete mode 100644 src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs delete mode 100644 src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs delete mode 100644 src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs delete mode 100644 src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 6f42211..6037c95 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -2,8 +2,8 @@ { using System; using System.Runtime.CompilerServices; + using System.Security.Cryptography; using Internal; - using NaCl.Core.Base.ChaChaCore; /// /// Base class for and . @@ -11,28 +11,12 @@ /// public abstract class ChaCha20Base : Snuffle { - private readonly IChaCha20Core _chaCha20Core; - /// /// Initializes a new instance of the class. /// /// The key. /// The initial counter. 
- protected ChaCha20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) - { -#if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) - { - _chaCha20Core = new ChaCha20CoreIntrinsics(this); - } - else - { - _chaCha20Core = new ChaCha20Core(this); - } -#else - _chaCha20Core = new ChaCha20Core(this); -#endif - } + protected ChaCha20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { } /// public override int BlockSizeInBytes => BLOCK_SIZE_IN_BYTES; @@ -44,13 +28,49 @@ protected ChaCha20Base(ReadOnlyMemory key, int initialCounter) : base(key, /// The state. /// The nonce. /// The counter. - protected internal abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); + protected abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); /// - internal override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => _chaCha20Core.Process(nonce, output, input, offset); + public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - /// - public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) => _chaCha20Core.ProcessKeyStreamBlock(nonce, counter, block); + // Set the initial state based on https://tools.ietf.org/html/rfc8439#section-2.3 + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + SetInitialState(state, nonce, counter); + +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) + { + ChaCha20BaseIntrinsics.ChaCha20KeyStream(state, block); + return; + } +#endif + + // Create a copy of the state and then run 20 rounds on it, + // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. 
+ Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; + state.CopyTo(workingState); + ShuffleState(state); + + // At the end of the rounds, add the result to the original state. + for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) + state[i] += workingState[i]; + + ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); + } + +#if INTRINSICS + public override unsafe void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + SetInitialState(state, nonce, initialCounter); + var c = output.Slice(offset); + + ChaCha20BaseIntrinsics.ChaCha20(state, input, c, (ulong)input.Length); + } +#endif /// /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . @@ -58,7 +78,30 @@ protected ChaCha20Base(ReadOnlyMemory key, int initialCounter) : base(key, /// The subKey. /// The nonce. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HChaCha20(Span subKey, ReadOnlySpan nonce) => _chaCha20Core.HChaCha20(subKey, nonce); + public void HChaCha20(Span subKey, ReadOnlySpan nonce) + { + // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + + // Setting HChaCha20 initial state + HChaCha20InitialState(state, nonce); + +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) + { + ChaCha20BaseIntrinsics.HChaCha20(state, subKey); + return; + } +#endif + + // Block function + ShuffleState(state); + + // Final subkey = state[0..4] || state[12..16] + state.Slice(12, 4).CopyTo(state.Slice(4,4)); + + ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); + } /// /// Sets the initial of the HChaCha20 using the key and the . @@ -240,13 +283,7 @@ public static void QuarterRound(ref uint a, ref uint b, ref uint c, ref uint d) /// Sets the ChaCha20 constant. /// /// The state. 
- protected static void SetSigma(Span state) - { - state[0] = SIGMA[0]; - state[1] = SIGMA[1]; - state[2] = SIGMA[2]; - state[3] = SIGMA[3]; - } + protected static void SetSigma(Span state) => SIGMA.AsSpan()[..4].CopyTo(state); /// /// Sets the 256-bit Key. diff --git a/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs deleted file mode 100644 index a8220ad..0000000 --- a/src/NaCl.Core/Base/ChaChaCore/ChaCha20Core.cs +++ /dev/null @@ -1,129 +0,0 @@ -namespace NaCl.Core.Base.ChaChaCore; - -using System; -using System.Buffers; -using System.Runtime.CompilerServices; -using System.Security.Cryptography; - -using Internal; - -internal class ChaCha20Core : IChaCha20Core -{ - public const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; - public const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; - - private readonly ChaCha20Base _chaCha20; - public ChaCha20Core(ChaCha20Base chaCha20) => _chaCha20 = chaCha20; - - public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - - // Set the initial state based on https://tools.ietf.org/html/rfc8439#section-2.3 - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _chaCha20.SetInitialState(state, nonce, counter); - - // Create a copy of the state and then run 20 rounds on it, - // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. - Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; - state.CopyTo(workingState); - ChaCha20Base.ShuffleState(state); - - // At the end of the rounds, add the result to the original state. 
- for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) - state[i] += workingState[i]; - - ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); - } - - /// - /// Processes the Encryption/Decryption function. - /// - /// The nonce. - /// The output. - /// The input. - /// The output's starting offset. - public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) - { - var blockSizeInBytes = _chaCha20.BlockSizeInBytes; - var length = input.Length; - var numBlocks = length / blockSizeInBytes + 1; - - /* - * Allocates 64 bytes more than below impl as per the benchmarks... - * - var block = new byte[BLOCK_SIZE_IN_BYTES]; - for (var i = 0; i < numBlocks; i++) - { - ProcessKeyStreamBlock(nonce, i + InitialCounter, block); - - if (i == numBlocks - 1) - Xor(output, input, block, length % BLOCK_SIZE_IN_BYTES, offset, i); // last block - else - Xor(output, input, block, BLOCK_SIZE_IN_BYTES, offset, i); - - CryptoBytes.Wipe(block); // Array.Clear(block, 0, block.Length); - } - */ - - using var owner = MemoryPool.Shared.Rent(blockSizeInBytes); - for (var i = 0; i < numBlocks; i++) - { - ProcessKeyStreamBlock(nonce, i + _chaCha20.InitialCounter, owner.Memory.Span); - - if (i == numBlocks - 1) - Xor(output, input, owner.Memory.Span, length % blockSizeInBytes, offset, i); // last block - else - Xor(output, input, owner.Memory.Span, blockSizeInBytes, offset, i); - - owner.Memory.Span.Clear(); - } - } - - /// - /// XOR the specified output. - /// - /// The output. - /// The input. - /// The key stream block. - /// The length. - /// The output's starting offset. - /// The current block number. - /// The combination of blocks, offsets and length to be XORed is out-of-bonds. 
- private void Xor(Span output, ReadOnlySpan input, ReadOnlySpan block, int len, int offset, int curBlock) - { - var blockOffset = curBlock * _chaCha20.BlockSizeInBytes; - - // Since is not called directly from outside, there's no need to check - //if (len < 0 || offset < 0 || curBlock < 0 || output.Length < len || (input.Length - blockOffset) < len || block.Length < len) - // throw new CryptographicException("The combination of blocks, offsets and length to be XORed is out-of-bonds."); - - for (var i = 0; i < len; i++) - output[i + offset + blockOffset] = (byte)(input[i + blockOffset] ^ block[i]); - } - /// - /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . - /// - /// The subKey. - /// The nonce. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HChaCha20(Span subKey, ReadOnlySpan nonce) - { - // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - - // Setting HChaCha20 initial state - _chaCha20.HChaCha20InitialState(state, nonce); - - // Block function - ChaCha20Base.ShuffleState(state); - - state[4] = state[12]; - state[5] = state[13]; - state[6] = state[14]; - state[7] = state[15]; - - ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); - } -} diff --git a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs b/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs deleted file mode 100644 index c521059..0000000 --- a/src/NaCl.Core/Base/ChaChaCore/ChaCha20CoreIntrinsics.cs +++ /dev/null @@ -1,53 +0,0 @@ -#if INTRINSICS -namespace NaCl.Core.Base.ChaChaCore; - -using System; -using System.Runtime.CompilerServices; -using System.Security.Cryptography; - -internal class ChaCha20CoreIntrinsics : IChaCha20Core -{ - private const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; - private const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; - - private readonly ChaCha20Base _chaCha20; - 
public ChaCha20CoreIntrinsics(ChaCha20Base chaCha20Base) => _chaCha20=chaCha20Base; - - public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - - // Set the initial state based on https://tools.ietf.org/html/rfc8439#section-2.3 - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _chaCha20.SetInitialState(state, nonce, counter); - - ChaCha20BaseIntrinsics.ChaCha20KeyStream(state, block); - } - - public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) - { - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _chaCha20.SetInitialState(state, nonce, _chaCha20.InitialCounter); - - ChaCha20BaseIntrinsics.ChaCha20(state, input, output[offset..], (ulong)input.Length); - } - - /// - /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . - /// - /// The subKey. - /// The nonce. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HChaCha20(Span subKey, ReadOnlySpan nonce) - { - // See https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2. 
- Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - - // Setting HChaCha20 initial state - _chaCha20.HChaCha20InitialState(state, nonce); - - ChaCha20BaseIntrinsics.HChaCha20(state, subKey); - } -} -#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs b/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs deleted file mode 100644 index 35270fd..0000000 --- a/src/NaCl.Core/Base/ChaChaCore/IChaCha20Core.cs +++ /dev/null @@ -1,10 +0,0 @@ -namespace NaCl.Core.Base.ChaChaCore; - -using System; - -internal interface IChaCha20Core -{ - void HChaCha20(Span subKey, ReadOnlySpan nonce); - void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0); - void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); -} \ No newline at end of file diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index b42f33f..49947f0 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -2,8 +2,8 @@ { using System; using System.Runtime.CompilerServices; + using System.Security.Cryptography; using Internal; - using NaCl.Core.Base.SalsaCore; /// /// Base class for and . @@ -11,28 +11,12 @@ /// public abstract class Salsa20Base : Snuffle { - private readonly ISalsa20Core _salsa20Core; - /// /// Initializes a new instance of the class. /// /// The key. /// The initial counter. 
- protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) - { -#if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) - { - _salsa20Core = new Salsa20CoreIntrinsics(this); - } - else - { - _salsa20Core = new Salsa20Core(this); - } -#else - _salsa20Core = new Salsa20Core(this); -#endif - } + protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { } /// public override int BlockSizeInBytes => BLOCK_SIZE_IN_BYTES; @@ -44,13 +28,48 @@ protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, /// The state. /// The nonce. /// The counter. - protected internal abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); + protected abstract void SetInitialState(Span state, ReadOnlySpan nonce, int counter); /// - internal override void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) => _salsa20Core.Process(nonce, output, input, offset); + public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) + { + if (block.Length != BLOCK_SIZE_IN_BYTES) + throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - /// - public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) => _salsa20Core.ProcessKeyStreamBlock(nonce, counter, block); + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + SetInitialState(state, nonce, counter); + +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) + { + Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); + return; + } +#endif + + // Create a copy of the state and then run 20 rounds on it, + // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. 
+ Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; + state.CopyTo(workingState); + ShuffleState(workingState); + + // At the end of the rounds, add the result to the original state. + for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) + state[i] += workingState[i]; + + ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); + } + +#if INTRINSICS + public override unsafe void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + SetInitialState(state, nonce, initialCounter); + var c = output.Slice(offset); + + Salsa20BaseIntrinsics.Salsa20(state, input, c, (ulong)input.Length); + } +#endif /// /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . @@ -58,7 +77,38 @@ protected Salsa20Base(ReadOnlyMemory key, int initialCounter) : base(key, /// The subKey. /// The nonce. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HSalsa20(Span subKey, ReadOnlySpan nonce) => _salsa20Core.HSalsa20(subKey, nonce); + public void HSalsa20(Span subKey, ReadOnlySpan nonce) + { + // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 + + Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; + + // Setting HSalsa20 initial state + HSalsa20InitialState(state, nonce); + + +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse2.IsSupported && BitConverter.IsLittleEndian) + { + Salsa20BaseIntrinsics.HSalsa20(state, subKey); + return; + } +#endif + + // Block function + ShuffleState(state); + + // Final subkey = state[0], state[5], state[10], state[15] || state[6..10] + state[1] = state[5]; + state[2] = state[10]; + state[3] = state[15]; + state[4] = state[6]; + state[5] = state[7]; + state[6] = state[8]; + state[7] = state[9]; + + ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); + } /// /// Sets the initial of the HSalsa20 using the key and the . 
diff --git a/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs b/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs deleted file mode 100644 index fdc97d0..0000000 --- a/src/NaCl.Core/Base/SalsaCore/ISalsa20Core.cs +++ /dev/null @@ -1,10 +0,0 @@ -namespace NaCl.Core.Base.SalsaCore; - -using System; - -internal interface ISalsa20Core -{ - void HSalsa20(Span subKey, ReadOnlySpan nonce); - void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0); - void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); -} \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs deleted file mode 100644 index ae45a4b..0000000 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20Core.cs +++ /dev/null @@ -1,117 +0,0 @@ -namespace NaCl.Core.Base.SalsaCore; - -using System; -using System.Buffers; -using System.Runtime.CompilerServices; -using System.Security.Cryptography; - -using Internal; - -internal class Salsa20Core : ISalsa20Core -{ - public const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; - public const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; - - private readonly Salsa20Base _salsa20; - - public Salsa20Core(Salsa20Base salsa20) => _salsa20 = salsa20; - - public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _salsa20.SetInitialState(state, nonce, counter); - - // Create a copy of the state and then run 20 rounds on it, - // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. 
- Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; - state.CopyTo(workingState); - Salsa20Base.ShuffleState(workingState); - - // At the end of the rounds, add the result to the original state. - for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) - state[i] += workingState[i]; - - ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); - } - - public void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) - { - var blockSizeInBytes = _salsa20.BlockSizeInBytes; - var length = input.Length; - var numBlocks = length / blockSizeInBytes + 1; - - /* - * Allocates 64 bytes more than below impl as per the benchmarks... - * - var block = new byte[BLOCK_SIZE_IN_BYTES]; - for (var i = 0; i < numBlocks; i++) - { - ProcessKeyStreamBlock(nonce, i + InitialCounter, block); - - if (i == numBlocks - 1) - Xor(output, input, block, length % BLOCK_SIZE_IN_BYTES, offset, i); // last block - else - Xor(output, input, block, BLOCK_SIZE_IN_BYTES, offset, i); - - CryptoBytes.Wipe(block); // Array.Clear(block, 0, block.Length); - } - */ - - using var owner = MemoryPool.Shared.Rent(blockSizeInBytes); - for (var i = 0; i < numBlocks; i++) - { - ProcessKeyStreamBlock(nonce, i + _salsa20.InitialCounter, owner.Memory.Span); - - if (i == numBlocks - 1) - Xor(output, input, owner.Memory.Span, length % blockSizeInBytes, offset, i); // last block - else - Xor(output, input, owner.Memory.Span, blockSizeInBytes, offset, i); - - owner.Memory.Span.Clear(); - } - } - - private void Xor(Span output, ReadOnlySpan input, ReadOnlySpan block, int len, int offset, int curBlock) - { - var blockOffset = curBlock * _salsa20.BlockSizeInBytes; - - // Since is not called directly from outside, there's no need to check - //if (len < 0 || offset < 0 || curBlock < 0 || output.Length < len || (input.Length - blockOffset) < len || block.Length < len) - // throw new CryptographicException("The combination of blocks, offsets and length to be XORed is out-of-bonds."); - - for (var i = 0; i < 
len; i++) - output[i + offset + blockOffset] = (byte)(input[i + blockOffset] ^ block[i]); - } - - /// - /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . - /// - /// The subKey. - /// The nonce. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HSalsa20(Span subKey, ReadOnlySpan nonce) - { - // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 - - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; - - // Setting HSalsa20 initial state - _salsa20.HSalsa20InitialState(state, nonce); - - // Block function - Salsa20Base.ShuffleState(state); - - state[1] = state[5]; - state[2] = state[10]; - state[3] = state[15]; - state[4] = state[6]; - state[5] = state[7]; - state[6] = state[8]; - state[7] = state[9]; - - ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); - } -} \ No newline at end of file diff --git a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs b/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs deleted file mode 100644 index 06f1eca..0000000 --- a/src/NaCl.Core/Base/SalsaCore/Salsa20CoreIntrinsics.cs +++ /dev/null @@ -1,54 +0,0 @@ -#if INTRINSICS -namespace NaCl.Core.Base.SalsaCore; - -using System; -using System.Runtime.CompilerServices; -using System.Security.Cryptography; - -internal class Salsa20CoreIntrinsics : ISalsa20Core -{ - private const int BLOCK_SIZE_IN_BYTES = Snuffle.BLOCK_SIZE_IN_BYTES; - private const int BLOCK_SIZE_IN_INTS = Snuffle.BLOCK_SIZE_IN_INTS; - - private readonly Salsa20Base _salsa20; - - public Salsa20CoreIntrinsics(Salsa20Base salsa20) => _salsa20 = salsa20; - - public void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block) - { - if (block.Length != BLOCK_SIZE_IN_BYTES) - throw new CryptographicException($"The key stream block length is not valid. 
The length in bytes must be {BLOCK_SIZE_IN_BYTES}."); - - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _salsa20.SetInitialState(state, nonce, counter); - - Salsa20BaseIntrinsics.Salsa20KeyStream(state, block); - } - - public unsafe void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) - { - Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; - _salsa20.SetInitialState(state, nonce, _salsa20.InitialCounter); - - Salsa20BaseIntrinsics.Salsa20(state, input, output[offset..], (ulong)input.Length); - } - - /// - /// Process a pseudorandom key stream block, converting the key and part of the into a , and the remainder of the . - /// - /// The subKey. - /// The nonce. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void HSalsa20(Span subKey, ReadOnlySpan nonce) - { - // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 - - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; - - // Setting HSalsa20 initial state - _salsa20.HSalsa20InitialState(state, nonce); - - Salsa20BaseIntrinsics.HSalsa20(state, subKey); - } -} -#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index 69f84a0..9e8a374 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -22,13 +22,13 @@ public abstract class Snuffle { protected const int KEY_SIZE_IN_INTS = 8; public const int KEY_SIZE_IN_BYTES = KEY_SIZE_IN_INTS * 4; // 32 - protected internal const int BLOCK_SIZE_IN_INTS = 16; + protected const int BLOCK_SIZE_IN_INTS = 16; public const int BLOCK_SIZE_IN_BYTES = BLOCK_SIZE_IN_INTS * 4; // 64 protected static uint[] SIGMA = new uint[] { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 }; // "expand 32-byte k" (4 words constant: "expa", "nd 3", "2-by", and "te k") protected readonly ReadOnlyMemory Key; - protected internal readonly int InitialCounter; + protected readonly int InitialCounter; /// /// Initializes a new instance of 
the class. @@ -58,6 +58,10 @@ protected Snuffle(ReadOnlyMemory key, int initialCounter) /// ByteBuffer. public abstract void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); +#if INTRINSICS + public abstract void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0); +#endif + /// /// The size of the nonce in bytes. /// Salsa20 uses a 8-byte (64-bit) nonce, ChaCha20 uses a 12-byte (96-bit) nonce, but XSalsa20 and XChaCha20 use a 24-byte (192-bit) nonce. @@ -116,8 +120,16 @@ public void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan nonce, Spa /// The output. /// The input. /// The output's starting offset. - internal virtual void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) + private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) { +#if INTRINSICS + if (System.Runtime.Intrinsics.X86.Sse3.IsSupported && BitConverter.IsLittleEndian) + { + ProcessStream(nonce, output, input, InitialCounter, offset); + return; + } +#endif + var length = input.Length; var numBlocks = (length / BlockSizeInBytes) + 1; diff --git a/src/NaCl.Core/ChaCha20.cs b/src/NaCl.Core/ChaCha20.cs index 1427929..88c8303 100644 --- a/src/NaCl.Core/ChaCha20.cs +++ b/src/NaCl.Core/ChaCha20.cs @@ -26,7 +26,7 @@ public class ChaCha20 : ChaCha20Base public ChaCha20(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { } /// - protected internal override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); diff --git a/src/NaCl.Core/Salsa20.cs b/src/NaCl.Core/Salsa20.cs index be149b6..e5d97f7 100644 --- a/src/NaCl.Core/Salsa20.cs +++ b/src/NaCl.Core/Salsa20.cs @@ -23,7 +23,7 @@ public class 
Salsa20 : Salsa20Base public Salsa20(ReadOnlyMemory key, int initialCounter = 0) : base(key, initialCounter) { } /// - protected internal override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); diff --git a/src/NaCl.Core/XChaCha20.cs b/src/NaCl.Core/XChaCha20.cs index 49dd29a..4924f7c 100644 --- a/src/NaCl.Core/XChaCha20.cs +++ b/src/NaCl.Core/XChaCha20.cs @@ -24,7 +24,7 @@ public class XChaCha20 : ChaCha20Base public XChaCha20(ReadOnlyMemory key, int initialCounter) : base(key, initialCounter) { } /// - protected internal override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); diff --git a/src/NaCl.Core/XSalsa20.cs b/src/NaCl.Core/XSalsa20.cs index a3e475b..c17e2c7 100644 --- a/src/NaCl.Core/XSalsa20.cs +++ b/src/NaCl.Core/XSalsa20.cs @@ -23,7 +23,7 @@ public class XSalsa20 : Salsa20Base public XSalsa20(ReadOnlyMemory key, int initialCounter = 0) : base(key, initialCounter) { } /// - protected internal override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) + protected override void SetInitialState(Span state, ReadOnlySpan nonce, int counter) { if (nonce.IsEmpty || nonce.Length != NonceSizeInBytes) throw new CryptographicException(FormatNonceLengthExceptionMessage(GetType().Name, nonce.Length, NonceSizeInBytes)); From 2c557390add055cda16df04f4277be43b2001fc5 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 9 Nov 2022 12:16:17 +0000 Subject: [PATCH 54/59] Delete core scalar and 
intrinsics tests --- NaCl.Core.sln | 8 +- .../ChaCha20IntrinsicsTests .cs | 346 -------------- .../ChaCha20ScalarTests.cs | 346 -------------- .../Salsa20IntrinsicsTests.cs | 431 ------------------ .../NaCl.Core.SimdTests/Salsa20ScalarTests.cs | 431 ------------------ test/NaCl.Core.SimdTests/TestHelpers.cs | 91 ---- .../Vectors/HChaCha20TestVector.cs | 67 --- .../Vectors/Rfc8439TestVector.cs | 160 ------- .../Vectors/Salsa20TestVector.cs | 35 -- 9 files changed, 1 insertion(+), 1914 deletions(-) delete mode 100644 test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs delete mode 100644 test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs delete mode 100644 test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs delete mode 100644 test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs delete mode 100644 test/NaCl.Core.SimdTests/TestHelpers.cs delete mode 100644 test/NaCl.Core.SimdTests/Vectors/HChaCha20TestVector.cs delete mode 100644 test/NaCl.Core.SimdTests/Vectors/Rfc8439TestVector.cs delete mode 100644 test/NaCl.Core.SimdTests/Vectors/Salsa20TestVector.cs diff --git a/NaCl.Core.sln b/NaCl.Core.sln index aa1f59a..f64e8df 100644 --- a/NaCl.Core.sln +++ b/NaCl.Core.sln @@ -12,11 +12,9 @@ EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{28EF1FB3-A057-4C17-A3B2-B9370B234F81}" ProjectSection(SolutionItems) = preProject CodeCoverage.runsettings = CodeCoverage.runsettings - TestIntrinsics.ps1= TestIntrinsics.ps1 + TestIntrinsics.ps1 = TestIntrinsics.ps1 EndProjectSection EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NaCl.Core.SimdTests", "test\NaCl.Core.SimdTests\NaCl.Core.SimdTests.csproj", "{BF42937A-028C-4870-AAB4-220667A57457}" -EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -35,10 +33,6 @@ Global {F55397B0-B348-40C6-A6A8-5BECEB74A840}.Debug|Any CPU.Build.0 = Debug|Any CPU {F55397B0-B348-40C6-A6A8-5BECEB74A840}.Release|Any
CPU {F55397B0-B348-40C6-A6A8-5BECEB74A840}.Release|Any CPU.Build.0 = Release|Any CPU - {BF42937A-028C-4870-AAB4-220667A57457}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {BF42937A-028C-4870-AAB4-220667A57457}.Debug|Any CPU.Build.0 = Debug|Any CPU - {BF42937A-028C-4870-AAB4-220667A57457}.Release|Any CPU.ActiveCfg = Release|Any CPU - {BF42937A-028C-4870-AAB4-220667A57457}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs b/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs deleted file mode 100644 index 32e820f..0000000 --- a/test/NaCl.Core.SimdTests/ChaCha20IntrinsicsTests .cs +++ /dev/null @@ -1,346 +0,0 @@ -namespace NaCl.Core.SimdTests -{ - using System; - using System.Security.Cryptography; - using System.Text; - - using FluentAssertions; - using Xunit; - - using Base; - using Internal; - using Vectors; - using NaCl.Core.Base.ChaChaCore; - - public class ChaCha20IntrinsicsTests - { - [Fact] - public void HChaCha20IntrinsicsTestVectors() - { - // Arrange - foreach (var test in HChaCha20TestVector.HChaCha20TestVectors) - { - var xChaCha20 = new XChaCha20(test.Key, 0); - var cipher = new ChaCha20CoreIntrinsics(xChaCha20); - - // Act - var output = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - cipher.HChaCha20(output, test.Input); - - // Assert - output.Should().Equal(test.Output); - } - } - - [Fact] - public void HChaCha20IntrinsicsBlockTestVector() - { - // https://tools.ietf.org/html/draft-irtf-cfrg-xchacha-03#section-2.2.1 - - // Arrange - var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); - var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a:00:00:00:00:31:41:59:27".Replace(":", string.Empty)); - - var xChaCha20 = new XChaCha20(key, 0); - var cipher = new ChaCha20CoreIntrinsics(xChaCha20); - - // Act - 
var subKey = new byte[32]; - cipher.HChaCha20(subKey, nonce); - var state = subKey.ToUInt16Array(); - //var stateHex = CryptoBytes.ToHexStringLower(subKey.ToArray()); - - // Assert - // HChaCha20 returns only the first and last rows - var expectedState = new uint[] - { - 0x423b4182, 0xfe7bb227, 0x50420ed3, 0x737d878a, - //0x0aa76448, 0x7954cdf3, 0x846acd37, 0x7b3c58ad, - //0x77e35583, 0x83e77c12, 0xe0076a2d, 0xbc6cd0e5, - 0xd5e4f9a0, 0x53a8748a, 0x13c42ec1, 0xdcecd326 - }; - - // Same as above but in HEX - //var expectedStateHex = "82413b4" + "227b27bfe" + "d30e4250" + "8a877d73" - // + "a0f9e4d" + "58a74a853" + "c12ec413" + "26d3ecdc"; - - state.Should().BeEquivalentTo(expectedState); - //stateHex.Should().Be(expectedStateHex); - } - - [Fact] - public void IntrinsicsEncryptDecrypt1BlockTest() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key); - - var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce); - - var expected = Encoding.UTF8.GetBytes("This is a secret content!!"); - - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20CoreIntrinsics(chacha20); - - // Act - var ciphertext = new byte[expected.Length]; - cipher.Process(nonce, ciphertext, expected); - - var plaintext = new byte[expected.Length]; - cipher.Process(nonce, plaintext, ciphertext); - - // Assert - plaintext.Should().Equal(expected); - } - - [Fact] - public void IntrinsicsEncryptDecryptNBlocksTest() - { - // Arrange - var rnd = new Random(); - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; - - for (var i = 0; i < 64; i++) - { - RandomNumberGenerator.Fill(key); - RandomNumberGenerator.Fill(nonce); - - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20CoreIntrinsics(chacha20); - - for (var j = 0; j < 64; j++) - { - var expected = new byte[rnd.Next(300)]; - rnd.NextBytes(expected); - - var ciphertext = new byte[expected.Length]; - var 
plaintext = new byte[expected.Length]; - - // Act - cipher.Process(nonce, ciphertext, expected); - cipher.Process(nonce, plaintext, ciphertext); - - // Assert - plaintext.Should().Equal(expected); - } - } - } - - [Fact] - public void IntrinsicsEncryptDecryptLongMessagesTest() - { - var rnd = new Random(); - - var dataSize = 16; - while (dataSize <= 1 << 24) - { - var plaintext = new byte[dataSize]; - rnd.NextBytes(plaintext); - - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key); - - var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce); - - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20CoreIntrinsics(chacha20); - - var ciphertext = new byte[plaintext.Length]; - cipher.Process(nonce, ciphertext, plaintext); - - var decrypted = new byte[plaintext.Length]; - cipher.Process(nonce, decrypted, ciphertext); - - decrypted.Should().Equal(plaintext); - dataSize += 5 * dataSize / 11; - } - } - - - [Fact] - public void ChaCha20IntrinsicsBlockWhenNonceLengthIsEmptyFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var chacha20 = new ChaCha20(key, 0); - var core = new ChaCha20CoreIntrinsics(chacha20); - - var nonce = new byte[0]; - var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void ChaCha20IntrinsicsBlockWhenNonceLengthIsInvalidFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var chacha20 = new ChaCha20(key, 0); - var core = new ChaCha20CoreIntrinsics(chacha20); - var nonce = new byte[chacha20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; - var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void ChaCha20IntrinsicsBlockWhenLengthIsInvalidFails() - { - // Arrange 
- var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var chacha20 = new ChaCha20(key, 0); - var core = new ChaCha20CoreIntrinsics(chacha20); - var nonce = new byte[chacha20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; - var block = new byte[0]; - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void ChaCha20IntrinsicsBlockTestVector() - { - // https://tools.ietf.org/html/rfc8439#section-2.3.2 - - // Arrange - var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); - var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a:00:00:00:00".Replace(":", string.Empty)); - var counter = 1; - - // Act - var chacha20 = new ChaCha20(key, 1); - var core = new ChaCha20CoreIntrinsics(chacha20); - var output = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - core.ProcessKeyStreamBlock(nonce, counter, output); - - // Assert - var expected = new uint[16] - { - 0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3, - 0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3, - 0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9, - 0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2, - }; - - output.ToUInt16Array().Should().Equal(expected); - } - - [Fact] - public void ChaCha20IntrinsicsTestVector() - { - // https://tools.ietf.org/html/rfc8439#section-2.4.2 - - // Arrange - foreach (var test in Rfc8439TestVector.Rfc8439TestVectors) - { - // Act - var chacha20 = new ChaCha20(test.Key, test.InitialCounter); - var cipher = new ChaCha20CoreIntrinsics(chacha20); - - var output = new byte[test.CipherText.Length]; - cipher.Process(test.Nonce, output, test.CipherText); - - // Assert - output.Should().Equal(test.PlainText); - } - } - - [Theory] - [InlineData(33)] - [InlineData(64)] - [InlineData(65)] - [InlineData(255)] - [InlineData(256)] - [InlineData(511)] - [InlineData(512)] - [InlineData(1023)] - [InlineData(1024)] 
- public void IntrinsicsCreateVariableLengthCiphers(int size) - { - var input = new byte[size]; - var output = new byte[size]; - - var nonce = new byte[12]; - Array.Fill(nonce, (byte)2); - var key = new byte[32]; - Array.Fill(key, (byte)1); - - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20CoreIntrinsics(chacha20); - cipher.Process(nonce, output, input); - var value = Convert.ToHexString(output); - - value.Should().Be(LongKeyStream[..(size*2)]); - } - - [Fact] - public void ChaCha20IntrinsicsTestVectorTC8() - { - // TC8: key: 'All your base are belong to us!, IV: 'IETF2013' - // Test vector TC8 from RFC draft by J. Strombergson - // https://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-01 - - // Arrange - var key = new byte[32] - { - 0xC4, 0x6E, 0xC1, 0xB1, 0x8C, 0xE8, 0xA8, 0x78, - 0x72, 0x5A, 0x37, 0xE7, 0x80, 0xDF, 0xB7, 0x35, - 0x1F, 0x68, 0xED, 0x2E, 0x19, 0x4C, 0x79, 0xFB, - 0xC6, 0xAE, 0xBE, 0xE1, 0xA6, 0x67, 0x97, 0x5D - }; - - // The first 4 bytes are set to zero and a large counter - // is used; this makes the RFC 8439 version of ChaCha20 - // compatible with the original specification by D. J. Bernstein. 
- var nonce = new byte[12] { 0x00, 0x00, 0x00, 0x00, - 0x1A, 0xDA, 0x31, 0xD5, 0xCF, 0x68, 0x82, 0x21 - }; - - // Act - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20CoreIntrinsics(chacha20); - var block0 = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - var block1 = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - cipher.ProcessKeyStreamBlock(nonce, 0, block0); - cipher.ProcessKeyStreamBlock(nonce, 1, block1); - - // Assert - var expected = new byte[128] - { - 0xF6, 0x3A, 0x89, 0xB7, 0x5C, 0x22, 0x71, 0xF9, - 0x36, 0x88, 0x16, 0x54, 0x2B, 0xA5, 0x2F, 0x06, - 0xED, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2B, 0x00, - 0xB5, 0xE8, 0xF8, 0x0A, 0xE9, 0xA4, 0x73, 0xAF, - 0xC2, 0x5B, 0x21, 0x8F, 0x51, 0x9A, 0xF0, 0xFD, - 0xD4, 0x06, 0x36, 0x2E, 0x8D, 0x69, 0xDE, 0x7F, - 0x54, 0xC6, 0x04, 0xA6, 0xE0, 0x0F, 0x35, 0x3F, - 0x11, 0x0F, 0x77, 0x1B, 0xDC, 0xA8, 0xAB, 0x92, - - 0xE5, 0xFB, 0xC3, 0x4E, 0x60, 0xA1, 0xD9, 0xA9, - 0xDB, 0x17, 0x34, 0x5B, 0x0A, 0x40, 0x27, 0x36, - 0x85, 0x3B, 0xF9, 0x10, 0xB0, 0x60, 0xBD, 0xF1, - 0xF8, 0x97, 0xB6, 0x29, 0x0F, 0x01, 0xD1, 0x38, - 0xAE, 0x2C, 0x4C, 0x90, 0x22, 0x5B, 0xA9, 0xEA, - 0x14, 0xD5, 0x18, 0xF5, 0x59, 0x29, 0xDE, 0xA0, - 0x98, 0xCA, 0x7A, 0x6C, 0xCF, 0xE6, 0x12, 0x27, - 0x05, 0x3C, 0x84, 0xE4, 0x9A, 0x4A, 0x33, 0x32 - }; - - CryptoBytes.Combine(block0, block1).Should().Equal(expected); - } - - private const string LongKeyStream = 
"06E1F8D66AC5C75181F3E5ED9FA16AA909A1FB57A4A9B0110C84FCDC0D710880072A4342AF88DEC0138DAF141A3F471C01E77C1FDA90999496D601A36A8C0412E61CF22E8DA3E8DA712DE9F9D38BE4298CB36C0D83AA7DD314841BBDF59644DCD313F9F53B0E06B9D6CB3F0788CE2EE78993D9D27A3EDF0A52589CBB698519D583B68F72F3961AD77C1358394F29B08FE9F98A29F98311723013591E698557A04A73FB277E3E247083444A6C139ADE01BDE3C368C3A484D6824B33C024C0285CBD665D4F2E4DE87BF79565F08FE09766C16639279A243DAE8395F3E0E5D96E711B210355605A5A8E7B50CEA4BA25E4CB0E273488E223CD69FB699BD937A30D33488EF6076192E1ED08758F7F4774E4C0B8E70955D3CAFE790EB40F7725EB87B8BE6BBECDE1E140966973B5B05FDBFBE05C4BC599888693D96AC0C429B75591EF228A243A6EFDBEEEE49F09383AF2D4AFB6305DE60C5D195A44ED646B0CAFCEC5E445562FFFBB56D444C650E2D892FA99BCE78F2EBF866B154FDB110DDF8CAFB7BE4BEA46724B3952906F0C6E81BE7A17E3C95DF350BB970D2C97499924BDCC4EA0E1DE33AA4E62B5C1FC65FFD2728D81A79AE218AE1C639108323C3D22BA1B8C746CAB0CD535C8661CCA4B6B047790EF148A1B9A88CD3CDD8D79389E2F0D9AAAE135B361ED6778A6F6E03186651692F8DABEDF8872939F694C41E2CAD064FF4C537B92AFD0951DF77302749DCDBC9560FCE001DACAAFAA703BDA73007174C549B69EB031324E31BC9F60049E39254146AEB39BEE8A52CAEA1DD31C42346E44EBCC0771A2548D55ABD085323BA69625845F34831E7518F129CB1D80B76D3C94634F38A1226B5E212D917D593838F51D6CC35F87EB500030AB1446D87F6FFC4717B51C619DDAFD75DBA4C25A09C8C961CDA12A9E01203D678AD2ABB4B7D1BED7EBF0C2932DCE5F0C97F9488DD01A7891DC18D5EEFF6129B7942726A5B5110877260E2A78075C666F4410A2F8A2909D03DE0FBE2BFCA2B068B438ADAF767D804BA85278FB930945D15380281C215BC664B6627EE76CBBC8C5355E607721AAAC069B16B78C2F282795E7BF9B6509E7DC36FD2D45A227BF9D20C5E9678A040B63E964817F98B5F4828EB5D66740C595304D08A0A3C5A50EE3B3F99D2269992DD400A5B452A213DCD2579F7A193FC7FE33E498E91203DE19FF9D54BEBDE9E124A17E784430C38110FE3552861737DE1F2B7678F63417FE2224ED6571D43A8015F6F81362E7B95CB93C86735787F0980B0A3A65549844768EDF0DDEC75A24FA1EF5A26640932F65FF141CAEE2E14506A34E925C21BC268769CD95328675953E79B4B375912434834018ADD9C1832057EE4386C95B6E9407346B4A1582FB3C095E4B0882087DB48F081B5C0
DE69ADBC447A6BA2ED6A4F90909911CD3B51ECEC2C6BE6EFE"; - } -} diff --git a/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs b/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs deleted file mode 100644 index 2d83d98..0000000 --- a/test/NaCl.Core.SimdTests/ChaCha20ScalarTests.cs +++ /dev/null @@ -1,346 +0,0 @@ -namespace NaCl.Core.SimdTests -{ - using System; - using System.Security.Cryptography; - using System.Text; - - using FluentAssertions; - using Xunit; - - using Base; - using Internal; - using Vectors; - using NaCl.Core.Base.ChaChaCore; - - public class ChaCha20ScalarTests - { - [Fact] - public void HChaCha20ScalarTestVectors() - { - // Arrange - foreach (var test in HChaCha20TestVector.HChaCha20TestVectors) - { - var xChaCha20 = new XChaCha20(test.Key, 0); - var cipher = new ChaCha20Core(xChaCha20); - - // Act - var output = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - cipher.HChaCha20(output, test.Input); - - // Assert - output.Should().Equal(test.Output); - } - } - - [Fact] - public void HChaCha20ScalarBlockTestVector() - { - // https://tools.ietf.org/html/draft-irtf-cfrg-xchacha-03#section-2.2.1 - - // Arrange - var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); - var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a:00:00:00:00:31:41:59:27".Replace(":", string.Empty)); - - var xChaCha20 = new XChaCha20(key, 0); - var cipher = new ChaCha20Core(xChaCha20); - - // Act - var subKey = new byte[32]; - cipher.HChaCha20(subKey, nonce); - var state = subKey.ToUInt16Array(); - //var stateHex = CryptoBytes.ToHexStringLower(subKey.ToArray()); - - // Assert - // HChaCha20 returns only the first and last rows - var expectedState = new uint[] - { - 0x423b4182, 0xfe7bb227, 0x50420ed3, 0x737d878a, - //0x0aa76448, 0x7954cdf3, 0x846acd37, 0x7b3c58ad, - //0x77e35583, 0x83e77c12, 0xe0076a2d, 0xbc6cd0e5, - 0xd5e4f9a0, 0x53a8748a, 0x13c42ec1, 0xdcecd326 - }; - - // Same 
as above but in HEX - //var expectedStateHex = "82413b4" + "227b27bfe" + "d30e4250" + "8a877d73" - // + "a0f9e4d" + "58a74a853" + "c12ec413" + "26d3ecdc"; - - state.Should().BeEquivalentTo(expectedState); - //stateHex.Should().Be(expectedStateHex); - } - - [Fact] - public void ScalarEncryptDecrypt1BlockTest() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key); - - var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce); - - var expected = Encoding.UTF8.GetBytes("This is a secret content!!"); - - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20Core(chacha20); - - // Act - var ciphertext = new byte[expected.Length]; - cipher.Process(nonce, ciphertext, expected); - - var plaintext = new byte[expected.Length]; - cipher.Process(nonce, plaintext, ciphertext); - - // Assert - plaintext.Should().Equal(expected); - } - - [Fact] - public void ScalarEncryptDecryptNBlocksTest() - { - // Arrange - var rnd = new Random(); - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - var nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; - - for (var i = 0; i < 64; i++) - { - RandomNumberGenerator.Fill(key); - RandomNumberGenerator.Fill(nonce); - - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20Core(chacha20); - - for (var j = 0; j < 64; j++) - { - var expected = new byte[rnd.Next(300)]; - rnd.NextBytes(expected); - - var ciphertext = new byte[expected.Length]; - var plaintext = new byte[expected.Length]; - - // Act - cipher.Process(nonce, ciphertext, expected); - cipher.Process(nonce, plaintext, ciphertext); - - // Assert - plaintext.Should().Equal(expected); - } - } - } - - [Fact] - public void ScalarEncryptDecryptLongMessagesTest() - { - var rnd = new Random(); - - var dataSize = 16; - while (dataSize <= 1 << 24) - { - var plaintext = new byte[dataSize]; - rnd.NextBytes(plaintext); - - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key); - - var 
nonce = new byte[ChaCha20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce); - - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20Core(chacha20); - - var ciphertext = new byte[plaintext.Length]; - cipher.Process(nonce, ciphertext, plaintext); - - var decrypted = new byte[plaintext.Length]; - cipher.Process(nonce, decrypted, ciphertext); - - decrypted.Should().Equal(plaintext); - dataSize += 5 * dataSize / 11; - } - } - - - [Fact] - public void ChaCha20ScalarBlockWhenNonceLengthIsEmptyFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var chacha20 = new ChaCha20(key, 0); - var core = new ChaCha20Core(chacha20); - - var nonce = new byte[0]; - var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void ChaCha20ScalarBlockWhenNonceLengthIsInvalidFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var chacha20 = new ChaCha20(key, 0); - var core = new ChaCha20Core(chacha20); - var nonce = new byte[chacha20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; - var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void ChaCha20ScalarBlockWhenLengthIsInvalidFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var chacha20 = new ChaCha20(key, 0); - var core = new ChaCha20Core(chacha20); - var nonce = new byte[chacha20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; - var block = new byte[0]; - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void ChaCha20ScalarBlockTestVector() - { - // https://tools.ietf.org/html/rfc8439#section-2.3.2 - - // Arrange - var key = 
CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); - var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a:00:00:00:00".Replace(":", string.Empty)); - var counter = 1; - - // Act - var chacha20 = new ChaCha20(key, 1); - var core = new ChaCha20Core(chacha20); - var output = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - core.ProcessKeyStreamBlock(nonce, counter, output); - - // Assert - var expected = new uint[16] - { - 0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3, - 0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3, - 0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9, - 0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2, - }; - - output.ToUInt16Array().Should().Equal(expected); - } - - [Fact] - public void ChaCha20ScalarTestVector() - { - // https://tools.ietf.org/html/rfc8439#section-2.4.2 - - // Arrange - foreach (var test in Rfc8439TestVector.Rfc8439TestVectors) - { - // Act - var chacha20 = new ChaCha20(test.Key, test.InitialCounter); - var cipher = new ChaCha20Core(chacha20); - - var output = new byte[test.CipherText.Length]; - cipher.Process(test.Nonce, output, test.CipherText); - - // Assert - output.Should().Equal(test.PlainText); - } - } - - [Theory] - [InlineData(33)] - [InlineData(64)] - [InlineData(65)] - [InlineData(255)] - [InlineData(256)] - [InlineData(511)] - [InlineData(512)] - [InlineData(1023)] - [InlineData(1024)] - public void ScalarCreateVariableLengthCiphers(int size) - { - var input = new byte[size]; - var output = new byte[size]; - - var nonce = new byte[12]; - Array.Fill(nonce, (byte)2); - var key = new byte[32]; - Array.Fill(key, (byte)1); - - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20Core(chacha20); - cipher.Process(nonce, output, input); - var value = Convert.ToHexString(output); - - value.Should().Be(LongKeyStream[..(size*2)]); - } - - [Fact] - public void ChaCha20ScalarTestVectorTC8() - { - // TC8: key: 'All your 
base are belong to us!, IV: 'IETF2013' - // Test vector TC8 from RFC draft by J. Strombergson - // https://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-01 - - // Arrange - var key = new byte[32] - { - 0xC4, 0x6E, 0xC1, 0xB1, 0x8C, 0xE8, 0xA8, 0x78, - 0x72, 0x5A, 0x37, 0xE7, 0x80, 0xDF, 0xB7, 0x35, - 0x1F, 0x68, 0xED, 0x2E, 0x19, 0x4C, 0x79, 0xFB, - 0xC6, 0xAE, 0xBE, 0xE1, 0xA6, 0x67, 0x97, 0x5D - }; - - // The first 4 bytes are set to zero and a large counter - // is used; this makes the RFC 8439 version of ChaCha20 - // compatible with the original specification by D. J. Bernstein. - var nonce = new byte[12] { 0x00, 0x00, 0x00, 0x00, - 0x1A, 0xDA, 0x31, 0xD5, 0xCF, 0x68, 0x82, 0x21 - }; - - // Act - var chacha20 = new ChaCha20(key, 0); - var cipher = new ChaCha20Core(chacha20); - var block0 = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - var block1 = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - cipher.ProcessKeyStreamBlock(nonce, 0, block0); - cipher.ProcessKeyStreamBlock(nonce, 1, block1); - - // Assert - var expected = new byte[128] - { - 0xF6, 0x3A, 0x89, 0xB7, 0x5C, 0x22, 0x71, 0xF9, - 0x36, 0x88, 0x16, 0x54, 0x2B, 0xA5, 0x2F, 0x06, - 0xED, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2B, 0x00, - 0xB5, 0xE8, 0xF8, 0x0A, 0xE9, 0xA4, 0x73, 0xAF, - 0xC2, 0x5B, 0x21, 0x8F, 0x51, 0x9A, 0xF0, 0xFD, - 0xD4, 0x06, 0x36, 0x2E, 0x8D, 0x69, 0xDE, 0x7F, - 0x54, 0xC6, 0x04, 0xA6, 0xE0, 0x0F, 0x35, 0x3F, - 0x11, 0x0F, 0x77, 0x1B, 0xDC, 0xA8, 0xAB, 0x92, - - 0xE5, 0xFB, 0xC3, 0x4E, 0x60, 0xA1, 0xD9, 0xA9, - 0xDB, 0x17, 0x34, 0x5B, 0x0A, 0x40, 0x27, 0x36, - 0x85, 0x3B, 0xF9, 0x10, 0xB0, 0x60, 0xBD, 0xF1, - 0xF8, 0x97, 0xB6, 0x29, 0x0F, 0x01, 0xD1, 0x38, - 0xAE, 0x2C, 0x4C, 0x90, 0x22, 0x5B, 0xA9, 0xEA, - 0x14, 0xD5, 0x18, 0xF5, 0x59, 0x29, 0xDE, 0xA0, - 0x98, 0xCA, 0x7A, 0x6C, 0xCF, 0xE6, 0x12, 0x27, - 0x05, 0x3C, 0x84, 0xE4, 0x9A, 0x4A, 0x33, 0x32 - }; - - CryptoBytes.Combine(block0, block1).Should().Equal(expected); - } - - private const string LongKeyStream = 
"06E1F8D66AC5C75181F3E5ED9FA16AA909A1FB57A4A9B0110C84FCDC0D710880072A4342AF88DEC0138DAF141A3F471C01E77C1FDA90999496D601A36A8C0412E61CF22E8DA3E8DA712DE9F9D38BE4298CB36C0D83AA7DD314841BBDF59644DCD313F9F53B0E06B9D6CB3F0788CE2EE78993D9D27A3EDF0A52589CBB698519D583B68F72F3961AD77C1358394F29B08FE9F98A29F98311723013591E698557A04A73FB277E3E247083444A6C139ADE01BDE3C368C3A484D6824B33C024C0285CBD665D4F2E4DE87BF79565F08FE09766C16639279A243DAE8395F3E0E5D96E711B210355605A5A8E7B50CEA4BA25E4CB0E273488E223CD69FB699BD937A30D33488EF6076192E1ED08758F7F4774E4C0B8E70955D3CAFE790EB40F7725EB87B8BE6BBECDE1E140966973B5B05FDBFBE05C4BC599888693D96AC0C429B75591EF228A243A6EFDBEEEE49F09383AF2D4AFB6305DE60C5D195A44ED646B0CAFCEC5E445562FFFBB56D444C650E2D892FA99BCE78F2EBF866B154FDB110DDF8CAFB7BE4BEA46724B3952906F0C6E81BE7A17E3C95DF350BB970D2C97499924BDCC4EA0E1DE33AA4E62B5C1FC65FFD2728D81A79AE218AE1C639108323C3D22BA1B8C746CAB0CD535C8661CCA4B6B047790EF148A1B9A88CD3CDD8D79389E2F0D9AAAE135B361ED6778A6F6E03186651692F8DABEDF8872939F694C41E2CAD064FF4C537B92AFD0951DF77302749DCDBC9560FCE001DACAAFAA703BDA73007174C549B69EB031324E31BC9F60049E39254146AEB39BEE8A52CAEA1DD31C42346E44EBCC0771A2548D55ABD085323BA69625845F34831E7518F129CB1D80B76D3C94634F38A1226B5E212D917D593838F51D6CC35F87EB500030AB1446D87F6FFC4717B51C619DDAFD75DBA4C25A09C8C961CDA12A9E01203D678AD2ABB4B7D1BED7EBF0C2932DCE5F0C97F9488DD01A7891DC18D5EEFF6129B7942726A5B5110877260E2A78075C666F4410A2F8A2909D03DE0FBE2BFCA2B068B438ADAF767D804BA85278FB930945D15380281C215BC664B6627EE76CBBC8C5355E607721AAAC069B16B78C2F282795E7BF9B6509E7DC36FD2D45A227BF9D20C5E9678A040B63E964817F98B5F4828EB5D66740C595304D08A0A3C5A50EE3B3F99D2269992DD400A5B452A213DCD2579F7A193FC7FE33E498E91203DE19FF9D54BEBDE9E124A17E784430C38110FE3552861737DE1F2B7678F63417FE2224ED6571D43A8015F6F81362E7B95CB93C86735787F0980B0A3A65549844768EDF0DDEC75A24FA1EF5A26640932F65FF141CAEE2E14506A34E925C21BC268769CD95328675953E79B4B375912434834018ADD9C1832057EE4386C95B6E9407346B4A1582FB3C095E4B0882087DB48F081B5C0
DE69ADBC447A6BA2ED6A4F90909911CD3B51ECEC2C6BE6EFE"; - } -} diff --git a/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs b/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs deleted file mode 100644 index 32c0b96..0000000 --- a/test/NaCl.Core.SimdTests/Salsa20IntrinsicsTests.cs +++ /dev/null @@ -1,431 +0,0 @@ -namespace NaCl.Core.SimdTests -{ - using System; - using System.Collections.Generic; - using System.IO; - using System.Net.Http; - using System.Security.Cryptography; - using System.Text; - - using FluentAssertions; - using Xunit; - using Xunit.Abstractions; - - using Base; - using Internal; - using Vectors; - using System.Linq; - using NaCl.Core.Base.SalsaCore; - - public class Salsa20IntrinsicsTests - { - private readonly ITestOutputHelper _output; - - public Salsa20IntrinsicsTests(ITestOutputHelper output) => _output = output; - - [Fact] - public void HSalsa20IntrinsicsTestVector1() - { - // 8. Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf - - // Arrange - var shared = new byte[32] - { - 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, - 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, - 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, - 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 - }; - var zero = new byte[32]; - var c = new byte[16] // SIGMA - { - 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x20, 0x33, - 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20, 0x6b - }; - - var salsa20 = new XSalsa20(shared); - var cipher = new Salsa20CoreIntrinsics(salsa20); - - // Act - var firstKey = new byte[32]; - cipher.HSalsa20(firstKey, zero); - - // Assert - firstKey.Should().Equal(new byte[] - { - 0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4, - 0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7, - 0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2, - 0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89 - }); - } - - [Fact] - public void HSalsa20IntrinsicsTestVector2() - { - // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf - - // Arrange - var firstKey = new byte[32] - { - 0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4, - 0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7, - 0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2, - 0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89 - }; - var noncePrefix = new byte[16] - { - 0x69, 0x69, 0x6e, 0xe9, 0x55, 0xb6, 0x2b, 0x73, - 0xcd, 0x62, 0xbd, 0xa8, 0x75, 0xfc, 0x73, 0xd6 - }; - var c = new byte[16] // SIGMA - { - 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x20, 0x33, - 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20, 0x6b - }; - - var salsa20 = new XSalsa20(firstKey); - var cipher = new Salsa20CoreIntrinsics(salsa20); - - // Act - var secondKey = new byte[32]; - cipher.HSalsa20(secondKey, noncePrefix); - - // Assert - secondKey.Should().Equal(new byte[] - { - 0xdc, 0x90, 0x8d, 0xda, 0x0b, 0x93, 0x44, 0xa9, - 0x53, 0x62, 0x9b, 0x73, 0x38, 0x20, 0x77, 0x88, - 0x80, 0xf3, 0xce, 0xb4, 0x21, 0xbb, 0x61, 0xb9, - 0x1c, 0xbd, 0x4c, 0x3e, 0x66, 0x25, 0x6c, 0xe4 - }); - } - - [Fact] - public void HSalsa20IntrinsicsTestVector3() - { - // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf - - // Arrange - var k = new byte[32] - { - 0xee, 0x30, 0x4f, 0xca, 0x27, 0x00, 0x8d, 0x8c, - 0x12, 0x6f, 0x90, 0x02, 0x79, 0x01, 0xd8, 0x0f, - 0x7f, 0x1d, 0x8b, 0x8d, 0xc9, 0x36, 0xcf, 0x3b, - 0x9f, 0x81, 0x96, 0x92, 0x82, 0x7e, 0x57, 0x77 - }; - var n = new byte[16] - { - 0x81, 0x91, 0x8e, 0xf2, 0xa5, 0xe0, 0xda, 0x9b, - 0x3e, 0x90, 0x60, 0x52, 0x1e, 0x4b, 0xb3, 0x52 - }; - - var salsa20 = new XSalsa20(k); - var cipher = new Salsa20CoreIntrinsics(salsa20); - - // Act - var output = new byte[32]; - cipher.HSalsa20(output, n); - - // Assert - output.Should().Equal(new byte[] - { - 0xbc, 0x1b, 0x30, 0xfc, 0x07, 0x2c, 0xc1, 0x40, - 0x75, 0xe4, 0xba, 0xa7, 0x31, 0xb5, 0xa8, 0x45, - 0xea, 0x9b, 0x11, 0xe9, 0xa5, 0x19, 0x1f, 0x94, - 0xe1, 0x8c, 0xba, 0x8f, 0xd8, 0x21, 0xa7, 0xcd - }); - } - - [Fact] - public void IntrinsicsEncryptDecrypt1BlockTest() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key); - - var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce); - - var expected = Encoding.UTF8.GetBytes("This is a secret content!!"); - - var salsa20 = new Salsa20(key, 0); - var cipher = new Salsa20CoreIntrinsics(salsa20); - - // Act - var ciphertext = new byte[expected.Length]; - cipher.Process(nonce, ciphertext, expected); - - var plaintext = new byte[expected.Length]; - cipher.Process(nonce, plaintext, ciphertext); - - // Assert - plaintext.Should().Equal(expected); - } - - [Fact] - public void IntrinsicsEncryptDecryptNBlocksTest() - { - // Arrange - var rnd = new Random(); - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; - - for (var i = 0; i < 64; i++) - { - RandomNumberGenerator.Fill(key); - RandomNumberGenerator.Fill(nonce); - - var salsa20 = new Salsa20(key, 0); - var cipher = new Salsa20CoreIntrinsics(salsa20); - - for (var j = 0; j < 64; j++) - { - 
var expected = new byte[rnd.Next(300)]; - rnd.NextBytes(expected); - - var ciphertext = new byte[expected.Length]; - var plaintext = new byte[expected.Length]; - - // Act - cipher.Process(nonce, ciphertext, expected); - cipher.Process(nonce, plaintext, ciphertext); - - // Assert - plaintext.Should().Equal(expected); - } - } - } - - [Fact] - public void IntrinsicsEncryptDecryptLongMessagesTest() - { - var rnd = new Random(); - - var dataSize = 16; - while (dataSize <= 1 << 24) - { - var plaintext = new byte[dataSize]; - rnd.NextBytes(plaintext); - - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key); - - var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce); - - var salsa20 = new Salsa20(key, 0); - var cipher = new Salsa20CoreIntrinsics(salsa20); - - var ciphertext = new byte[plaintext.Length]; - cipher.Process(nonce, ciphertext, plaintext); - - var decrypted = new byte[plaintext.Length]; - cipher.Process(nonce, decrypted, ciphertext); - - decrypted.Should().Equal(plaintext); - dataSize += 5 * dataSize / 11; - } - } - - [Fact] - public void Salsa20IntrinsicsBlockWhenNonceLengthIsEmptyFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var salsa20 = new Salsa20(key, 0); - var nonce = new byte[0]; - var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - var core = new Salsa20CoreIntrinsics(salsa20); - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void Salsa20IntrinsicsBlockWhenNonceLengthIsInvalidFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var salsa20 = new Salsa20(key, 0); - var nonce = new byte[salsa20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; - var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - var core = new Salsa20CoreIntrinsics(salsa20); - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - 
act.Should().Throw(); - } - - [Fact] - public void Salsa20IntrinsicsBlockWhenLengthIsInvalidFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var salsa20 = new Salsa20(key, 0); - var nonce = new byte[salsa20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; - var block = new byte[0]; - var core = new Salsa20CoreIntrinsics(salsa20); - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void Salsa20IntrinsicsBlockTestVector() - { - // Arrange - var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); - var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a".Replace(":", string.Empty)); - var counter = 1; - - // Act - var salsa20 = new Salsa20(key, 1); - var output = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - var core = new Salsa20CoreIntrinsics(salsa20); - core.ProcessKeyStreamBlock(nonce, counter, output); - - // Assert - var expected = new uint[16] - { - 3649387971u, 3432934094u, 2867581180u, 544842727u, - 3442094382u, 3233001746u, 2484653980u, 586338650u, - 3037335121u, 3388889956u, 1351682463u, 2284954070u, - 3021171268u, 2617586057u, 3288245149u, 2763695160u }; - - output.ToUInt16Array().Should().Equal(expected); - } - - public static IEnumerable Salsa20TestData => ParseTestVectors(GetTestVector()).Select(d => new object[] { d }); - - [Theory] - [MemberData(nameof(Salsa20TestData))] - public void Salsa20IntrinsicsProcessTestVectors(Salsa20TestVector test) - { - _output.WriteLine($"Salsa20 - {test.Name}"); - - var input = new byte[512]; - var output = new byte[512]; - - var cipher = new Salsa20(test.Key, 0); - var core = new Salsa20CoreIntrinsics(cipher); - core.Process(test.IV, output, input); - - ToBlock1(output).Should().Be(test.ExpectedBlock1); - ToBlock4(output).Should().Be(test.ExpectedBlock4); - 
ToBlock5(output).Should().Be(test.ExpectedBlock5); - ToBlock8(output).Should().Be(test.ExpectedBlock8); - } - - [Theory] - [InlineData(33)] - [InlineData(64)] - [InlineData(65)] - [InlineData(255)] - [InlineData(256)] - [InlineData(511)] - [InlineData(512)] - [InlineData(1023)] - [InlineData(1024)] - public void IntrinsicsCreateVariableLengthCiphers(int size) - { - var input = new byte[size]; - var output = new byte[size]; - - var nonce = new byte[8]; - Array.Fill(nonce, (byte)2); - - var key = new byte[32]; - Array.Fill(key, (byte)1); - - var cipher = new Salsa20(key, 0); - cipher.Encrypt(input, nonce, output); - var value = Convert.ToHexString(output); - - value.Should().Be(LongKeyStream[..(size*2)]); - } - - private static string GetTestVector() - { - try - { - using var client = new HttpClient(); - return client.GetStringAsync("https://github.com/das-labor/legacy/raw/master/microcontroller-2/arm-crypto-lib/testvectors/salsa20-256.64-verified.test-vectors").Result; - } - catch (Exception) - { - return File.ReadAllText(@"Vectors\salsa20-256.64-verified.test-vectors"); - } - } - - private static IList ParseTestVectors(string raw) - { - var lines = raw.Split(new[] { '\r', '\n' }); - - var result = new List(); - - string ReadValue(string toFind, int idx, int len) - { - var toFindIdx = lines[idx].IndexOf(toFind, StringComparison.Ordinal) + toFind.Length; - return lines[idx].Substring(toFindIdx, len); - } - - for (var i = 0; i < lines.Length; i++) - { - if (!lines[i].StartsWith("Set ")) - continue; - - // We skip Set 6 vector tests for now... 
- if (!lines[i + 8].Contains("stream[192..255] = ")) - continue; - - var name = lines[i].Replace(":", ""); - - var key = ReadValue("key = ", i + 1, 32); - key += lines[i + 2].Trim(); - - var iv = ReadValue("IV = ", i + 3, 16); - - var block1 = ReadValue("stream[0..63] = ", i + 4, 32); - block1 += lines[i + 5].Trim(); - block1 += lines[i + 6].Trim(); - block1 += lines[i + 7].Trim(); - - var block4 = ReadValue("stream[192..255] = ", i + 8, 32); - block4 += lines[i + 9].Trim(); - block4 += lines[i + 10].Trim(); - block4 += lines[i + 11].Trim(); - - var block5 = ReadValue("stream[256..319] = ", i + 12, 32); - block5 += lines[i + 13].Trim(); - block5 += lines[i + 14].Trim(); - block5 += lines[i + 15].Trim(); - - var block8 = ReadValue("stream[448..511] = ", i + 16, 32); - block8 += lines[i + 17].Trim(); - block8 += lines[i + 18].Trim(); - block8 += lines[i + 19].Trim(); - - result.Add(new Salsa20TestVector(name, key, iv, block1, block4, block5, block8)); - i += 20; - } - - return result; - } - - private static string ToBlock1(byte[] output) => CryptoBytes.ToHexStringUpper(output[0..64]); - - private static string ToBlock4(byte[] output) => CryptoBytes.ToHexStringUpper(output[192..256]); - - private static string ToBlock5(byte[] output) => CryptoBytes.ToHexStringUpper(output[256..320]); - - private static string ToBlock8(byte[] output) => CryptoBytes.ToHexStringUpper(output[448..512]); - - private const string LongKeyStream = 
"A3D1F8292CAB0B2096AB2AA26FC59AAF3EE159B39FC6029EF160D82EC80FA110FF958AB802861180EC006F8C8450030024A2D7744BF564C1782F15DB6681144C65A730622A14AE9A4E95F753289A6D2DBBEE47B457B57DB75C009B287BF240EBE02890581E3628BDBCC9B79E93500CA15F6E10D4EBCAAFC2FB936AF2EC05BBCB1610036E840621D7CE53E4A06822D6073EA0FA8943EDFB70E45B4D2525AE4B616BD08B33F23A7E0B6CD501E80B8E80B7423E7C9D5D900AE2194AF0CF4A74D721534063D3F17BC7993B5B3EC20A373F933B43CEB6987934C1456521F098BA0CB1205109F534F80D4EA1767EA9DFC08BED97BE40C539DD37EC24EAE0C68AC1B56DD0189747A4B8278B1E0E5206EAE893C0E45C76751002F38924B8C9A036CFAB9E3D44C1E323BCE43F2C69EB8212994803C1D2AC00C3B8F97DA6D09F29B974E0DF4D6D36C9D2E88C2D7B73AB399C0920A2996A4727272339D991C6BF45CE63C2DEF3FC9C2625F87EA6268C196829BB1F7E659736AF4B0CC2A771FB0962B19005E53DD880879C052556312BA353B51C26D5F5949464EAECE15ACA240E339BF3C581E7D93D220B1C3C0DE87F65B4F340DAB924EB72072211C41B18770230A3A123619006BE5FD4ABAAFD2BFAD0F34D5FB491DEBEBF5CA9EC92D997B5A171482CC6E949C70759A0B8EC64D590B6FFF6500E8425C3AE4178C2EDE996C0003F6FA76A6D90F49D6D3D128C0DE82EA8C7C16415DDD07081940701677C32D5B5E3BB57A93315474C5B648D31AA7AE52FCD63BF22550900077FF5CF6A5F5148B285E34A57A3DA1BEB0662A20C23857CA8D5D1748F654F54F42F30CD413F408A0C7B31F57AD59E9F152DBDEEA3EA9C3DBB3517615735CFF0226E179C4A9149C6477A2903B338AE308300A86D91043E2AA437C5F2A77A49B547B05BD98CEBE49500FF367CE204157BB3EFD182A8A96FCC31025D4C948105F6762F22357446367B87A01FA3F954D52810CBE5C4EEB04C3AE827973E481F3C38EF14A6F0FE3FB2D89969D2CCB0DFB63D7366D91F29DDBF1EB90B136191745B8AC8B8F0AAEF4D3A1C763D63AED1E76CC7B920979CB8163C413273CA1A563C37B925A0251C9AD31363F978437D92437A0D250C7F221C00F2E13CF371554DF191ECDDB46C95659739A1CDC257A067D9251FE89EA328D313C4D7EF8E33614FFC4C615D3195CD6282D82633067C81E1F563DA307B14253CBF0492256A409E3007EB6A4A7BDA694E1FFA9B5106AB9868CC359B976441C7B362C03E501D8B3FBEF98771A41C4DA542DB8DA4761EA3792695288437DEAC50E7B6A62E6D00B7511A5DB0E567090ADDDFCF0521F6DD62F969D5BE89378DB127219C38931A0AEDBCE784C35D4215B09B1F96732615813753B67846E9505DF974F4B
1ECDFBD0C850A9644D720884B80B4FE4CC08508A8A65D1C5F"; - } -} \ No newline at end of file diff --git a/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs b/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs deleted file mode 100644 index 167c989..0000000 --- a/test/NaCl.Core.SimdTests/Salsa20ScalarTests.cs +++ /dev/null @@ -1,431 +0,0 @@ -namespace NaCl.Core.SimdTests -{ - using System; - using System.Collections.Generic; - using System.IO; - using System.Net.Http; - using System.Security.Cryptography; - using System.Text; - - using FluentAssertions; - using Xunit; - using Xunit.Abstractions; - - using Base; - using Internal; - using Vectors; - using System.Linq; - using NaCl.Core.Base.SalsaCore; - - public class Salsa20ScalarTests - { - private readonly ITestOutputHelper _output; - - public Salsa20ScalarTests(ITestOutputHelper output) => _output = output; - - [Fact] - public void HSalsa20ScalarTestVector1() - { - // 8. Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf - - // Arrange - var shared = new byte[32] - { - 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, - 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, - 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, - 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 - }; - var zero = new byte[32]; - var c = new byte[16] // SIGMA - { - 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x20, 0x33, - 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20, 0x6b - }; - - var salsa20 = new XSalsa20(shared); - var cipher = new Salsa20Core(salsa20); - - // Act - var firstKey = new byte[32]; - cipher.HSalsa20(firstKey, zero); - - // Assert - firstKey.Should().Equal(new byte[] - { - 0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4, - 0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7, - 0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2, - 0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89 - }); - } - - [Fact] - public void HSalsa20ScalarTestVector2() - { - // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf - - // Arrange - var firstKey = new byte[32] - { - 0x1b, 0x27, 0x55, 0x64, 0x73, 0xe9, 0x85, 0xd4, - 0x62, 0xcd, 0x51, 0x19, 0x7a, 0x9a, 0x46, 0xc7, - 0x60, 0x09, 0x54, 0x9e, 0xac, 0x64, 0x74, 0xf2, - 0x06, 0xc4, 0xee, 0x08, 0x44, 0xf6, 0x83, 0x89 - }; - var noncePrefix = new byte[16] - { - 0x69, 0x69, 0x6e, 0xe9, 0x55, 0xb6, 0x2b, 0x73, - 0xcd, 0x62, 0xbd, 0xa8, 0x75, 0xfc, 0x73, 0xd6 - }; - var c = new byte[16] // SIGMA - { - 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x20, 0x33, - 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20, 0x6b - }; - - var salsa20 = new XSalsa20(firstKey); - var cipher = new Salsa20Core(salsa20); - - // Act - var secondKey = new byte[32]; - cipher.HSalsa20(secondKey, noncePrefix); - - // Assert - secondKey.Should().Equal(new byte[] - { - 0xdc, 0x90, 0x8d, 0xda, 0x0b, 0x93, 0x44, 0xa9, - 0x53, 0x62, 0x9b, 0x73, 0x38, 0x20, 0x77, 0x88, - 0x80, 0xf3, 0xce, 0xb4, 0x21, 0xbb, 0x61, 0xb9, - 0x1c, 0xbd, 0x4c, 0x3e, 0x66, 0x25, 0x6c, 0xe4 - }); - } - - [Fact] - public void HSalsa20ScalarTestVector3() - { - // 8. 
Example of the long stream, ref: https://cr.yp.to/highspeed/naclcrypto-20090310.pdf - - // Arrange - var k = new byte[32] - { - 0xee, 0x30, 0x4f, 0xca, 0x27, 0x00, 0x8d, 0x8c, - 0x12, 0x6f, 0x90, 0x02, 0x79, 0x01, 0xd8, 0x0f, - 0x7f, 0x1d, 0x8b, 0x8d, 0xc9, 0x36, 0xcf, 0x3b, - 0x9f, 0x81, 0x96, 0x92, 0x82, 0x7e, 0x57, 0x77 - }; - var n = new byte[16] - { - 0x81, 0x91, 0x8e, 0xf2, 0xa5, 0xe0, 0xda, 0x9b, - 0x3e, 0x90, 0x60, 0x52, 0x1e, 0x4b, 0xb3, 0x52 - }; - - var salsa20 = new XSalsa20(k); - var cipher = new Salsa20Core(salsa20); - - // Act - var output = new byte[32]; - cipher.HSalsa20(output, n); - - // Assert - output.Should().Equal(new byte[] - { - 0xbc, 0x1b, 0x30, 0xfc, 0x07, 0x2c, 0xc1, 0x40, - 0x75, 0xe4, 0xba, 0xa7, 0x31, 0xb5, 0xa8, 0x45, - 0xea, 0x9b, 0x11, 0xe9, 0xa5, 0x19, 0x1f, 0x94, - 0xe1, 0x8c, 0xba, 0x8f, 0xd8, 0x21, 0xa7, 0xcd - }); - } - - [Fact] - public void ScalarEncryptDecrypt1BlockTest() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key); - - var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce); - - var expected = Encoding.UTF8.GetBytes("This is a secret content!!"); - - var salsa20 = new Salsa20(key, 0); - var cipher = new Salsa20Core(salsa20); - - // Act - var ciphertext = new byte[expected.Length]; - cipher.Process(nonce, ciphertext, expected); - - var plaintext = new byte[expected.Length]; - cipher.Process(nonce, plaintext, ciphertext); - - // Assert - plaintext.Should().Equal(expected); - } - - [Fact] - public void ScalarEncryptDecryptNBlocksTest() - { - // Arrange - var rnd = new Random(); - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; - - for (var i = 0; i < 64; i++) - { - RandomNumberGenerator.Fill(key); - RandomNumberGenerator.Fill(nonce); - - var salsa20 = new Salsa20(key, 0); - var cipher = new Salsa20Core(salsa20); - - for (var j = 0; j < 64; j++) - { - var expected = new byte[rnd.Next(300)]; 
- rnd.NextBytes(expected); - - var ciphertext = new byte[expected.Length]; - var plaintext = new byte[expected.Length]; - - // Act - cipher.Process(nonce, ciphertext, expected); - cipher.Process(nonce, plaintext, ciphertext); - - // Assert - plaintext.Should().Equal(expected); - } - } - } - - [Fact] - public void ScalarEncryptDecryptLongMessagesTest() - { - var rnd = new Random(); - - var dataSize = 16; - while (dataSize <= 1 << 24) - { - var plaintext = new byte[dataSize]; - rnd.NextBytes(plaintext); - - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(key); - - var nonce = new byte[Salsa20.NONCE_SIZE_IN_BYTES]; - RandomNumberGenerator.Fill(nonce); - - var salsa20 = new Salsa20(key, 0); - var cipher = new Salsa20Core(salsa20); - - var ciphertext = new byte[plaintext.Length]; - cipher.Process(nonce, ciphertext, plaintext); - - var decrypted = new byte[plaintext.Length]; - cipher.Process(nonce, decrypted, ciphertext); - - decrypted.Should().Equal(plaintext); - dataSize += 5 * dataSize / 11; - } - } - - [Fact] - public void Salsa20ScalarBlockWhenNonceLengthIsEmptyFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var salsa20 = new Salsa20(key, 0); - var nonce = new byte[0]; - var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - var core = new Salsa20Core(salsa20); - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void Salsa20ScalarBlockWhenNonceLengthIsInvalidFails() - { - // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var salsa20 = new Salsa20(key, 0); - var nonce = new byte[salsa20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; - var block = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - var core = new Salsa20Core(salsa20); - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void Salsa20ScalarBlockWhenLengthIsInvalidFails() - { 
- // Arrange - var key = new byte[Snuffle.KEY_SIZE_IN_BYTES]; - - var salsa20 = new Salsa20(key, 0); - var nonce = new byte[salsa20.NonceSizeInBytes + TestHelpers.ReturnRandomPositiveNegative()]; - var block = new byte[0]; - var core = new Salsa20Core(salsa20); - - // Act & Assert - var act = () => core.ProcessKeyStreamBlock(nonce, 0, block); - act.Should().Throw(); - } - - [Fact] - public void Salsa20ScalarBlockTestVector() - { - // Arrange - var key = CryptoBytes.FromHexString("00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f".Replace(":", string.Empty)); - var nonce = CryptoBytes.FromHexString("00:00:00:09:00:00:00:4a".Replace(":", string.Empty)); - var counter = 1; - - // Act - var salsa20 = new Salsa20(key, 1); - var output = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - var core = new Salsa20Core(salsa20); - core.ProcessKeyStreamBlock(nonce, counter, output); - - // Assert - var expected = new uint[16] - { - 3649387971u, 3432934094u, 2867581180u, 544842727u, - 3442094382u, 3233001746u, 2484653980u, 586338650u, - 3037335121u, 3388889956u, 1351682463u, 2284954070u, - 3021171268u, 2617586057u, 3288245149u, 2763695160u }; - - output.ToUInt16Array().Should().Equal(expected); - } - - public static IEnumerable Salsa20TestData => ParseTestVectors(GetTestVector()).Select(d => new object[] { d }); - - [Theory] - [MemberData(nameof(Salsa20TestData))] - public void Salsa20ScalarProcessTestVectors(Salsa20TestVector test) - { - _output.WriteLine($"Salsa20 - {test.Name}"); - - var input = new byte[512]; - var output = new byte[512]; - - var cipher = new Salsa20(test.Key, 0); - var core = new Salsa20Core(cipher); - core.Process(test.IV, output, input); - - ToBlock1(output).Should().Be(test.ExpectedBlock1); - ToBlock4(output).Should().Be(test.ExpectedBlock4); - ToBlock5(output).Should().Be(test.ExpectedBlock5); - ToBlock8(output).Should().Be(test.ExpectedBlock8); - } - - [Theory] - [InlineData(33)] - [InlineData(64)] - 
[InlineData(65)] - [InlineData(255)] - [InlineData(256)] - [InlineData(511)] - [InlineData(512)] - [InlineData(1023)] - [InlineData(1024)] - public void ScalarCreateVariableLengthCiphers(int size) - { - var input = new byte[size]; - var output = new byte[size]; - - var nonce = new byte[8]; - Array.Fill(nonce, (byte)2); - - var key = new byte[32]; - Array.Fill(key, (byte)1); - - var cipher = new Salsa20(key, 0); - cipher.Encrypt(input, nonce, output); - var value = Convert.ToHexString(output); - - value.Should().Be(LongKeyStream[..(size*2)]); - } - - private static string GetTestVector() - { - try - { - using var client = new HttpClient(); - return client.GetStringAsync("https://github.com/das-labor/legacy/raw/master/microcontroller-2/arm-crypto-lib/testvectors/salsa20-256.64-verified.test-vectors").Result; - } - catch (Exception) - { - return File.ReadAllText(@"Vectors\salsa20-256.64-verified.test-vectors"); - } - } - - private static IList ParseTestVectors(string raw) - { - var lines = raw.Split(new[] { '\r', '\n' }); - - var result = new List(); - - string ReadValue(string toFind, int idx, int len) - { - var toFindIdx = lines[idx].IndexOf(toFind, StringComparison.Ordinal) + toFind.Length; - return lines[idx].Substring(toFindIdx, len); - } - - for (var i = 0; i < lines.Length; i++) - { - if (!lines[i].StartsWith("Set ")) - continue; - - // We skip Set 6 vector tests for now... 
- if (!lines[i + 8].Contains("stream[192..255] = ")) - continue; - - var name = lines[i].Replace(":", ""); - - var key = ReadValue("key = ", i + 1, 32); - key += lines[i + 2].Trim(); - - var iv = ReadValue("IV = ", i + 3, 16); - - var block1 = ReadValue("stream[0..63] = ", i + 4, 32); - block1 += lines[i + 5].Trim(); - block1 += lines[i + 6].Trim(); - block1 += lines[i + 7].Trim(); - - var block4 = ReadValue("stream[192..255] = ", i + 8, 32); - block4 += lines[i + 9].Trim(); - block4 += lines[i + 10].Trim(); - block4 += lines[i + 11].Trim(); - - var block5 = ReadValue("stream[256..319] = ", i + 12, 32); - block5 += lines[i + 13].Trim(); - block5 += lines[i + 14].Trim(); - block5 += lines[i + 15].Trim(); - - var block8 = ReadValue("stream[448..511] = ", i + 16, 32); - block8 += lines[i + 17].Trim(); - block8 += lines[i + 18].Trim(); - block8 += lines[i + 19].Trim(); - - result.Add(new Salsa20TestVector(name, key, iv, block1, block4, block5, block8)); - i += 20; - } - - return result; - } - - private static string ToBlock1(byte[] output) => CryptoBytes.ToHexStringUpper(output[0..64]); - - private static string ToBlock4(byte[] output) => CryptoBytes.ToHexStringUpper(output[192..256]); - - private static string ToBlock5(byte[] output) => CryptoBytes.ToHexStringUpper(output[256..320]); - - private static string ToBlock8(byte[] output) => CryptoBytes.ToHexStringUpper(output[448..512]); - - private const string LongKeyStream = 
"A3D1F8292CAB0B2096AB2AA26FC59AAF3EE159B39FC6029EF160D82EC80FA110FF958AB802861180EC006F8C8450030024A2D7744BF564C1782F15DB6681144C65A730622A14AE9A4E95F753289A6D2DBBEE47B457B57DB75C009B287BF240EBE02890581E3628BDBCC9B79E93500CA15F6E10D4EBCAAFC2FB936AF2EC05BBCB1610036E840621D7CE53E4A06822D6073EA0FA8943EDFB70E45B4D2525AE4B616BD08B33F23A7E0B6CD501E80B8E80B7423E7C9D5D900AE2194AF0CF4A74D721534063D3F17BC7993B5B3EC20A373F933B43CEB6987934C1456521F098BA0CB1205109F534F80D4EA1767EA9DFC08BED97BE40C539DD37EC24EAE0C68AC1B56DD0189747A4B8278B1E0E5206EAE893C0E45C76751002F38924B8C9A036CFAB9E3D44C1E323BCE43F2C69EB8212994803C1D2AC00C3B8F97DA6D09F29B974E0DF4D6D36C9D2E88C2D7B73AB399C0920A2996A4727272339D991C6BF45CE63C2DEF3FC9C2625F87EA6268C196829BB1F7E659736AF4B0CC2A771FB0962B19005E53DD880879C052556312BA353B51C26D5F5949464EAECE15ACA240E339BF3C581E7D93D220B1C3C0DE87F65B4F340DAB924EB72072211C41B18770230A3A123619006BE5FD4ABAAFD2BFAD0F34D5FB491DEBEBF5CA9EC92D997B5A171482CC6E949C70759A0B8EC64D590B6FFF6500E8425C3AE4178C2EDE996C0003F6FA76A6D90F49D6D3D128C0DE82EA8C7C16415DDD07081940701677C32D5B5E3BB57A93315474C5B648D31AA7AE52FCD63BF22550900077FF5CF6A5F5148B285E34A57A3DA1BEB0662A20C23857CA8D5D1748F654F54F42F30CD413F408A0C7B31F57AD59E9F152DBDEEA3EA9C3DBB3517615735CFF0226E179C4A9149C6477A2903B338AE308300A86D91043E2AA437C5F2A77A49B547B05BD98CEBE49500FF367CE204157BB3EFD182A8A96FCC31025D4C948105F6762F22357446367B87A01FA3F954D52810CBE5C4EEB04C3AE827973E481F3C38EF14A6F0FE3FB2D89969D2CCB0DFB63D7366D91F29DDBF1EB90B136191745B8AC8B8F0AAEF4D3A1C763D63AED1E76CC7B920979CB8163C413273CA1A563C37B925A0251C9AD31363F978437D92437A0D250C7F221C00F2E13CF371554DF191ECDDB46C95659739A1CDC257A067D9251FE89EA328D313C4D7EF8E33614FFC4C615D3195CD6282D82633067C81E1F563DA307B14253CBF0492256A409E3007EB6A4A7BDA694E1FFA9B5106AB9868CC359B976441C7B362C03E501D8B3FBEF98771A41C4DA542DB8DA4761EA3792695288437DEAC50E7B6A62E6D00B7511A5DB0E567090ADDDFCF0521F6DD62F969D5BE89378DB127219C38931A0AEDBCE784C35D4215B09B1F96732615813753B67846E9505DF974F4B
1ECDFBD0C850A9644D720884B80B4FE4CC08508A8A65D1C5F"; - } -} \ No newline at end of file diff --git a/test/NaCl.Core.SimdTests/TestHelpers.cs b/test/NaCl.Core.SimdTests/TestHelpers.cs deleted file mode 100644 index 95a81eb..0000000 --- a/test/NaCl.Core.SimdTests/TestHelpers.cs +++ /dev/null @@ -1,91 +0,0 @@ -namespace NaCl.Core.SimdTests -{ - using System; - using System.Collections.Generic; - using System.Linq; - - using FluentAssertions; - using NaCl.Core.SimdTests; - - public static class TestHelpers - { - private static readonly Random _random = new Random(); - private static readonly object _sync = new object(); - - private static int Random(int min, int max) - { - lock (_sync) - { - return _random.Next(min, max); - } - } - - public static void AssertEqualBytes(byte[] expected, byte[] actual) - => BitConverter.ToString(actual).Should().Be(BitConverter.ToString(expected)); - - public static ArraySegment Pad(this byte[] array) - => array.Pad(Random(1, 100), Random(0, 50)); - - private static ArraySegment Pad(this byte[] array, int paddingLeft, int paddingRight) - { - byte padByte = 0xE7; - if (array.Length > 0) - { - if (array[0] == padByte) - padByte++; - if (array[array.Length - 1] == padByte) - padByte++; - } - var resultBytes = Enumerable.Repeat(padByte, paddingLeft).Concat(array).Concat(Enumerable.Repeat(padByte, paddingRight)).ToArray(); - return new ArraySegment(resultBytes, paddingLeft, array.Length); - } - - public static byte[] UnPad(this ArraySegment paddedData) - { - var padByte = paddedData.Array[0]; - if (padByte < 0xE7 || padByte > 0xE9) - throw new ArgumentException("Padding invalid"); - - for (var i = 0; i < paddedData.Offset; i++) - { - if (paddedData.Array[i] != padByte) - throw new ArgumentException("Padding invalid"); - } - for (var i = paddedData.Offset + paddedData.Count; i < paddedData.Array.Length; i++) - { - if (paddedData.Array[i] != padByte) - throw new ArgumentException("Padding invalid"); - } - return paddedData.ToArray(); - } - - 
public static int ReturnRandomPositiveNegative() => Random(0, 2) * 2 - 1; - - public static IEnumerable WithChangedBit(this byte[] array) - { - for (var i = 0; i < array.Length; i++) - { - for (var bit = 0; bit < 8; bit++) - { - var result = array.ToArray(); - result[i] ^= (byte)(1 << bit); - yield return result; - } - } - } - - private static byte[] ToArray(this ArraySegment segment) - { - var result = new byte[segment.Count]; - Array.Copy(segment.Array, segment.Offset, result, 0, segment.Count); - return result; - } - - public static uint[] ToUInt16Array(this byte[] source) - { - var decoded = new uint[source.Length / 4]; - Buffer.BlockCopy(source, 0, decoded, 0, source.Length); - return decoded; - } - } -} diff --git a/test/NaCl.Core.SimdTests/Vectors/HChaCha20TestVector.cs b/test/NaCl.Core.SimdTests/Vectors/HChaCha20TestVector.cs deleted file mode 100644 index 6d73b61..0000000 --- a/test/NaCl.Core.SimdTests/Vectors/HChaCha20TestVector.cs +++ /dev/null @@ -1,67 +0,0 @@ -namespace NaCl.Core.SimdTests.Vectors -{ - using Internal; - - public class HChaCha20TestVector - { - public byte[] Key { get; private set; } - public byte[] Input { get; private set; } - public byte[] Output { get; private set; } - - public HChaCha20TestVector(string key, string input, string output) - { - Key = CryptoBytes.FromHexString(key); - Input = CryptoBytes.FromHexString(input); - Output = CryptoBytes.FromHexString(output); - } - - public static HChaCha20TestVector[] HChaCha20TestVectors = { - // From libsodium's test/default/xchacha20.c (tv_hchacha20). 
- new HChaCha20TestVector( - "24f11cce8a1b3d61e441561a696c1c1b7e173d084fd4812425435a8896a013dc", - "d9660c5900ae19ddad28d6e06e45fe5e", - "5966b3eec3bff1189f831f06afe4d4e3be97fa9235ec8c20d08acfbbb4e851e3"), - new HChaCha20TestVector( - "80a5f6272031e18bb9bcd84f3385da65e7731b7039f13f5e3d475364cd4d42f7", - "c0eccc384b44c88e92c57eb2d5ca4dfa", - "6ed11741f724009a640a44fce7320954c46e18e0d7ae063bdbc8d7cf372709df"), - new HChaCha20TestVector( - "cb1fc686c0eec11a89438b6f4013bf110e7171dace3297f3a657a309b3199629", - "fcd49b93e5f8f299227e64d40dc864a3", - "84b7e96937a1a0a406bb7162eeaad34308d49de60fd2f7ec9dc6a79cbab2ca34"), - new HChaCha20TestVector( - "6640f4d80af5496ca1bc2cfff1fefbe99638dbceaabd7d0ade118999d45f053d", - "31f59ceeeafdbfe8cae7914caeba90d6", - "9af4697d2f5574a44834a2c2ae1a0505af9f5d869dbe381a994a18eb374c36a0"), - new HChaCha20TestVector( - "0693ff36d971225a44ac92c092c60b399e672e4cc5aafd5e31426f123787ac27", - "3a6293da061da405db45be1731d5fc4d", - "f87b38609142c01095bfc425573bb3c698f9ae866b7e4216840b9c4caf3b0865"), - new HChaCha20TestVector( - "809539bd2639a23bf83578700f055f313561c7785a4a19fc9114086915eee551", - "780c65d6a3318e479c02141d3f0b3918", - "902ea8ce4680c09395ce71874d242f84274243a156938aaa2dd37ac5be382b42"), - new HChaCha20TestVector( - "1a170ddf25a4fd69b648926e6d794e73408805835c64b2c70efddd8cd1c56ce0", - "05dbee10de87eb0c5acb2b66ebbe67d3", - "a4e20b634c77d7db908d387b48ec2b370059db916e8ea7716dc07238532d5981"), - new HChaCha20TestVector( - "3b354e4bb69b5b4a1126f509e84cad49f18c9f5f29f0be0c821316a6986e15a6", - "d8a89af02f4b8b2901d8321796388b6c", - "9816cb1a5b61993735a4b161b51ed2265b696e7ded5309c229a5a99f53534fbc"), - new HChaCha20TestVector( - "4b9a818892e15a530db50dd2832e95ee192e5ed6afffb408bd624a0c4e12a081", - "a9079c551de70501be0286d1bc78b045", - "ebc5224cf41ea97473683b6c2f38a084bf6e1feaaeff62676db59d5b719d999b"), - new HChaCha20TestVector( - "c49758f00003714c38f1d4972bde57ee8271f543b91e07ebce56b554eb7fa6a7", - "31f0204e10cf4f2035f9e62bb5ba7303", - 
"0dd8cc400f702d2c06ed920be52048a287076b86480ae273c6d568a2e9e7518c"), - // From https://tools.ietf.org/html/draft-arciszewski-xchacha-01#section-2.2.1. - new HChaCha20TestVector( - "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", - "000000090000004a0000000031415927", - "82413b4227b27bfed30e42508a877d73a0f9e4d58a74a853c12ec41326d3ecdc") - }; - } -} diff --git a/test/NaCl.Core.SimdTests/Vectors/Rfc8439TestVector.cs b/test/NaCl.Core.SimdTests/Vectors/Rfc8439TestVector.cs deleted file mode 100644 index 8bdc0d2..0000000 --- a/test/NaCl.Core.SimdTests/Vectors/Rfc8439TestVector.cs +++ /dev/null @@ -1,160 +0,0 @@ -namespace NaCl.Core.SimdTests.Vectors -{ - using Internal; - - public class Rfc8439TestVector - { - public byte[] Key { get; private set; } - public byte[] PlainText { get; private set; } - public byte[] Nonce { get; private set; } - public byte[] CipherText { get; private set; } - public int InitialCounter { get; private set; } - public byte[] Aad { get; private set; } - public byte[] Tag { get; private set; } - - public string Id { get; private set; } // used to identify the benchmark test - - public Rfc8439TestVector(string key, string plaintext, string nonce, string ciphertext, int initialCounter, string id) - { - Key = CryptoBytes.FromHexString(key); - PlainText = CryptoBytes.FromHexString(plaintext); - Nonce = CryptoBytes.FromHexString(nonce); - CipherText = CryptoBytes.FromHexString(ciphertext); - InitialCounter = initialCounter; - Id = id; - } - - public Rfc8439TestVector(string plaintext, string aad, string key, string nonce, string ciphertext, string tag, string id) - { - PlainText = CryptoBytes.FromHexString(plaintext); - Aad = CryptoBytes.FromHexString(aad); - Key = CryptoBytes.FromHexString(key); - Nonce = CryptoBytes.FromHexString(nonce); - CipherText = CryptoBytes.FromHexString(ciphertext); - Tag = CryptoBytes.FromHexString(tag); - Id = id; - } - - public override string ToString() => Id; - - public static Rfc8439TestVector[] 
Rfc8439TestVectors = - { - // Tests against the test vectors in Section 2.3.2 of RFC 8439. - // https://tools.ietf.org/html/rfc8439#section-2.3.2 - new Rfc8439TestVector( - "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", - "4c616469657320616e642047656e746c656d656e206f662074686520636c617373206f66202739393a20496620" - + "4920636f756c64206f6666657220796f75206f6e6c79206f6e652074697020666f722074686520667574" - + "7572652c2073756e73637265656e20776f756c642062652069742e", - "000000000000004a00000000", - "6e2e359a2568f98041ba0728dd0d6981e97e7aec1d4360c20a27afccfd9fae0bf91b65c5524733ab8f593dabcd" - + "62b3571639d624e65152ab8f530c359f0861d807ca0dbf500d6a6156a38e088a22b65e52bc514d16ccf8" - + "06818ce91ab77937365af90bbf74a35be6b40b8eedf2785e42874d", - 1, "Test Vector #1"), - new Rfc8439TestVector( - "0000000000000000000000000000000000000000000000000000000000000000", - "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000" - + "00000000000000000000000000000000000000", - "000000000000000000000000", - "76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7da41597c5157488d7724e03fb8" - + "d84a376a43b8f41518a11cc387b669b2ee6586", - 0, "Test Vector #2"), - new Rfc8439TestVector( - "0000000000000000000000000000000000000000000000000000000000000001", - "416e79207375626d697373696f6e20746f20746865204945544620696e74656e6465642062792074686520436f" - + "6e7472696275746f7220666f72207075626c69636174696f6e20617320616c6c206f722070617274206f" - + "6620616e204945544620496e7465726e65742d4472616674206f722052464320616e6420616e79207374" - + "6174656d656e74206d6164652077697468696e2074686520636f6e74657874206f6620616e2049455446" - + "20616374697669747920697320636f6e7369646572656420616e20224945544620436f6e747269627574" - + "696f6e222e20537563682073746174656d656e747320696e636c756465206f72616c2073746174656d65" - + "6e747320696e20494554462073657373696f6e732c2061732077656c6c206173207772697474656e2061" - + 
"6e6420656c656374726f6e696320636f6d6d756e69636174696f6e73206d61646520617420616e792074" - + "696d65206f7220706c6163652c207768696368206172652061646472657373656420746f", - "000000000000000000000002", - "a3fbf07df3fa2fde4f376ca23e82737041605d9f4f4f57bd8cff2c1d4b7955ec2a97948bd3722915c8f3d337f7" - + "d370050e9e96d647b7c39f56e031ca5eb6250d4042e02785ececfa4b4bb5e8ead0440e20b6e8db09d881" - + "a7c6132f420e52795042bdfa7773d8a9051447b3291ce1411c680465552aa6c405b7764d5e87bea85ad0" - + "0f8449ed8f72d0d662ab052691ca66424bc86d2df80ea41f43abf937d3259dc4b2d0dfb48a6c9139ddd7" - + "f76966e928e635553ba76c5c879d7b35d49eb2e62b0871cdac638939e25e8a1e0ef9d5280fa8ca328b35" - + "1c3c765989cbcf3daa8b6ccc3aaf9f3979c92b3720fc88dc95ed84a1be059c6499b9fda236e7e818b04b" - + "0bc39c1e876b193bfe5569753f88128cc08aaa9b63d1a16f80ef2554d7189c411f5869ca52c5b83fa36f" - + "f216b9c1d30062bebcfd2dc5bce0911934fda79a86f6e698ced759c3ff9b6477338f3da4f9cd8514ea99" - + "82ccafb341b2384dd902f3d1ab7ac61dd29c6f21ba5b862f3730e37cfdc4fd806c22f221", - 1, "Test Vector #3"), - new Rfc8439TestVector( - "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0", - "2754776173206272696c6c69672c20616e642074686520736c6974687920746f7665730a446964206779726520616e642067696d626c6520696e2074686520776162653a0a416c6c206d696d737920776572652074686520626f726f676f7665732c0a416e6420746865206d6f6d65207261746873206f757467726162652e", - "000000000000000000000002", - "62e6347f95ed87a45ffae7426f27a1df5fb69110044c0d73118effa95b01e5cf166d3df2d721caf9b21e5fb14c616871fd84c54f9d65b283196c7fe4f60553ebf39c6402c42234e32a356b3e764312a61a5532055716ead6962568f87d3f3f7704c6a8d1bcd1bf4d50d6154b6da731b187b58dfd728afa36757a797ac188d1", - 42, "Test Vector #4"), - // Tests against the test vectors in Section 2.6.2 of RFC 8439. 
- // https://tools.ietf.org/html/rfc8439#section-2.6.2 - new Rfc8439TestVector( - "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", - "0000000000000000000000000000000000000000000000000000000000000000", - "000000000001020304050607", - "8ad5a08b905f81cc815040274ab29471a833b637e3fd0da508dbb8e2fdd1a646", - 0, "Test Vector #5"), - new Rfc8439TestVector( - "0000000000000000000000000000000000000000000000000000000000000000", - "0000000000000000000000000000000000000000000000000000000000000000", - "000000000000000000000000", - "76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7", - 0, "Test Vector #6"), - new Rfc8439TestVector( - "0000000000000000000000000000000000000000000000000000000000000001", - "0000000000000000000000000000000000000000000000000000000000000000", - "000000000000000000000002", - "ecfa254f845f647473d3cb140da9e87606cb33066c447b87bc2666dde3fbb739", - 0, "Test Vector #7"), - new Rfc8439TestVector( - "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0", - "0000000000000000000000000000000000000000000000000000000000000000", - "000000000000000000000002", - "965e3bc6f9ec7ed9560808f4d229f94b137ff275ca9b3fcbdd59deaad23310ae", - 0, "Test Vector #8"), - }; - - public static Rfc8439TestVector[] Rfc8439AeadTestVectors = - { - // Section 2.8.2 - // Example and Test Vector for AEAD_CHACHA20_POLY1305 - // https://tools.ietf.org/html/rfc8439#section-2.8.2 - new Rfc8439TestVector( - "4c616469657320616e642047656e746c656d656e206f662074686520636c617373206f66202739393a204966204920636f756c64206f6666657220796f75206f6e6c79206f6e652074697020666f7220746865206675747572652c2073756e73637265656e20776f756c642062652069742e", - "50515253c0c1c2c3c4c5c6c7", - "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", - "07000000" + "4041424344454647", - 
"d31a8d34648e60db7b86afbc53ef7ec2a4aded51296e08fea9e2b5a736ee62d63dbea45e8ca9671282fafb69da92728b1a71de0a9e060b2905d6a5b67ecd3b3692ddbd7f2d778b8c9803aee328091b58fab324e4fad675945585808b4831d7bc3ff4def08e4b7a9de576d26586cec64b6116", - "1ae10b594f09e26a7e902ecbd0600691", "Section 2.8.2"), - // Appendix A.5 - new Rfc8439TestVector( - "496e7465726e65742d4472616674732061726520647261667420646f63756d656e74732076616c696420666f722061206d6178696d756d206f6620736978206d6f6e74687320616e64206d617920626520757064617465642c207265706c616365642c206f72206f62736f6c65746564206279206f7468657220646f63756d656e747320617420616e792074696d652e20497420697320696e617070726f70726961746520746f2075736520496e7465726e65742d447261667473206173207265666572656e6365206d6174657269616c206f7220746f2063697465207468656d206f74686572207468616e206173202fe2809c776f726b20696e2070726f67726573732e2fe2809d", - "f33388860000000000004e91", - "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0", - "000000000102030405060708", - "64a0861575861af460f062c79be643bd5e805cfd345cf389f108670ac76c8cb24c6cfc18755d43eea09ee94e382d26b0bdb7b73c321b0100d4f03b7f355894cf332f830e710b97ce98c8a84abd0b948114ad176e008d33bd60f982b1ff37c8559797a06ef4f0ef61c186324e2b3506383606907b6a7c02b0f9f6157b53c867e4b9166c767b804d46a59b5216cde7a4e99040c5a40433225ee282a1b0a06c523eaf4534d7f83fa1155b0047718cbc546a0d072b04b3564eea1b422273f548271a0bb2316053fa76991955ebd63159434ecebb4e466dae5a1073a6727627097a1049e617d91d361094fa68f0ff77987130305beaba2eda04df997b714d6c6f2c29a6ad5cb4022b02709b", - "eead9d67890cbb22392336fea1851f38", "Appendix A.5") - }; - - public static Rfc8439TestVector[] Rfc7634AeadTestVectors = - { - // Appendix A. 
- new Rfc8439TestVector( - "45000054a6f200004001e778c6336405c000020508005b7a3a080000553bec100007362708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363701020204", - "0102030400000005", - "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", - "a0a1a2a31011121314151617", - "24039428b97f417e3c13753a4f05087b67c352e6a7fab1b982d466ef407ae5c614ee8099d52844eb61aa95dfab4c02f72aa71e7c4c4f64c9befe2facc638e8f3cbec163fac469b502773f6fb94e664da9165b82829f641e0", - "76aaa8266b7fb0f7b11b369907e1ad43", "Appendix A"), - // Appendix B. - new Rfc8439TestVector( - "0000000c000040010000000a00", - "c0c1c2c3c4c5c6c7d0d1d2d3d4d5d6d72e202500000000090000004529000029", - "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", - "a0a1a2a31011121314151617", - "610394701f8d017f7c12924889", - "6b71bfe25236efd7cdc67066906315b2", "Appendix B") - }; - } -} diff --git a/test/NaCl.Core.SimdTests/Vectors/Salsa20TestVector.cs b/test/NaCl.Core.SimdTests/Vectors/Salsa20TestVector.cs deleted file mode 100644 index f89a5da..0000000 --- a/test/NaCl.Core.SimdTests/Vectors/Salsa20TestVector.cs +++ /dev/null @@ -1,35 +0,0 @@ -namespace NaCl.Core.SimdTests.Vectors -{ - using NaCl.Core.Internal; - - public class Salsa20TestVector - { - public Salsa20TestVector(string name, string key, string iv, string block1, string block4, string block5, string block8) - { - Name = name; - Key = CryptoBytes.FromHexString(key); - IV = CryptoBytes.FromHexString(iv); - - ExpectedBlock1 = block1; - ExpectedBlock4 = block4; - ExpectedBlock5 = block5; - ExpectedBlock8 = block8; - } - - public string Name { get; } - - public byte[] Key { get; } - - public byte[] IV { get; } - - public string ExpectedBlock1 { get; } - - public string ExpectedBlock4 { get; } - - public string ExpectedBlock5 { get; } - - public string ExpectedBlock8 { get; } - - public override string ToString() => Name; - } -} \ No newline at end of file From 23722b49a06aadb562b4ab6751d12fa562438cd6 
Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 9 Nov 2022 13:01:52 +0000 Subject: [PATCH 55/59] Code cleanup --- src/NaCl.Core/Base/ChaCha20Base.cs | 2 +- src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs | 2 +- src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs | 2 +- src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs | 10 +++++----- src/NaCl.Core/Base/Salsa20Base.cs | 1 + src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs | 2 +- src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 6037c95..705beab 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -98,7 +98,7 @@ public void HChaCha20(Span subKey, ReadOnlySpan nonce) ShuffleState(state); // Final subkey = state[0..4] || state[12..16] - state.Slice(12, 4).CopyTo(state.Slice(4,4)); + state.Slice(12, 4).CopyTo(state.Slice(4, 4)); ArrayUtils.StoreArray8UInt32LittleEndian(subKey, 0, state); } diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs index 51fd65b..041565d 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha256.cs @@ -2,8 +2,8 @@ namespace NaCl.Core.Base.ChaChaIntrinsics; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; #pragma warning disable IDE0007 // Use implicit type internal static class ChaCha256 diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs index 3cdb418..ff3adc3 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha512.cs @@ -2,8 +2,8 @@ namespace NaCl.Core.Base.ChaChaIntrinsics; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.X86; using 
System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; #pragma warning disable IDE0007 // Use implicit type internal static class ChaCha512 diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index cad266d..39fd759 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -2,8 +2,8 @@ namespace NaCl.Core.Base.ChaChaIntrinsics; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; #pragma warning disable IDE0007 // Use implicit type internal static class ChaCha64 @@ -14,10 +14,10 @@ internal static class ChaCha64 [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong bytes) { - Vector128 x_0 =Sse2.LoadVector128(x); - Vector128 x_1 =Sse2.LoadVector128(x + 4); - Vector128 x_2 =Sse2.LoadVector128(x + 8); - Vector128 x_3 =Sse2.LoadVector128(x + 12); + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); Vector128 orig_0 = x_0; Vector128 orig_1 = x_1; diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 49947f0..08816b8 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -2,6 +2,7 @@ { using System; using System.Runtime.CompilerServices; + using System.Runtime.InteropServices; using System.Security.Cryptography; using Internal; diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs index 8fdbd4a..58a0147 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa256.cs @@ -2,8 +2,8 @@ namespace NaCl.Core.Base.SalsaIntrinsics; using System.Runtime.CompilerServices; -using 
System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; #pragma warning disable IDE0007 // Use implicit type internal static class Salsa256 diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs index 281de50..f1b4d4a 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa512.cs @@ -2,8 +2,8 @@ namespace NaCl.Core.Base.SalsaIntrinsics; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; #pragma warning disable IDE0007 // Use implicit type #pragma warning disable IDE0022 // Use expression body for methods From 3970397c88e845bb2793eef66728d011c121353f Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 9 Nov 2022 13:13:28 +0000 Subject: [PATCH 56/59] Deleted powershell script and updated cake script --- NaCl.Core.sln | 2 +- TestIntrinsics.ps1 | 41 -------------------------------------- build.cake | 49 ++++++++++++++++++++++++++++++---------------- 3 files changed, 33 insertions(+), 59 deletions(-) delete mode 100644 TestIntrinsics.ps1 diff --git a/NaCl.Core.sln b/NaCl.Core.sln index f64e8df..ceaf32d 100644 --- a/NaCl.Core.sln +++ b/NaCl.Core.sln @@ -11,8 +11,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NaCl.Core.Benchmarks", "tes EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{28EF1FB3-A057-4C17-A3B2-B9370B234F81}" ProjectSection(SolutionItems) = preProject + src\NaCl.Core\build.cake = src\NaCl.Core\build.cake CodeCoverage.runsettings = CodeCoverage.runsettings - TestIntrinsics.ps1 = TestIntrinsics.ps1 EndProjectSection EndProject Global diff --git a/TestIntrinsics.ps1 b/TestIntrinsics.ps1 deleted file mode 100644 index 2fded01..0000000 --- a/TestIntrinsics.ps1 +++ /dev/null @@ -1,41 +0,0 @@ -$env:COMPlus_EnableAVX2 = 1 -$env:COMPlus_EnableSSE3 = 1 
-$env:COMPlus_EnableSSE2 = 1 -Write-Host "Test Environment: Normal" -ForegroundColor "Cyan" -dotnet test .\test\NaCl.Core.Tests\NaCl.Core.Tests.csproj -if ($LastExitCode -ne 0) { - Write-Host "Tests failed, aborting build!" -Foreground "Red" - Exit 1 -} - -$env:COMPlus_EnableAVX2 = 0 -$env:COMPlus_EnableSSE3 = 1 -$env:COMPlus_EnableSSE2 = 1 -Write-Host "Test Environment: AVX2 Disabled" -ForegroundColor "Cyan" -dotnet test .\test\NaCl.Core.Tests\NaCl.Core.Tests.csproj -if ($LastExitCode -ne 0) { - Write-Host "Tests failed, aborting build!" -Foreground "Red" - Exit 1 -} - -$env:COMPlus_EnableAVX2 = 0 -$env:COMPlus_EnableSSE3 = 0 -$env:COMPlus_EnableSSE2 = 1 -Write-Host "Test Environment: SSE3 Disabled" -ForegroundColor "Cyan" -dotnet test .\test\NaCl.Core.Tests\NaCl.Core.Tests.csproj -if ($LastExitCode -ne 0) { - Write-Host "Tests failed, aborting build!" -Foreground "Red" - Exit 1 -} - -$env:COMPlus_EnableAVX2 = 0 -$env:COMPlus_EnableSSE3 = 0 -$env:COMPlus_EnableSSE2 = 0 -Write-Host "Test Environment: SSE2 Disabled" -ForegroundColor "Cyan" -dotnet test .\test\NaCl.Core.Tests\NaCl.Core.Tests.csproj -if ($LastExitCode -ne 0) { - Write-Host "Tests failed, aborting build!" -Foreground "Red" - Exit 1 -} - -Write-Host "Tests passed!" 
-ForegroundColor "Green" \ No newline at end of file diff --git a/build.cake b/build.cake index f545f79..38516a9 100644 --- a/build.cake +++ b/build.cake @@ -41,24 +41,39 @@ Task("Test") .DoesForEach(GetFiles("./test/**/*.Tests.csproj"), project => { Information($"Preparing {project.GetFilename()} for test"); - - DotNetTest( - project.ToString(), - new DotNetTestSettings() + var settings = new DotNetTestSettings() + { + Blame = true, + Collectors = new string[] { "XPlat Code Coverage" }, + Configuration = configuration, + Loggers = new string[] { - Blame = true, - Collectors = new string[] { "XPlat Code Coverage" }, - Configuration = configuration, - Loggers = new string[] - { - $"trx;LogFileName={project.GetFilenameWithoutExtension()}.trx", - $"html;LogFileName={project.GetFilenameWithoutExtension()}.html", - }, - NoBuild = true, - NoRestore = true, - ResultsDirectory = $"{artifactsDirectory}/TestResults", - Settings = "CodeCoverage.runsettings" - }); + $"trx;LogFileName={project.GetFilenameWithoutExtension()}.trx", + $"html;LogFileName={project.GetFilenameWithoutExtension()}.html", + }, + NoBuild = true, + NoRestore = true, + ResultsDirectory = $"{artifactsDirectory}/TestResults", + Settings = "CodeCoverage.runsettings" + }; + + Information($"Running default {project.GetFilename()} test"); + DotNetTest(project.ToString(), settings); + + settings.EnvironmentVariables["COMPlus_EnableAVX2"] = "1"; + settings.EnvironmentVariables["COMPlus_EnableSSE3"] = "1"; + Information($"Running AVX2 and SSE3 enabled {project.GetFilename()} test"); + DotNetTest(project.ToString(), settings); + + settings.EnvironmentVariables["COMPlus_EnableAVX2"] = "0"; + settings.EnvironmentVariables["COMPlus_EnableSSE3"] = "1"; + Information($"Running SSE3 enabled and AVX2 disabled {project.GetFilename()} test"); + DotNetTest(project.ToString(), settings); + + settings.EnvironmentVariables["COMPlus_EnableAVX2"] = "0"; + settings.EnvironmentVariables["COMPlus_EnableSSE3"] = "0"; + 
Information($"Running SSE3 and AVX2 disabled {project.GetFilename()} test"); + DotNetTest(project.ToString(), settings); }); Task("CoverageReport") From 4cebc300f67ea78c86c00e104a71c1fc5c31952e Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 9 Nov 2022 14:09:14 +0000 Subject: [PATCH 57/59] Correct visibility, remove unneeded InternalsVisibleTo, minor chnages --- src/NaCl.Core/Base/ChaCha20Base.cs | 4 ++-- src/NaCl.Core/Base/Salsa20Base.cs | 3 +-- src/NaCl.Core/Properties/AssemblyInfo.cs | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 705beab..78ddd17 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -52,7 +52,7 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter // alternating between "column rounds" and "diagonal rounds"; each round consisting of four quarter-rounds. Span workingState = stackalloc uint[BLOCK_SIZE_IN_INTS]; state.CopyTo(workingState); - ShuffleState(state); + ShuffleState(workingState); // At the end of the rounds, add the result to the original state. 
for (var i = 0; i < BLOCK_SIZE_IN_INTS; i++) @@ -247,7 +247,7 @@ protected static void ShuffleState(ref Array16 state) */ [MethodImpl(MethodImplOptions.AggressiveInlining)] - protected internal static void ShuffleState(Span state) + protected static void ShuffleState(Span state) { // 10 loops × 2 rounds/loop = 20 rounds for (var i = 0; i < 10; i++) diff --git a/src/NaCl.Core/Base/Salsa20Base.cs b/src/NaCl.Core/Base/Salsa20Base.cs index 08816b8..03cfe54 100644 --- a/src/NaCl.Core/Base/Salsa20Base.cs +++ b/src/NaCl.Core/Base/Salsa20Base.cs @@ -2,7 +2,6 @@ { using System; using System.Runtime.CompilerServices; - using System.Runtime.InteropServices; using System.Security.Cryptography; using Internal; @@ -141,7 +140,7 @@ public void HSalsa20InitialState(Span state, ReadOnlySpan nonce) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - protected internal static void ShuffleState(Span state) + protected static void ShuffleState(Span state) { // 10 loops × 2 rounds/loop = 20 rounds for (var i = 0; i < 10; i++) diff --git a/src/NaCl.Core/Properties/AssemblyInfo.cs b/src/NaCl.Core/Properties/AssemblyInfo.cs index 36635ef..69cd83c 100644 --- a/src/NaCl.Core/Properties/AssemblyInfo.cs +++ b/src/NaCl.Core/Properties/AssemblyInfo.cs @@ -1,3 +1,2 @@ [assembly: System.Runtime.CompilerServices.InternalsVisibleTo("NaCl.Core.Benchmarks, PublicKey=0024000004800000940000000602000000240000525341310004000001000100ad335fda87151ecc6a84cc4b5b349b1edbb51a19dfa9d0cfedfa2d804a39f9fbc0afbb943a1285e27ec5fce25103d333873deba41dd792e8a89c50134ae409dd01021bf9408b7dcb378b3d87a3bd563882721cbd50097146903e61d601a9fa47fedb19cdac1fb1c7ab280a8dd0cccbf95f4278c44529d1ef6c478fa08036a0af")] -[assembly: System.Runtime.CompilerServices.InternalsVisibleTo("NaCl.Core.Tests, 
PublicKey=0024000004800000940000000602000000240000525341310004000001000100ad335fda87151ecc6a84cc4b5b349b1edbb51a19dfa9d0cfedfa2d804a39f9fbc0afbb943a1285e27ec5fce25103d333873deba41dd792e8a89c50134ae409dd01021bf9408b7dcb378b3d87a3bd563882721cbd50097146903e61d601a9fa47fedb19cdac1fb1c7ab280a8dd0cccbf95f4278c44529d1ef6c478fa08036a0af")] -[assembly: System.Runtime.CompilerServices.InternalsVisibleTo("NaCl.Core.SimdTests, PublicKey=0024000004800000940000000602000000240000525341310004000001000100ad335fda87151ecc6a84cc4b5b349b1edbb51a19dfa9d0cfedfa2d804a39f9fbc0afbb943a1285e27ec5fce25103d333873deba41dd792e8a89c50134ae409dd01021bf9408b7dcb378b3d87a3bd563882721cbd50097146903e61d601a9fa47fedb19cdac1fb1c7ab280a8dd0cccbf95f4278c44529d1ef6c478fa08036a0af")] \ No newline at end of file +[assembly: System.Runtime.CompilerServices.InternalsVisibleTo("NaCl.Core.Tests, PublicKey=0024000004800000940000000602000000240000525341310004000001000100ad335fda87151ecc6a84cc4b5b349b1edbb51a19dfa9d0cfedfa2d804a39f9fbc0afbb943a1285e27ec5fce25103d333873deba41dd792e8a89c50134ae409dd01021bf9408b7dcb378b3d87a3bd563882721cbd50097146903e61d601a9fa47fedb19cdac1fb1c7ab280a8dd0cccbf95f4278c44529d1ef6c478fa08036a0af")] \ No newline at end of file From dd6178ca2ba98631543f4ed1a96d12fbdba23184 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Wed, 9 Nov 2022 14:44:06 +0000 Subject: [PATCH 58/59] Add Salsa64 SSE41 shuffle optimisation, updated guards/checks and added more comments. 
--- .../Base/ChaChaIntrinsics/ChaCha64.cs | 23 +++++--- src/NaCl.Core/Base/Salsa20Base.cs | 4 +- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 3 +- src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 54 +++++++++++-------- 4 files changed, 52 insertions(+), 32 deletions(-) diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index 39fd759..340cd5e 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -26,15 +26,18 @@ public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and shuffled state. x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); + // Xor the key stream and message to obtain the cipher. x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); + Sse2.Store(c, x_0.AsByte()); Sse2.Store(c + 16, x_1.AsByte()); Sse2.Store(c + 32, x_2.AsByte()); @@ -71,24 +74,28 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and shuffled state. x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - byte* partialblock = stackalloc byte[64]; - Sse2.Store(partialblock, Vector128.AsByte(x_0)); - Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); - Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); - Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + // Load the shuffled state into a temporary span. 
+ byte* partialBlock = stackalloc byte[64]; + Sse2.Store(partialBlock, Vector128.AsByte(x_0)); + Sse2.Store(partialBlock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialBlock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialBlock + 48, Vector128.AsByte(x_3)); + // TODO use vector + // Xor the key stream and message to obtain the cipher. for (ulong i = 0; i nonce, Span o public void HSalsa20(Span subKey, ReadOnlySpan nonce) { // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. Specification - Definition of HSalsa20 - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; // Setting HSalsa20 initial state HSalsa20InitialState(state, nonce); - #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse2.IsSupported && BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse41.IsSupported && BitConverter.IsLittleEndian) { Salsa20BaseIntrinsics.HSalsa20(state, subKey); return; diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 423f502..4555233 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -41,7 +41,8 @@ public static unsafe void Salsa20(Span state, ReadOnlySpan input, Sp [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) { - ValidateDeviceSupport(); + if (!Sse41.IsSupported || !BitConverter.IsLittleEndian) + throw new NotSupportedException($"{nameof(Sse41)} vectorisation is not supported on this device."); fixed (uint* x = state) fixed (byte* sk = subKey) diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index e4223fc..b84e494 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -23,6 +23,7 @@ public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and 
shuffled state. x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); @@ -70,25 +71,28 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and shuffled state. x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - byte* partialblock = stackalloc byte[64]; - Sse2.Store(partialblock, Vector128.AsByte(x_0)); - Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); - Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); - Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + // Load the shuffled state into a temporary span. + byte* partialBlock = stackalloc byte[64]; + Sse2.Store(partialBlock, Vector128.AsByte(x_0)); + Sse2.Store(partialBlock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialBlock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialBlock + 48, Vector128.AsByte(x_3)); // TODO use vector + // Xor the key stream and message to obtain the cipher. for (ulong i = 0; i < bytes; i++) { - c[i] = (byte)(m[i] ^ partialblock[i]); + c[i] = (byte)(m[i] ^ partialBlock[i]); } for (int n = 0; n < 64 / sizeof(int); n++) { - ((int*)partialblock)[n] = 0; + ((int*)partialBlock)[n] = 0; } } @@ -102,7 +106,7 @@ public static unsafe void HSalsa20(uint* x, byte* sk) ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); - // HSalsa returns a 32 byte array of 0,5,10,15,6,7,8,9 + // HSalsa returns a 32 byte array of index 0,5,10,15,6,7,8,9 // <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15> var t_0 = GetDiagonal(x_0, x_1, x_2, x_3); @@ -129,6 +133,7 @@ public static unsafe void KeyStream64(uint* x, byte* c) ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and shuffled state. 
x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); @@ -143,14 +148,15 @@ public static unsafe void KeyStream64(uint* x, byte* c) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 x_1, ref Vector128 x_2, ref Vector128 x_3) { - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - // Diagonalize - x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); + var u0 = Sse41.Blend(x_0.AsUInt16(), x_2.AsUInt16(), 0xF0); // 0, 1,10,11 + var u1 = Sse41.Blend(x_1.AsUInt16(), x_3.AsUInt16(), 0xC3); // 12, 5, 6,15 + var u2 = Sse41.Blend(x_0.AsUInt16(), x_2.AsUInt16(), 0x0F); // 8, 9, 2, 3 + var u3 = Sse41.Blend(x_1.AsUInt16(), x_3.AsUInt16(), 0x3C); // 4,13,14, 7 - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + x_0 = Sse41.Blend(u0, u1, 0xCC).AsUInt32(); + x_3 = Sse41.Blend(u0, u1, 0b110011).AsUInt32(); + x_2 = Sse41.Blend(u2, u3, 0xCC).AsUInt32(); + x_1 = Sse41.Blend(u2, u3, 0b00110011).AsUInt32(); for (int i = 0; i < 20; i += 2) { @@ -173,14 +179,20 @@ private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 Date: Wed, 9 Nov 2022 14:44:06 +0000 Subject: [PATCH 59/59] Add Salsa64 SSE41 shuffle optimisation, updated guards/checks, changed the cake file and added more comments. 
--- build.cake | 9 ++-- .../Base/ChaChaIntrinsics/ChaCha64.cs | 23 +++++--- src/NaCl.Core/Base/Salsa20Base.cs | 4 +- src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs | 3 +- src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs | 54 +++++++++++-------- 5 files changed, 55 insertions(+), 38 deletions(-) diff --git a/build.cake b/build.cake index 38516a9..295853e 100644 --- a/build.cake +++ b/build.cake @@ -57,22 +57,19 @@ Task("Test") Settings = "CodeCoverage.runsettings" }; - Information($"Running default {project.GetFilename()} test"); - DotNetTest(project.ToString(), settings); - settings.EnvironmentVariables["COMPlus_EnableAVX2"] = "1"; settings.EnvironmentVariables["COMPlus_EnableSSE3"] = "1"; - Information($"Running AVX2 and SSE3 enabled {project.GetFilename()} test"); + Information($"Running default {project.GetFilename()} test with SSE3 and AVX2 enabled"); DotNetTest(project.ToString(), settings); settings.EnvironmentVariables["COMPlus_EnableAVX2"] = "0"; settings.EnvironmentVariables["COMPlus_EnableSSE3"] = "1"; - Information($"Running SSE3 enabled and AVX2 disabled {project.GetFilename()} test"); + Information($"Running {project.GetFilename()} test with SSE3 enabled and AVX2 disabled"); DotNetTest(project.ToString(), settings); settings.EnvironmentVariables["COMPlus_EnableAVX2"] = "0"; settings.EnvironmentVariables["COMPlus_EnableSSE3"] = "0"; - Information($"Running SSE3 and AVX2 disabled {project.GetFilename()} test"); + Information($"Running {project.GetFilename()} test with SSE3 and AVX2 disabled"); DotNetTest(project.ToString(), settings); }); diff --git a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs index 39fd759..340cd5e 100644 --- a/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs +++ b/src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs @@ -26,15 +26,18 @@ public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and 
shuffled state. x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); + // Xor the key stream and message to obtain the cipher. x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); + Sse2.Store(c, x_0.AsByte()); Sse2.Store(c + 16, x_1.AsByte()); Sse2.Store(c + 32, x_2.AsByte()); @@ -71,24 +74,28 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and shuffled state. x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - byte* partialblock = stackalloc byte[64]; - Sse2.Store(partialblock, Vector128.AsByte(x_0)); - Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); - Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); - Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + // Load the shuffled state into a temporary span. + byte* partialBlock = stackalloc byte[64]; + Sse2.Store(partialBlock, Vector128.AsByte(x_0)); + Sse2.Store(partialBlock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialBlock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialBlock + 48, Vector128.AsByte(x_3)); + // TODO use vector + // Xor the key stream and message to obtain the cipher. for (ulong i = 0; i nonce, Span o public void HSalsa20(Span subKey, ReadOnlySpan nonce) { // See: http://cr.yp.to/snuffle/xsalsa-20081128.pdf under 2. 
Specification - Definition of HSalsa20 - Span state = stackalloc uint[BLOCK_SIZE_IN_BYTES]; // Setting HSalsa20 initial state HSalsa20InitialState(state, nonce); - #if INTRINSICS - if (System.Runtime.Intrinsics.X86.Sse2.IsSupported && BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse41.IsSupported && BitConverter.IsLittleEndian) { Salsa20BaseIntrinsics.HSalsa20(state, subKey); return; diff --git a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs index 423f502..4555233 100644 --- a/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs +++ b/src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs @@ -41,7 +41,8 @@ public static unsafe void Salsa20(Span state, ReadOnlySpan input, Sp [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void HSalsa20(ReadOnlySpan state, Span subKey) { - ValidateDeviceSupport(); + if (!Sse41.IsSupported || !BitConverter.IsLittleEndian) + throw new NotSupportedException($"{nameof(Sse41)} vectorisation is not supported on this device."); fixed (uint* x = state) fixed (byte* sk = subKey) diff --git a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs index e4223fc..b84e494 100644 --- a/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs +++ b/src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs @@ -23,6 +23,7 @@ public static unsafe void Process64(uint* x, ref byte* m, ref byte* c, ref ulong ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and shuffled state. x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); @@ -70,25 +71,28 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and shuffled state. 
x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); x_3 = Sse2.Add(x_3, orig_3); - byte* partialblock = stackalloc byte[64]; - Sse2.Store(partialblock, Vector128.AsByte(x_0)); - Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); - Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); - Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + // Load the shuffled state into a temporary span. + byte* partialBlock = stackalloc byte[64]; + Sse2.Store(partialBlock, Vector128.AsByte(x_0)); + Sse2.Store(partialBlock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialBlock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialBlock + 48, Vector128.AsByte(x_3)); // TODO use vector + // Xor the key stream and message to obtain the cipher. for (ulong i = 0; i < bytes; i++) { - c[i] = (byte)(m[i] ^ partialblock[i]); + c[i] = (byte)(m[i] ^ partialBlock[i]); } for (int n = 0; n < 64 / sizeof(int); n++) { - ((int*)partialblock)[n] = 0; + ((int*)partialBlock)[n] = 0; } } @@ -102,7 +106,7 @@ public static unsafe void HSalsa20(uint* x, byte* sk) ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); - // HSalsa returns a 32 byte array of 0,5,10,15,6,7,8,9 + // HSalsa returns a 32 byte array of index 0,5,10,15,6,7,8,9 // <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15> var t_0 = GetDiagonal(x_0, x_1, x_2, x_3); @@ -129,6 +133,7 @@ public static unsafe void KeyStream64(uint* x, byte* c) ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3); + // Add the orginal and shuffled state. 
x_0 = Sse2.Add(x_0, orig_0); x_1 = Sse2.Add(x_1, orig_1); x_2 = Sse2.Add(x_2, orig_2); @@ -143,14 +148,15 @@ public static unsafe void KeyStream64(uint* x, byte* c) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128 x_1, ref Vector128 x_2, ref Vector128 x_3) { - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); - - // Diagonalize - x_1 = Sse2.Shuffle(x_1, 0b_00_11_10_01); - x_2 = Sse2.Shuffle(x_2, 0b_01_00_11_10); - x_3 = Sse2.Shuffle(x_3, 0b_10_01_00_11); + var u0 = Sse41.Blend(x_0.AsUInt16(), x_2.AsUInt16(), 0xF0); // 0, 1,10,11 + var u1 = Sse41.Blend(x_1.AsUInt16(), x_3.AsUInt16(), 0xC3); // 12, 5, 6,15 + var u2 = Sse41.Blend(x_0.AsUInt16(), x_2.AsUInt16(), 0x0F); // 8, 9, 2, 3 + var u3 = Sse41.Blend(x_1.AsUInt16(), x_3.AsUInt16(), 0x3C); // 4,13,14, 7 - Transpose(ref x_0, ref x_1, ref x_2, ref x_3); + x_0 = Sse41.Blend(u0, u1, 0xCC).AsUInt32(); + x_3 = Sse41.Blend(u0, u1, 0b110011).AsUInt32(); + x_2 = Sse41.Blend(u2, u3, 0xCC).AsUInt32(); + x_1 = Sse41.Blend(u2, u3, 0b00110011).AsUInt32(); for (int i = 0; i < 20; i += 2) { @@ -173,14 +179,20 @@ private static unsafe void ShuffleState(ref Vector128 x_0, ref Vector128