From 0fd4ea116f025f5f30baecf17a6cecc48e51804c Mon Sep 17 00:00:00 2001 From: macaba Date: Fri, 28 Aug 2020 21:38:35 +0100 Subject: [PATCH 1/3] Performance optimisations - Added Intrinsics fast path - Added Thread Static cache for GetMacDataRfc8439 - Added ability for benchmarks to run in .NET48 (for a pre/post intrinsics comparison) --- src/NaCl.Core/Base/ChaCha20Base.cs | 13 + src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs | 570 ++++++++++++++++++ src/NaCl.Core/Base/Snuffle.cs | 10 +- src/NaCl.Core/Base/SnufflePoly1305.cs | 15 +- src/NaCl.Core/NaCl.Core.csproj | 11 +- .../ChaCha20Poly1305Benchmark.cs | 8 +- .../NaCl.Core.Benchmarks.csproj | 4 +- test/NaCl.Core.Benchmarks/Program.cs | 13 +- .../RandomNumberGenerator.cs | 17 + test/NaCl.Core.Benchmarks/Run benchmarks.bat | 1 + test/NaCl.Core.Tests/NaCl.Core.Tests.csproj | 3 +- test/NaCl.Core.Tests/RandomNumberGenerator.cs | 17 + 12 files changed, 659 insertions(+), 23 deletions(-) create mode 100644 src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs create mode 100644 test/NaCl.Core.Benchmarks/RandomNumberGenerator.cs create mode 100644 test/NaCl.Core.Benchmarks/Run benchmarks.bat create mode 100644 test/NaCl.Core.Tests/RandomNumberGenerator.cs diff --git a/src/NaCl.Core/Base/ChaCha20Base.cs b/src/NaCl.Core/Base/ChaCha20Base.cs index 57ac702..d895cbc 100644 --- a/src/NaCl.Core/Base/ChaCha20Base.cs +++ b/src/NaCl.Core/Base/ChaCha20Base.cs @@ -64,6 +64,19 @@ public override void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter ArrayUtils.StoreArray16UInt32LittleEndian(block, 0, state); } +#if INTRINSICS + public override unsafe void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0) + { + Span state = stackalloc uint[BLOCK_SIZE_IN_INTS]; + SetInitialState(state, nonce, initialCounter); + fixed (uint* x = state) + fixed (byte* m = input, c = output.Slice(offset)) + { + ChaCha20BaseIntrinsics.ChaCha20(x, m, c, (ulong)input.Length); + } + } +#endif + /// /// Process a 
pseudorandom keystream block, converting the key and part of the into a , and the remainder of the . /// diff --git a/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs new file mode 100644 index 0000000..16e34e8 --- /dev/null +++ b/src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs @@ -0,0 +1,570 @@ +#if INTRINSICS +#pragma warning disable IDE0007 // Use implicit type +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace NaCl.Core.Base +{ + public static class ChaCha20BaseIntrinsics + { + private static Vector128 rot8_128 = Vector128.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static Vector128 rot16_128 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + private static Vector256 rot8_256 = Vector256.Create((byte)3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14); + private static Vector256 rot16_256 = Vector256.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes) + { + if (Avx2.IsSupported && bytes >= 512) + { + Vector256 x_0 = Vector256.Create(x[0]); + Vector256 x_1 = Vector256.Create(x[1]); + Vector256 x_2 = Vector256.Create(x[2]); + Vector256 x_3 = Vector256.Create(x[3]); + Vector256 x_4 = Vector256.Create(x[4]); + Vector256 x_5 = Vector256.Create(x[5]); + Vector256 x_6 = Vector256.Create(x[6]); + Vector256 x_7 = Vector256.Create(x[7]); + Vector256 x_8 = Vector256.Create(x[8]); + Vector256 x_9 = Vector256.Create(x[9]); + Vector256 x_10 = Vector256.Create(x[10]); + Vector256 x_11 = Vector256.Create(x[11]); + Vector256 x_12; + Vector256 x_13; + Vector256 x_14 = Vector256.Create(x[14]); + Vector256 x_15 = Vector256.Create(x[15]); + + Vector256 
orig0 = x_0; + Vector256 orig1 = x_1; + Vector256 orig2 = x_2; + Vector256 orig3 = x_3; + Vector256 orig4 = x_4; + Vector256 orig5 = x_5; + Vector256 orig6 = x_6; + Vector256 orig7 = x_7; + Vector256 orig8 = x_8; + Vector256 orig9 = x_9; + Vector256 orig10 = x_10; + Vector256 orig11 = x_11; + Vector256 orig12; + Vector256 orig13; + Vector256 orig14 = x_14; + Vector256 orig15 = x_15; + + while (bytes >= 512) + { + Vector256 addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32(); + Vector256 addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32(); + Vector256 permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32(); + Vector256 t12, t13; + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + uint in12 = x[12]; + uint in13 = x[13]; + ulong in1213 = in12 | ((ulong)in13 << 32); + x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32(); + t12 = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32(); + t13 = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32(); + x_12 = Avx2.UnpackLow(t12, t13); + x_13 = Avx2.UnpackHigh(t12, t13); + t12 = Avx2.UnpackLow(x_12, x_13); + t13 = Avx2.UnpackHigh(x_12, x_13); + x_12 = Avx2.PermuteVar8x32(t12, permute); + x_13 = Avx2.PermuteVar8x32(t13, permute); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 8; + + x[12] = (uint)(in1213 & 0xFFFFFFFF); + x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF); + for (int i = 0; i < 20; i += 2) + { + Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15); + Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14); + } + + Vector256 t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, 
t_11, t_12, t_13, t_14, t_15; + t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0); + // ONEOCTO enter + OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3); + OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); + t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); + t_4 = Avx2.Permute2x128(x_0, x_4, 0x31); + t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); + t_5 = Avx2.Permute2x128(x_1, x_5, 0x31); + t_2 = Avx2.Permute2x128(x_2, x_6, 0x20); + t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); + t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); + t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); + t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); + t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); + t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); + t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32()); + t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32()); + t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); + t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); + t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_0.AsByte()); + Avx.Store(c + 64, t_1.AsByte()); + Avx.Store(c + 128, t_2.AsByte()); + Avx.Store(c + 192, t_3.AsByte()); + Avx.Store(c + 256, t_4.AsByte()); + Avx.Store(c + 320, t_5.AsByte()); + Avx.Store(c + 384, t_6.AsByte()); + Avx.Store(c + 448, t_7.AsByte()); + // ONEOCTO exit + + m += 32; + c += 32; + + // ONEOCTO enter + OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11); + OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15); + t_8 = Avx2.Permute2x128(x_8, x_12, 0x20); + t_12 = Avx2.Permute2x128(x_8, x_12, 0x31); + 
t_9 = Avx2.Permute2x128(x_9, x_13, 0x20); + t_13 = Avx2.Permute2x128(x_9, x_13, 0x31); + t_10 = Avx2.Permute2x128(x_10, x_14, 0x20); + t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); + t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); + t_15 = Avx2.Permute2x128(x_11, x_15, 0x31); + t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); + t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); + t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); + t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32()); + t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32()); + t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); + t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); + t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); + Avx.Store(c, t_8.AsByte()); + Avx.Store(c + 64, t_9.AsByte()); + Avx.Store(c + 128, t_10.AsByte()); + Avx.Store(c + 192, t_11.AsByte()); + Avx.Store(c + 256, t_12.AsByte()); + Avx.Store(c + 320, t_13.AsByte()); + Avx.Store(c + 384, t_14.AsByte()); + Avx.Store(c + 448, t_15.AsByte()); + // ONEOCTO exit + m -= 32; + c -= 32; + bytes -= 512; + c += 512; + m += 512; + } + } + if (bytes >= 256) + { + Vector128 x_0 = Vector128.Create(x[0]); + Vector128 x_1 = Vector128.Create(x[1]); + Vector128 x_2 = Vector128.Create(x[2]); + Vector128 x_3 = Vector128.Create(x[3]); + Vector128 x_4 = Vector128.Create(x[4]); + Vector128 x_5 = Vector128.Create(x[5]); + Vector128 x_6 = Vector128.Create(x[6]); + Vector128 x_7 = Vector128.Create(x[7]); + Vector128 x_8 = Vector128.Create(x[8]); + Vector128 x_9 = Vector128.Create(x[9]); + Vector128 x_10 = Vector128.Create(x[10]); + Vector128 x_11 = Vector128.Create(x[11]); + Vector128 x_12; + Vector128 x_13; + Vector128 x_14 = Vector128.Create(x[14]); + Vector128 x_15 = Vector128.Create(x[15]); + Vector128 orig0 = x_0; + Vector128 orig1 = x_1; + Vector128 orig2 = x_2; + Vector128 orig3 = x_3; + Vector128 orig4 = x_4; + Vector128 orig5 = x_5; + Vector128 orig6 = x_6; + Vector128 orig7 = 
x_7; + Vector128 orig8 = x_8; + Vector128 orig9 = x_9; + Vector128 orig10 = x_10; + Vector128 orig11 = x_11; + Vector128 orig12; + Vector128 orig13; + Vector128 orig14 = x_14; + Vector128 orig15 = x_15; + Vector128 t12, t13; + + while (bytes >= 256) + { + Vector128 addv12 = Vector128.Create(0, 1).AsUInt32(); + Vector128 addv13 = Vector128.Create(2, 3).AsUInt32(); + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + uint in12 = x[12]; + uint in13 = x[13]; + ulong in1213 = in12 | ((ulong)in13) << 32; + t12 = Vector128.Create(in1213).AsUInt32(); + t13 = Vector128.Create(in1213).AsUInt32(); + + x_12 = Sse2.Add(Vector128.AsUInt64(addv12), Vector128.AsUInt64(t12)).AsUInt32(); + x_13 = Sse2.Add(Vector128.AsUInt64(addv13), Vector128.AsUInt64(t13)).AsUInt32(); + + t12 = Sse2.UnpackLow(x_12, x_13); + t13 = Sse2.UnpackHigh(x_12, x_13); + + x_12 = Sse2.UnpackLow(t12, t13); + x_13 = Sse2.UnpackHigh(t12, t13); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 4; + + x[12] = (uint)(in1213 & 0xFFFFFFFF); + x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF); + + for (int i = 0; i < 20; i += 2) + { + Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12); + Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13); + Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14); + Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15); + Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15); + Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12); + Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13); + Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14); + } + OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c); + m += 16; + c += 16; + OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c); + m += 16; + c += 16; + OneQuad(ref x_8, 
ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c); + m += 16; + c += 16; + OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c); + m -= 48; + c -= 48; + bytes -= 256; + c += 256; + m += 256; + } + } + while (bytes >= 64) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + Vector128 t_1; + + for (int i = 0; i < 20; i += 2) + { + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 147); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 78); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 57); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 57); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 78); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 147); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + } + x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); + x_1 = Sse2.Add(x_1, 
Sse2.LoadVector128(x + 4)); + x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); + x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); + x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); + x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); + x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); + Sse2.Store(c, x_0.AsByte()); + Sse2.Store(c + 16, x_1.AsByte()); + Sse2.Store(c + 32, x_2.AsByte()); + Sse2.Store(c + 48, x_3.AsByte()); + + uint in12 = x[12]; + uint in13 = x[13]; + in12++; + if (in12 == 0) + { + in13++; + } + x[12] = in12; + x[13] = in13; + + bytes -= 64; + c += 64; + m += 64; + } + if (bytes > 0) + { + Vector128 x_0 = Sse2.LoadVector128(x); + Vector128 x_1 = Sse2.LoadVector128(x + 4); + Vector128 x_2 = Sse2.LoadVector128(x + 8); + Vector128 x_3 = Sse2.LoadVector128(x + 12); + Vector128 t_1; + for (int i = 0; i < 20; i += 2) + { + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 0x93); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 0x4e); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 0x39); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = Sse2.Xor(x_3, x_0); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_1 = Sse2.Xor(x_1, x_2); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 12); + t_1 = Sse2.ShiftRightLogical(t_1, 20); + x_1 = Sse2.Xor(x_1, t_1); + + x_0 = Sse2.Add(x_0, x_1); + x_3 = 
Sse2.Xor(x_3, x_0); + x_0 = Sse2.Shuffle(x_0, 0x39); + x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); + + x_2 = Sse2.Add(x_2, x_3); + x_3 = Sse2.Shuffle(x_3, 0x4e); + x_1 = Sse2.Xor(x_1, x_2); + x_2 = Sse2.Shuffle(x_2, 0x93); + + t_1 = x_1; + x_1 = Sse2.ShiftLeftLogical(x_1, 7); + t_1 = Sse2.ShiftRightLogical(t_1, 25); + x_1 = Sse2.Xor(x_1, t_1); + } + x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); + x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); + x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); + x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); + byte* partialblock = stackalloc byte[64]; + Sse2.Store(partialblock, Vector128.AsByte(x_0)); + Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); + Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); + Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); + + for (ulong i = 0; i < bytes; i++) + { + c[i] = (byte)(m[i] ^ partialblock[i]); + } + for (int n = 0; n < 64 / sizeof(int); n++) + { + ((int*)partialblock)[n] = 0; + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void OneQuad(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D, ref Vector128 origA, ref Vector128 origB, ref Vector128 origC, ref Vector128 origD, byte* m, byte* c) + { + Vector128 t_A, t_B, t_C, t_D, t0, t1, t2, t3; + x_A = Sse2.Add(x_A, origA); + x_B = Sse2.Add(x_B, origB); + x_C = Sse2.Add(x_C, origC); + x_D = Sse2.Add(x_D, origD); + t_A = Sse2.UnpackLow(x_A, x_B); + t_B = Sse2.UnpackLow(x_C, x_D); + t_C = Sse2.UnpackHigh(x_A, x_B); + t_D = Sse2.UnpackHigh(x_C, x_D); + x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); + Sse2.Store(c, t0.AsByte()); + t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 
64)).AsUInt32(); + Sse2.Store(c + 64, t1.AsByte()); + t2 = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128)).AsUInt32(); + Sse2.Store(c + 128, t2.AsByte()); + t3 = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192)).AsUInt32(); + Sse2.Store(c + 192, t3.AsByte()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec128QuarterRound(ref Vector128 x_A, ref Vector128 x_B, ref Vector128 x_C, ref Vector128 x_D) + { + Vector128 t_A, t_C; + x_A = Sse2.Add(x_A, x_B); + t_A = Sse2.Xor(x_D, x_A); + x_D = Ssse3.Shuffle(t_A.AsByte(), rot16_128).AsUInt32(); + x_C = Sse2.Add(x_C, x_D); + t_C = Sse2.Xor(x_B, x_C); + x_B = Sse2.Or(Sse2.ShiftLeftLogical(t_C, 12), Sse2.ShiftRightLogical(t_C, 20)); + x_A = Sse2.Add(x_A, x_B); + t_A = Sse2.Xor(x_D, x_A); + x_D = Ssse3.Shuffle(t_A.AsByte(), rot8_128).AsUInt32(); + x_C = Sse2.Add(x_C, x_D); + t_C = Sse2.Xor(x_B, x_C); + x_B = Sse2.Or(Sse2.ShiftLeftLogical(t_C, 7), Sse2.ShiftRightLogical(t_C, 25)); + } + + // 512 byte methods + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 Vector256Rotate(Vector256 a, byte imm) => Avx2.Or(Avx2.ShiftLeftLogical(a, imm), Avx2.ShiftRightLogical(a, (byte)(32 - imm))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vec256Round(ref Vector256 A1, ref Vector256 B1, ref Vector256 C1, ref Vector256 D1, ref Vector256 A2, ref Vector256 B2, ref Vector256 C2, ref Vector256 D2, ref Vector256 A3, ref Vector256 B3, ref Vector256 C3, ref Vector256 D3, ref Vector256 A4, ref Vector256 B4, ref Vector256 C4, ref Vector256 D4) + { + Vector256Line1(ref A1, ref B1, ref C1, ref D1); + Vector256Line1(ref A2, ref B2, ref C2, ref D2); + Vector256Line1(ref A3, ref B3, ref C3, ref D3); + Vector256Line1(ref A4, ref B4, ref C4, ref D4); + Vector256Line2(ref A1, ref B1, ref C1, ref D1); + Vector256Line2(ref A2, ref B2, ref C2, ref D2); + Vector256Line2(ref A3, ref B3, ref C3, ref D3); + Vector256Line2(ref A4, ref B4, ref C4, ref D4); + 
Vector256Line3(ref A1, ref B1, ref C1, ref D1); + Vector256Line3(ref A2, ref B2, ref C2, ref D2); + Vector256Line3(ref A3, ref B3, ref C3, ref D3); + Vector256Line3(ref A4, ref B4, ref C4, ref D4); + Vector256Line4(ref A1, ref B1, ref C1, ref D1); + Vector256Line4(ref A2, ref B2, ref C2, ref D2); + Vector256Line4(ref A3, ref B3, ref C3, ref D3); + Vector256Line4(ref A4, ref B4, ref C4, ref D4); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line1(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_A = Avx2.Add(x_A, x_B); + x_D = Avx2.Shuffle(Avx2.Xor(x_D, x_A).AsByte(), rot16_256).AsUInt32(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line2(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_C = Avx2.Add(x_C, x_D); + x_B = Vector256Rotate(Avx2.Xor(x_B, x_C), 12); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line3(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_A = Avx2.Add(x_A, x_B); + x_D = Avx2.Shuffle(Avx2.Xor(x_D, x_A).AsByte(), rot8_256).AsUInt32(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Vector256Line4(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D) + { + x_C = Avx2.Add(x_C, x_D); + x_B = Vector256Rotate(Avx2.Xor(x_B, x_C), 7); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void OneQuadUnpack(ref Vector256 x_A, ref Vector256 x_B, ref Vector256 x_C, ref Vector256 x_D, ref Vector256 t_A, ref Vector256 t_B, ref Vector256 t_C, ref Vector256 t_D, ref Vector256 orig_A, ref Vector256 orig_B, ref Vector256 orig_C, ref Vector256 orig_D) + { + x_A = Avx2.Add(x_A, orig_A); + x_B = Avx2.Add(x_B, orig_B); + x_C = Avx2.Add(x_C, orig_C); + x_D = Avx2.Add(x_D, orig_D); + t_A = Avx2.UnpackLow(x_A, x_B); + t_B = Avx2.UnpackLow(x_C, x_D); + t_C = 
Avx2.UnpackHigh(x_A, x_B); + t_D = Avx2.UnpackHigh(x_C, x_D); + x_A = Avx2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_B = Avx2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32(); + x_C = Avx2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + x_D = Avx2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32(); + } + // End of 512 byte methods + } +} +#pragma warning restore IDE0007 // Use implicit type +#endif \ No newline at end of file diff --git a/src/NaCl.Core/Base/Snuffle.cs b/src/NaCl.Core/Base/Snuffle.cs index d237860..bc25dae 100644 --- a/src/NaCl.Core/Base/Snuffle.cs +++ b/src/NaCl.Core/Base/Snuffle.cs @@ -59,6 +59,10 @@ public Snuffle(ReadOnlyMemory key, int initialCounter) /// ByteBuffer. public abstract void ProcessKeyStreamBlock(ReadOnlySpan nonce, int counter, Span block); +#if INTRINSICS + public abstract void ProcessStream(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int initialCounter, int offset = 0); +#endif + /// /// The size of the randomly generated nonces. /// ChaCha20 uses 12-byte nonces, but XSalsa20 and XChaCha20 use 24-byte nonces. @@ -87,7 +91,7 @@ public virtual byte[] Encrypt(ReadOnlySpan plaintext) var ciphertext = new byte[plaintext.Length + NonceSizeInBytes]; -#if NETCOREAPP3_1 +#if SPANSTACKALLOC Span nonce = stackalloc byte[NonceSizeInBytes]; RandomNumberGenerator.Fill(nonce); @@ -202,6 +206,9 @@ public virtual void Decrypt(ReadOnlySpan ciphertext, ReadOnlySpan no /// The output's starting offset. 
private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan input, int offset = 0) { +#if INTRINSICS + ProcessStream(nonce, output, input, InitialCounter, offset); +#else var length = input.Length; var numBlocks = (length / BLOCK_SIZE_IN_BYTES) + 1; @@ -236,6 +243,7 @@ private void Process(ReadOnlySpan nonce, Span output, ReadOnlySpan diff --git a/src/NaCl.Core/Base/SnufflePoly1305.cs b/src/NaCl.Core/Base/SnufflePoly1305.cs index e497873..d0128cf 100644 --- a/src/NaCl.Core/Base/SnufflePoly1305.cs +++ b/src/NaCl.Core/Base/SnufflePoly1305.cs @@ -291,18 +291,23 @@ private Span GetMacKey(ReadOnlySpan nonce) /// The associated data. /// The ciphertext. /// System.Byte[]. - private byte[] GetMacDataRfc8439(ReadOnlySpan aad, ReadOnlySpan ciphertext) + [ThreadStatic] + private static byte[] macDataBytes = new byte[0]; + private ReadOnlySpan GetMacDataRfc8439(ReadOnlySpan aad, ReadOnlySpan ciphertext) { var aadPaddedLen = (aad.Length % 16 == 0) ? aad.Length : (aad.Length + 16 - aad.Length % 16); var ciphertextLen = ciphertext.Length; var ciphertextPaddedLen = (ciphertextLen % 16 == 0) ? 
ciphertextLen : (ciphertextLen + 16 - ciphertextLen % 16); + var macDataLength = aadPaddedLen + ciphertextPaddedLen + 16; - var macData = new byte[aadPaddedLen + ciphertextPaddedLen + 16]; + if (macDataBytes.Length < macDataLength) + macDataBytes = new byte[macDataLength]; + + Span macData = macDataBytes; // Mac Text - //aad.CopyTo(macData); - Array.Copy(aad.ToArray(), macData, aad.Length); - Array.Copy(ciphertext.ToArray(), 0, macData, aadPaddedLen, ciphertextLen); + aad.CopyTo(macData); + ciphertext.CopyTo(macData.Slice(aadPaddedLen, ciphertextLen)); // Mac Length //macData[aadPaddedLen + ciphertextPaddedLen] = (byte)aad.Length; diff --git a/src/NaCl.Core/NaCl.Core.csproj b/src/NaCl.Core/NaCl.Core.csproj index eb39249..e9d2da5 100644 --- a/src/NaCl.Core/NaCl.Core.csproj +++ b/src/NaCl.Core/NaCl.Core.csproj @@ -1,8 +1,8 @@ - netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1 - netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net48 + netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;netcoreapp5.0 + netstandard1.6;netstandard2.0;netstandard2.1;netcoreapp3.1;netcoreapp5.0;net45;net48 true 1.2.0 David De Smet @@ -26,10 +26,11 @@ - - FCL_BITOPS + + FCL_BITOPS;INTRINSICS;SPANSTACKALLOC + true - + diff --git a/test/NaCl.Core.Benchmarks/ChaCha20Poly1305Benchmark.cs b/test/NaCl.Core.Benchmarks/ChaCha20Poly1305Benchmark.cs index ac4c550..b6654a6 100644 --- a/test/NaCl.Core.Benchmarks/ChaCha20Poly1305Benchmark.cs +++ b/test/NaCl.Core.Benchmarks/ChaCha20Poly1305Benchmark.cs @@ -20,7 +20,8 @@ public class ChaCha20Poly1305Benchmark private byte[] message; private byte[] tag; private byte[] aad; - + private Memory ciphertext; + private ChaCha20Poly1305 aead; [Params( @@ -50,14 +51,15 @@ public void Setup() rnd.NextBytes(aad); aead = new ChaCha20Poly1305(key); + + ciphertext = new byte[message.Length]; } [Benchmark] [BenchmarkCategory("Encryption")] public void Encrypt() { - var ciphertext = new byte[message.Length]; - aead.Encrypt(nonce, message, 
ciphertext, tag, aad); + aead.Encrypt(nonce, message, ciphertext.Span, tag, aad); } [Benchmark] diff --git a/test/NaCl.Core.Benchmarks/NaCl.Core.Benchmarks.csproj b/test/NaCl.Core.Benchmarks/NaCl.Core.Benchmarks.csproj index cca4c1f..69901d5 100644 --- a/test/NaCl.Core.Benchmarks/NaCl.Core.Benchmarks.csproj +++ b/test/NaCl.Core.Benchmarks/NaCl.Core.Benchmarks.csproj @@ -1,8 +1,8 @@ - + Exe - netcoreapp3.1 + netcoreapp3.1;net48 diff --git a/test/NaCl.Core.Benchmarks/Program.cs b/test/NaCl.Core.Benchmarks/Program.cs index 1bdc82b..10b90c0 100644 --- a/test/NaCl.Core.Benchmarks/Program.cs +++ b/test/NaCl.Core.Benchmarks/Program.cs @@ -10,12 +10,13 @@ static void Main(string[] args) { // Execute following code: // $ dotnet run -c release --framework netcoreapp3.1 - //BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); - BenchmarkRunner.Run(); - BenchmarkRunner.Run(); - BenchmarkRunner.Run(); - BenchmarkRunner.Run(); - BenchmarkRunner.Run(); + + BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); + //BenchmarkRunner.Run(args); + //BenchmarkRunner.Run(args); + //BenchmarkRunner.Run(args); + //BenchmarkRunner.Run(args); + //BenchmarkRunner.Run(args); Console.ReadLine(); } diff --git a/test/NaCl.Core.Benchmarks/RandomNumberGenerator.cs b/test/NaCl.Core.Benchmarks/RandomNumberGenerator.cs new file mode 100644 index 0000000..442f83a --- /dev/null +++ b/test/NaCl.Core.Benchmarks/RandomNumberGenerator.cs @@ -0,0 +1,17 @@ +using System; + +namespace NaCl.Core.Benchmarks +{ +#if NET48 + public static class RandomNumberGenerator + { + public static void Fill(Span data) + { + var random = System.Security.Cryptography.RandomNumberGenerator.Create(); + var dataBytes = new byte[data.Length]; + random.GetBytes(dataBytes); + dataBytes.CopyTo(data); + } + } +#endif +} diff --git a/test/NaCl.Core.Benchmarks/Run benchmarks.bat b/test/NaCl.Core.Benchmarks/Run benchmarks.bat new file mode 100644 index 0000000..34f5801 --- /dev/null +++ 
b/test/NaCl.Core.Benchmarks/Run benchmarks.bat @@ -0,0 +1 @@ +dotnet run -c Release -f netcoreapp3.1 --runtimes net48 netcoreapp3.1 \ No newline at end of file diff --git a/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj b/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj index bac8f8e..e7e57e7 100644 --- a/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj +++ b/test/NaCl.Core.Tests/NaCl.Core.Tests.csproj @@ -1,7 +1,7 @@  - netcoreapp3.1 + netcoreapp3.1;net48 latest true @@ -23,6 +23,7 @@ + diff --git a/test/NaCl.Core.Tests/RandomNumberGenerator.cs b/test/NaCl.Core.Tests/RandomNumberGenerator.cs new file mode 100644 index 0000000..846cb44 --- /dev/null +++ b/test/NaCl.Core.Tests/RandomNumberGenerator.cs @@ -0,0 +1,17 @@ +using System; + +namespace NaCl.Core.Tests +{ +#if NET48 + public static class RandomNumberGenerator + { + public static void Fill(Span data) + { + var random = System.Security.Cryptography.RandomNumberGenerator.Create(); + var dataBytes = new byte[data.Length]; + random.GetBytes(dataBytes); + dataBytes.CopyTo(data); + } + } +#endif +} From 0ec9bc087f5511bf378617b01ab84038bd2ec005 Mon Sep 17 00:00:00 2001 From: macaba Date: Fri, 28 Aug 2020 22:26:00 +0100 Subject: [PATCH 2/3] Fixed regression --- src/NaCl.Core/Base/SnufflePoly1305.cs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/NaCl.Core/Base/SnufflePoly1305.cs b/src/NaCl.Core/Base/SnufflePoly1305.cs index d0128cf..850c8e5 100644 --- a/src/NaCl.Core/Base/SnufflePoly1305.cs +++ b/src/NaCl.Core/Base/SnufflePoly1305.cs @@ -291,19 +291,13 @@ private Span GetMacKey(ReadOnlySpan nonce) /// The associated data. /// The ciphertext. /// System.Byte[]. - [ThreadStatic] - private static byte[] macDataBytes = new byte[0]; private ReadOnlySpan GetMacDataRfc8439(ReadOnlySpan aad, ReadOnlySpan ciphertext) { var aadPaddedLen = (aad.Length % 16 == 0) ? aad.Length : (aad.Length + 16 - aad.Length % 16); var ciphertextLen = ciphertext.Length; var ciphertextPaddedLen = (ciphertextLen % 16 == 0) ? 
ciphertextLen : (ciphertextLen + 16 - ciphertextLen % 16); - var macDataLength = aadPaddedLen + ciphertextPaddedLen + 16; - if (macDataBytes.Length < macDataLength) - macDataBytes = new byte[macDataLength]; - - Span macData = macDataBytes; + Span macData = new byte[aadPaddedLen + ciphertextPaddedLen + 16]; // Mac Text aad.CopyTo(macData); From 28d47c9360882774e6b9c1d6d28ae7782dbe1340 Mon Sep 17 00:00:00 2001 From: macaba Date: Sat, 29 Aug 2020 15:30:33 +0100 Subject: [PATCH 3/3] Added & used SpanOwner to memory pool GetMacKey and GetMacDataRfc8439 --- src/NaCl.Core/Base/SnufflePoly1305.cs | 73 ++++++++---- src/NaCl.Core/Internal/SpanOwner.cs | 163 ++++++++++++++++++++++++++ src/NaCl.Core/NaCl.Core.csproj | 1 + 3 files changed, 216 insertions(+), 21 deletions(-) create mode 100644 src/NaCl.Core/Internal/SpanOwner.cs diff --git a/src/NaCl.Core/Base/SnufflePoly1305.cs b/src/NaCl.Core/Base/SnufflePoly1305.cs index 850c8e5..b1a5d6d 100644 --- a/src/NaCl.Core/Base/SnufflePoly1305.cs +++ b/src/NaCl.Core/Base/SnufflePoly1305.cs @@ -1,9 +1,11 @@ namespace NaCl.Core.Base { using System; + using System.Buffers; using System.Diagnostics.CodeAnalysis; using System.Linq; using System.Security.Cryptography; + using Microsoft.Toolkit.HighPerformance.Buffers; using Internal; @@ -107,14 +109,19 @@ public virtual byte[] Encrypt(ReadOnlySpan nonce, ReadOnlySpan plain var ciphertext = _snuffle.Encrypt(plaintext, nonce); - var tag = Poly1305.ComputeMac(GetMacKey(nonce), GetMacDataRfc8439(associatedData, ciphertext)); - - // Array.Resize(ref ciphertext, ciphertext.Length + Poly1305.MAC_TAG_SIZE_IN_BYTES); - // Array.Copy(tag, 0, ciphertext, ciphertext.Length - Poly1305.MAC_TAG_SIZE_IN_BYTES, tag.Length); - - // return ciphertext; - // return ciphertext.Concat(tag).ToArray(); // could be inefficient - return CryptoBytes.Combine(ciphertext, tag); + using (var macKey = GetMacKey(nonce)) + using (var macData = GetMacDataRfc8439(associatedData, ciphertext)) + { + var tag = 
Poly1305.ComputeMac(macKey.Span, macData.Span); + macKey.Span.Clear(); + macData.Span.Clear(); + // Array.Resize(ref ciphertext, ciphertext.Length + Poly1305.MAC_TAG_SIZE_IN_BYTES); + // Array.Copy(tag, 0, ciphertext, ciphertext.Length - Poly1305.MAC_TAG_SIZE_IN_BYTES, tag.Length); + + // return ciphertext; + // return ciphertext.Concat(tag).ToArray(); // could be inefficient + return CryptoBytes.Combine(ciphertext, tag); + } } /// @@ -144,7 +151,13 @@ public void Encrypt(ReadOnlySpan nonce, ReadOnlySpan plaintext, Span // throw new ArgumentException($"The {nameof(plaintext)} is too long."); _snuffle.Encrypt(plaintext, nonce, ciphertext); - Poly1305.ComputeMac(GetMacKey(nonce), GetMacDataRfc8439(associatedData, ciphertext), tag); + using (var macKey = GetMacKey(nonce)) + using (var macData = GetMacDataRfc8439(associatedData, ciphertext)) + { + Poly1305.ComputeMac(macKey.Span, macData.Span, tag); + macKey.Span.Clear(); + macData.Span.Clear(); + } } /// @@ -217,7 +230,13 @@ public virtual byte[] Decrypt(ReadOnlySpan nonce, ReadOnlySpan ciphe try { - Poly1305.VerifyMac(GetMacKey(nonce), GetMacDataRfc8439(associatedData, ciphertext.Slice(0, limit)), ciphertext.Slice(limit, Poly1305.MAC_TAG_SIZE_IN_BYTES)); + using (var macKey = GetMacKey(nonce)) + using (var macData = GetMacDataRfc8439(associatedData, ciphertext.Slice(0, limit))) + { + Poly1305.VerifyMac(macKey.Span, macData.Span, ciphertext.Slice(limit, Poly1305.MAC_TAG_SIZE_IN_BYTES)); + macKey.Span.Clear(); + macData.Span.Clear(); + } } catch (Exception ex) { @@ -255,7 +274,13 @@ public virtual void Decrypt(ReadOnlySpan nonce, ReadOnlySpan ciphert try { - Poly1305.VerifyMac(GetMacKey(nonce), GetMacDataRfc8439(associatedData, ciphertext), tag); + using (var macKey = GetMacKey(nonce)) + using (var macData = GetMacDataRfc8439(associatedData, ciphertext)) + { + Poly1305.VerifyMac(macKey.Span, macData.Span, tag); + macKey.Span.Clear(); + macData.Span.Clear(); + } } catch (Exception ex) { @@ -270,7 +295,7 @@ public 
virtual void Decrypt(ReadOnlySpan nonce, ReadOnlySpan ciphert /// /// The nonce. /// System.Byte[]. - private Span GetMacKey(ReadOnlySpan nonce) + private SpanOwner GetMacKey(ReadOnlySpan nonce) { //var firstBlock = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; //_macKeySnuffle.ProcessKeyStreamBlock(nonce, 0, firstBlock); @@ -279,10 +304,16 @@ private Span GetMacKey(ReadOnlySpan nonce) //Array.Copy(firstBlock, result, result.Length); //return result; - Span firstBlock = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; - _macKeySnuffle.ProcessKeyStreamBlock(nonce, 0, firstBlock); + using (var firstBlock = SpanOwner.Allocate(Snuffle.BLOCK_SIZE_IN_BYTES, AllocationMode.Clear)) + { + //Span firstBlock = new byte[Snuffle.BLOCK_SIZE_IN_BYTES]; + _macKeySnuffle.ProcessKeyStreamBlock(nonce, 0, firstBlock.Span); - return firstBlock.Slice(0, Poly1305.MAC_KEY_SIZE_IN_BYTES); + var macKey = SpanOwner.Allocate(Poly1305.MAC_KEY_SIZE_IN_BYTES, AllocationMode.Clear); + firstBlock.Span.Slice(0, Poly1305.MAC_KEY_SIZE_IN_BYTES).CopyTo(macKey.Span); + firstBlock.Span.Clear(); + return macKey; + } } /// @@ -291,23 +322,23 @@ private Span GetMacKey(ReadOnlySpan nonce) /// The associated data. /// The ciphertext. /// System.Byte[]. - private ReadOnlySpan GetMacDataRfc8439(ReadOnlySpan aad, ReadOnlySpan ciphertext) + private SpanOwner GetMacDataRfc8439(ReadOnlySpan aad, ReadOnlySpan ciphertext) { var aadPaddedLen = (aad.Length % 16 == 0) ? aad.Length : (aad.Length + 16 - aad.Length % 16); var ciphertextLen = ciphertext.Length; var ciphertextPaddedLen = (ciphertextLen % 16 == 0) ? 
ciphertextLen : (ciphertextLen + 16 - ciphertextLen % 16); - Span macData = new byte[aadPaddedLen + ciphertextPaddedLen + 16]; + var macData = SpanOwner.Allocate(aadPaddedLen + ciphertextPaddedLen + 16, AllocationMode.Clear); // Mac Text - aad.CopyTo(macData); - ciphertext.CopyTo(macData.Slice(aadPaddedLen, ciphertextLen)); + aad.CopyTo(macData.Span); + ciphertext.CopyTo(macData.Span.Slice(aadPaddedLen, ciphertextLen)); // Mac Length //macData[aadPaddedLen + ciphertextPaddedLen] = (byte)aad.Length; //macData[aadPaddedLen + ciphertextPaddedLen + 8] = (byte)ciphertextLen; - SetMacLength(macData, aadPaddedLen + ciphertextPaddedLen, aad.Length); - SetMacLength(macData, aadPaddedLen + ciphertextPaddedLen + sizeof(ulong), ciphertextLen); + SetMacLength(macData.Span, aadPaddedLen + ciphertextPaddedLen, aad.Length); + SetMacLength(macData.Span, aadPaddedLen + ciphertextPaddedLen + sizeof(ulong), ciphertextLen); return macData; } diff --git a/src/NaCl.Core/Internal/SpanOwner.cs b/src/NaCl.Core/Internal/SpanOwner.cs new file mode 100644 index 0000000..0027620 --- /dev/null +++ b/src/NaCl.Core/Internal/SpanOwner.cs @@ -0,0 +1,163 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Buffers; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +//using Microsoft.Toolkit.HighPerformance.Buffers.Views; +//using Microsoft.Toolkit.HighPerformance.Extensions; + +namespace Microsoft.Toolkit.HighPerformance.Buffers +{ + /// + /// An that indicates a mode to use when allocating buffers. + /// + public enum AllocationMode + { + /// + /// The default allocation mode for pooled memory (rented buffers are not cleared). + /// + Default, + + /// + /// Clear pooled buffers when renting them. 
+ /// + Clear + } + + /// + /// A stack-only type with the ability to rent a buffer of a specified length and getting a from it. + /// This type mirrors but without allocations and with further optimizations. + /// As this is a stack-only type, it relies on the duck-typed pattern introduced with C# 8. + /// It should be used like so: + /// + /// using (SpanOwner<byte> buffer = SpanOwner<byte>.Allocate(1024)) + /// { + /// // Use the buffer here... + /// } + /// + /// As soon as the code leaves the scope of that block, the underlying buffer will automatically + /// be disposed. The APIs in rely on this pattern for extra performance, eg. they don't perform + /// the additional checks that are done in to ensure that the buffer hasn't been disposed + /// before returning a or instance from it. + /// As such, this type should always be used with a block or expression. + /// Not doing so will cause the underlying buffer not to be returned to the shared pool. + /// + /// The type of items to store in the current instance. + //[DebuggerTypeProxy(typeof(SpanOwnerDebugView<>))] + [DebuggerDisplay("{ToString(),raw}")] + public readonly ref struct SpanOwner + { +#pragma warning disable IDE0032 + /// + /// The usable length within . + /// + private readonly int length; +#pragma warning restore IDE0032 + + /// + /// The underlying array. + /// + private readonly T[] array; + + /// + /// Initializes a new instance of the struct with the specified parameters. + /// + /// The length of the new memory buffer to use. + /// Indicates the allocation mode to use for the new buffer to rent. + private SpanOwner(int length, AllocationMode mode) + { + this.length = length; + this.array = ArrayPool.Shared.Rent(length); + + if (mode == AllocationMode.Clear) + { + this.array.AsSpan(0, length).Clear(); + } + } + + /// + /// Gets an empty instance. 
+ /// + public static SpanOwner Empty + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => new SpanOwner(0, AllocationMode.Default); + } + + /// + /// Creates a new instance with the specified parameters. + /// + /// The length of the new memory buffer to use. + /// A instance of the requested length. + /// Thrown when is not valid. + /// This method is just a proxy for the constructor, for clarity. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static SpanOwner Allocate(int size) => new SpanOwner(size, AllocationMode.Default); + + /// + /// Creates a new instance with the specified parameters. + /// + /// The length of the new memory buffer to use. + /// Indicates the allocation mode to use for the new buffer to rent. + /// A instance of the requested length. + /// Thrown when is not valid. + /// This method is just a proxy for the constructor, for clarity. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static SpanOwner Allocate(int size, AllocationMode mode) => new SpanOwner(size, mode); + + /// + /// Gets the number of items in the current instance + /// + public int Length + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => this.length; + } + + /// + /// Gets a wrapping the memory belonging to the current instance. + /// + public Span Span + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => new Span(array, 0, this.length); + } + + /// + /// Returns a reference to the first element within the current instance, with no bounds check. + /// + /// A reference to the first element within the current instance. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ref T DangerousGetReference() + { + //return ref array.DangerousGetReference(); + return ref MemoryMarshal.GetReference(array); + } + + /// + /// Implements the duck-typed method. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Dispose() + { + ArrayPool.Shared.Return(array, clearArray: true); /* zero the rented array before it goes back to the shared pool, so its contents are wiped even when the owning using block exits via an exception */ + } + + /// + public override string ToString() + { + if (typeof(T) == typeof(char) && + this.array is char[] chars) + { + return new string(chars, 0, this.length); + } + + // Same representation used in Span + return $"Microsoft.Toolkit.HighPerformance.Buffers.SpanOwner<{typeof(T)}>[{this.length}]"; + } + } +} \ No newline at end of file diff --git a/src/NaCl.Core/NaCl.Core.csproj b/src/NaCl.Core/NaCl.Core.csproj index e9d2da5..55c994d 100644 --- a/src/NaCl.Core/NaCl.Core.csproj +++ b/src/NaCl.Core/NaCl.Core.csproj @@ -15,6 +15,7 @@ https://github.com/idaviddesmet/NaCl.Core.git git master + 8.0