Skip to content

Commit

Permalink
utf simd conversion test
Browse files Browse the repository at this point in the history
  • Loading branch information
CptMoore committed Jan 18, 2025
1 parent de80b4a commit a1d1a0d
Show file tree
Hide file tree
Showing 7 changed files with 62,727 additions and 33 deletions.
2 changes: 1 addition & 1 deletion ModTek.Preloader/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ case ${os_type} in
# Workaround for a bug that is patched out in newer versions of mono.
export TERM=xterm

export LD_PRELOAD="${BASEDIR}/libdoorstop.so:${LD_PRELOAD:-}"
export LD_PRELOAD="${BASEDIR}/libdoorstop.so:${BASEDIR}/libsimdutfexport.so:${LD_PRELOAD:-}"
LD_PRELOAD="${LD_PRELOAD%:}"
;;
Darwin*)
Expand Down
257 changes: 225 additions & 32 deletions ModTek/Features/Logging/FastBuffer.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Security;
using System.Text;
using System.Threading;
using ModTek.Util.Stopwatch;

namespace ModTek.Features.Logging;
Expand Down Expand Up @@ -86,50 +88,59 @@ internal void Append(byte[] value)
private static int FindMemCpyThreshold()
{
const int MaxSize = 4 * 1024;
const int StepSize = 256;
const int MinSize = 256;
const int Steps = (MaxSize - MinSize) / StepSize;
var byteBufferTicks = new long[Steps];
var memCpyTicks = new long[Steps];
var srcA = new byte[MaxSize];
var srcB = new byte[MaxSize];
for (var i = 0; i < MaxSize; i++)
{
srcA[i] = (byte)i;
srcB[i] = (byte)i;
}
var dstA = new byte[MaxSize];
var dst = stackalloc byte[MaxSize];

const int TestRunsPerSize = 100;
var byteBufferTicks = new long[TestRunsPerSize];
var memCpyTicks = new long[TestRunsPerSize];

const int WarmupCount = 100;
for (var w = 0; w < WarmupCount + 1; w++)
var benchStart = MTStopwatch.GetTimestamp();

do
{
var shouldMeasure = w == WarmupCount;
const int StepSize = 256;
const int ThresholdMin = 256;
for (var size=ThresholdMin+StepSize; size<=MaxSize; size+=StepSize) {
for (var run = 0; run < TestRunsPerSize; run++)
{
var start = shouldMeasure ? MTStopwatch.GetTimestamp() : 0;
Buffer.BlockCopy(srcA, 0, dstA, 0, size);
if (shouldMeasure)
{
byteBufferTicks[run] = MTStopwatch.GetTimestamp() - start;
}
}
for (var run = 0; run < TestRunsPerSize; run++)
for (var step = 0; step < Steps; step++)
{
var size = step * StepSize + MinSize;
{
var start = shouldMeasure ? MTStopwatch.GetTimestamp() : 0;
fixed (byte* bytes = srcA)
{
Memcpy256(dst, bytes, size);
}
if (shouldMeasure)
var start = MTStopwatch.GetTimestamp();
for (var run = 0; run < TestRunsPerSize; run++)
{
memCpyTicks[run] = MTStopwatch.GetTimestamp() - start;
Buffer.BlockCopy(srcA, 0, dstA, 0, size);
}
byteBufferTicks[step] = MTStopwatch.GetTimestamp() - start;
}
if (shouldMeasure)
{
if (MTStopwatch.TicksMin(memCpyTicks) > MTStopwatch.TicksMin(byteBufferTicks))
var start = MTStopwatch.GetTimestamp();
for (var run = 0; run < TestRunsPerSize; run++)
{
return size - StepSize;
fixed (byte* dst = dstA)
{
fixed (byte* src = srcB)
{
Memcpy256(dst, src, size);
}
}
}
memCpyTicks[step] = MTStopwatch.GetTimestamp() - start;
}
}
} while (MTStopwatch.TimeSpanFromTicks(MTStopwatch.GetTimestamp() - benchStart).TotalMilliseconds < 10);

for (var step = 0; step < Steps; step++)
{
if (memCpyTicks[step] > byteBufferTicks[step] )
{
return Math.Max((step - 1) * StepSize + MinSize, MinSize);
}
}
return MaxSize;
}
Expand All @@ -141,7 +152,120 @@ internal void Append(int value)
FormattingHelpers.WriteDigits(position, (uint)value, digits);
}


// Benchmark timers, one per string-append strategy that Append(string) measures.
// Each one receives an EndMeasurement call per benchmarked Append(string) invocation.
internal static readonly MTStopwatch AppendNativeStopwatch = new();
internal static readonly MTStopwatch AppendManagedStopwatch = new();
internal static readonly MTStopwatch AppendGetBytesStopwatch = new();
// Number of Append(string) calls that reached the benchmark path; bumped via Interlocked.Increment.
// When it reaches 1_000 the three stopwatches above are reset.
private static long counter;
/// <summary>
/// Appends <paramref name="value"/> to the buffer while timing the competing
/// UTF-16 to UTF-8 conversion strategies against each other.
/// </summary>
/// <remarks>
/// Strings of 300 chars or more bypass the benchmark and are appended via
/// <c>AppendGetBytes</c> only. For shorter strings every strategy is run from the same
/// starting position; the write position is rewound between runs, so only the output
/// of the final <c>AppendGetBytes</c> call remains in the buffer.
/// </remarks>
/// <param name="value">The text to append.</param>
internal void Append(string value)
{
    //if (value.Length is >= 8 and < 32)
    if (value.Length >= 300)
    {
        AppendGetBytes(value);
        return;
    }

    // Reset all timers on the 1000th benchmarked call (presumably to discard warm-up samples).
    var benchmarkedCalls = Interlocked.Increment(ref counter);
    if (benchmarkedCalls == 1_000)
    {
        AppendNativeStopwatch.Reset();
        AppendManagedStopwatch.Reset();
        AppendGetBytesStopwatch.Reset();
    }

    // Every strategy has to start writing at the same position.
    var rewindLength = _length;

    // The native call is currently disabled, so this records an empty measurement.
    var nativeStart = MTStopwatch.GetTimestamp();
    //AppendNative(value);
    AppendNativeStopwatch.EndMeasurement(nativeStart);
    _length = rewindLength;

    var managedStart = MTStopwatch.GetTimestamp();
    AppendManaged(value);
    AppendManagedStopwatch.EndMeasurement(managedStart);
    _length = rewindLength;

    // Last run is not rewound: its output is what stays in the buffer.
    var getBytesStart = MTStopwatch.GetTimestamp();
    AppendGetBytes(value);
    AppendGetBytesStopwatch.EndMeasurement(getBytesStart);
}

/// <summary>
/// Encodes <paramref name="value"/> as UTF-8 straight into the buffer using
/// <see cref="Encoding.UTF8"/> and advances the write position by the bytes written.
/// </summary>
/// <param name="value">The text to append.</param>
[MethodImpl(MethodImplOptions.NoInlining)]
private void AppendGetBytes(string value)
{
    // Reserve the worst case up front: four bytes for every UTF-16 code unit.
    const int Utf8MaxBytesPerChar = 4;
    var charCount = value.Length;
    var worstCaseLength = _length + charCount * Utf8MaxBytesPerChar;
    EnsureCapacity(worstCaseLength);

    var bytesWritten = Encoding.UTF8.GetBytes(value, 0, charCount, _buffer, _length);
    _length += bytesWritten;
}

/// <summary>
/// Counts the UTF-16 code units in <paramref name="value"/> that lie outside the
/// ASCII range, i.e. whose value is greater than 0x7F.
/// </summary>
/// <remarks>
/// Four code units are inspected at a time through a single <c>ulong</c> read; the
/// per-char count is only done for words that contain at least one non-ASCII unit.
/// The pointer advances in step with the number of chars consumed, so no memory
/// beyond the string's chars is read.
/// </remarks>
/// <param name="value">The string to scan.</param>
/// <returns>The number of non-ASCII code units; 0 for an empty string.</returns>
private static int CountNonAscii(string value)
{
    // Bits 7..15 of each of the four 16-bit lanes; any set bit means that lane is > 0x7F.
    // The mask is identical per lane, so it does not depend on byte order.
    const ulong NonAsciiBitmask = 0xFF80_FF80_FF80_FF80ul;
    const int CharsPerWord = 4; // one ulong holds four UTF-16 code units
    const char LastAscii = (char)0x7F;

    var remaining = value.Length;
    var nonAsciiCount = 0;
    fixed (char* chars = value)
    {
        var ptr = chars;

        for (; remaining >= CharsPerWord; remaining -= CharsPerWord, ptr += CharsPerWord)
        {
            if ((*(ulong*)ptr & NonAsciiBitmask) == 0)
            {
                // all four code units are ASCII
                continue;
            }

            for (var i = 0; i < CharsPerWord; i++)
            {
                if (ptr[i] > LastAscii)
                {
                    nonAsciiCount++;
                }
            }
        }

        // fewer than four code units left: check them one by one
        for (; remaining > 0; remaining--, ptr++)
        {
            if (*ptr > LastAscii)
            {
                nonAsciiCount++;
            }
        }
    }
    return nonAsciiCount;
}

/// <summary>
/// Converts <paramref name="value"/> from UTF-16 to UTF-8 with the native
/// <c>libsimdutfexport</c> routine, writing at the current end of the buffer and
/// advancing the write position by the returned count.
/// </summary>
/// <remarks>
/// A negative result from the native call is treated as "not enough room": the buffer
/// is grown and the conversion retried once. NOTE(review): the exact meaning of the
/// negative value is defined in simdutfexport.cpp, which is not visible here - confirm.
/// </remarks>
/// <param name="value">The text to append; an empty string is a no-op.</param>
[MethodImpl(MethodImplOptions.NoInlining)]
private void AppendNative(string value)
{
    var valueLength = value.Length;
    if (valueLength == 0)
    {
        return;
    }

    fixed (char* chars = value)
    {
        var srcPtr = (IntPtr)chars;

        var processed = (int)convert_utf16le_to_utf8(
            srcPtr, (ulong)valueLength, (IntPtr)(_bufferPtr + _length), (ulong)CapacityLeft);
        if (processed < 0)
        {
            EnsureCapacity(_length + valueLength - processed);
            // The destination must be recomputed here: growing the buffer may replace it,
            // which would leave a pointer taken before EnsureCapacity pointing at the old memory.
            processed = (int)convert_utf16le_to_utf8(
                srcPtr, (ulong)valueLength, (IntPtr)(_bufferPtr + _length), (ulong)CapacityLeft);
        }

        if (processed < 0)
        {
            // Still failing after growing: adding a negative count would rewind _length
            // over already written data, so use the managed encoder instead.
            AppendGetBytes(value);
            return;
        }

        _length += processed;
    }
}

// P/Invoke binding (cdecl) to the convert_utf16le_to_utf8 export of the native libsimdutfexport library.
// utf16/utf16words: pointer to the UTF-16 source and the count passed for it (AppendNative passes the string's char count).
// utf8/utf8space: pointer to the destination and the number of bytes available there.
// AppendNative adds a non-negative result to the write position and treats a negative result as
// "destination too small" - NOTE(review): confirm the return contract against simdutfexport.cpp.
[DllImport("libsimdutfexport", CallingConvention = CallingConvention.Cdecl)]
[SuppressUnmanagedCodeSecurity]
private static extern long convert_utf16le_to_utf8(IntPtr utf16, ulong utf16words, IntPtr utf8, ulong utf8space);

[MethodImpl(MethodImplOptions.NoInlining)]
private void AppendManaged(string value)
{
var processingCount = value.Length;
if (processingCount == 0)
Expand All @@ -163,7 +287,7 @@ internal void Append(string value)
}
else
{
// this is 10x slower or more (GetBytes has no fast ASCII path and no SIMD in this old .NET)
// this is 2x slower than FastConvert (GetBytes has no fast ASCII path and no SIMD in this old .NET)
var measurement = MTStopwatch.GetTimestamp();
var charIndex = value.Length - processingCount;
_length += charIndex;
Expand All @@ -186,8 +310,75 @@ private static int GetLowerBytePosition()
// batching also has an effect due to fewer ops overall
// 8 is a sweet spot for unrolling and the ulong bit mask check
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool FastConvert(byte* dstPtr, byte* srcPtr, ref int processingCount)
private static bool FastConvert(byte* dstPtr, byte* srcPtr, ref int processingCount)
{
{
const int IterSize = 32;
for (; processingCount >= IterSize; processingCount -= IterSize)
{
*(dstPtr + 0) = *(srcPtr + 0 * 2);
*(dstPtr + 1) = *(srcPtr + 1 * 2);
*(dstPtr + 2) = *(srcPtr + 2 * 2);
*(dstPtr + 3) = *(srcPtr + 3 * 2);
*(dstPtr + 4) = *(srcPtr + 4 * 2);
*(dstPtr + 5) = *(srcPtr + 5 * 2);
*(dstPtr + 6) = *(srcPtr + 6 * 2);
*(dstPtr + 7) = *(srcPtr + 7 * 2);
*(dstPtr + 8) = *(srcPtr + 8 * 2);
*(dstPtr + 9) = *(srcPtr + 9 * 2);
*(dstPtr +10) = *(srcPtr +10 * 2);
*(dstPtr +11) = *(srcPtr +11 * 2);
*(dstPtr +12) = *(srcPtr +12 * 2);
*(dstPtr +13) = *(srcPtr +13 * 2);
*(dstPtr +14) = *(srcPtr +14 * 2);
*(dstPtr +15) = *(srcPtr +15 * 2);
*(dstPtr +16) = *(srcPtr +16 * 2);
*(dstPtr +17) = *(srcPtr +17 * 2);
*(dstPtr +18) = *(srcPtr +18 * 2);
*(dstPtr +19) = *(srcPtr +19 * 2);
*(dstPtr +20) = *(srcPtr +20 * 2);
*(dstPtr +21) = *(srcPtr +21 * 2);
*(dstPtr +22) = *(srcPtr +22 * 2);
*(dstPtr +23) = *(srcPtr +23 * 2);
*(dstPtr +24) = *(srcPtr +24 * 2);
*(dstPtr +25) = *(srcPtr +25 * 2);
*(dstPtr +26) = *(srcPtr +26 * 2);
*(dstPtr +27) = *(srcPtr +27 * 2);
*(dstPtr +28) = *(srcPtr +28 * 2);
*(dstPtr +29) = *(srcPtr +29 * 2);
*(dstPtr +30) = *(srcPtr +30 * 2);
*(dstPtr +31) = *(srcPtr +31 * 2);

const ulong NonAsciiBitmask =
(1ul << (7 + 8 * 7)) +
(1ul << (7 + 8 * 6)) +
(1ul << (7 + 8 * 5)) +
(1ul << (7 + 8 * 4)) +
(1ul << (7 + 8 * 3)) +
(1ul << (7 + 8 * 2)) +
(1ul << (7 + 8 * 1)) +
(1ul << (7 + 8 * 0));
if ((*((ulong*)dstPtr+0) & NonAsciiBitmask) != 0)
{
return false;
}
if ((*((ulong*)dstPtr+1) & NonAsciiBitmask) != 0)
{
return false;
}
if ((*((ulong*)dstPtr+2) & NonAsciiBitmask) != 0)
{
return false;
}
if ((*((ulong*)dstPtr+3) & NonAsciiBitmask) != 0)
{
return false;
}
dstPtr += IterSize;
srcPtr += 2*IterSize;
}
}

{
const int IterSize = 8;
for (; processingCount >= IterSize; processingCount -= IterSize)
Expand Down Expand Up @@ -284,6 +475,8 @@ private void AppendTime(int hours, int minutes, int seconds, long ticks)
return _bufferPtr + length;
}

/// <summary>Number of bytes in the backing buffer that lie at or after the current write position.</summary>
private int CapacityLeft
{
    get
    {
        var capacity = _buffer.Length;
        return capacity - _length;
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void EnsureCapacity(int targetLength)
{
Expand Down
3 changes: 3 additions & 0 deletions ModTek/Features/Logging/MTLoggerAsyncQueue.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ Async internal processing had an average latency of {latencyStats.AverageNanosec
Flushing (to disk) {AppenderFile.FlushStopWatch.GetStats()}.
Filters {AppenderFile.FiltersStopWatch.GetStats()}.
Formatter {AppenderFile.FormatterStopWatch.GetStats()}.
AppendNativeStopwatch {FastBuffer.AppendNativeStopwatch.GetStats()}.
AppendManagedStopwatch {FastBuffer.AppendManagedStopwatch.GetStats()}.
AppendGetBytesStopwatch {FastBuffer.AppendGetBytesStopwatch.GetStats()}.
UTF8-Fallback {FastBuffer.UTF8FallbackStopwatch.GetStats()}.
Write (to OS buffers) {AppenderFile.WriteStopwatch.GetStats()}.
"""
Expand Down
15 changes: 15 additions & 0 deletions ModTek/ModTek.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -104,4 +104,19 @@
<ItemGroup>
<ProjectReference Include="..\ModTek.Common\ModTek.Common.csproj" />
</ItemGroup>

<ItemGroup>
<ClCompile Include="..\simdutf\simdutf.cpp">
<Link>ModTek\simdutf\simdutf.cpp</Link>
</ClCompile>
<ClCompile Include="..\simdutf\simdutfexport.cpp">
<Link>ModTek\simdutf\simdutfexport.cpp</Link>
</ClCompile>
</ItemGroup>

<ItemGroup>
<ClInclude Include="..\simdutf\simdutf.h">
<Link>ModTek\simdutf\simdutf.h</Link>
</ClInclude>
</ItemGroup>
</Project>
Loading

0 comments on commit a1d1a0d

Please sign in to comment.