diff --git a/README.md b/README.md
index ca44b93..b587f8e 100644
--- a/README.md
+++ b/README.md
@@ -32,10 +32,10 @@ fully reproducible.
 
 | processor and base freq. | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
 |:----------------|:------------------------|:-------------------|:-------------------|
-| Apple M2 processor (ARM, 3.5 Ghz) | 6.5 | 3.8 | 1.7 x |
-| AWS Graviton 3 (ARM, 2.6 GHz) | 3.6 | 2.0 | 1.8 x |
-| Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
-| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
+| Apple M2 processor (ARM, 3.5 GHz) | 10 | 3.8 | 2.6 x |
+| AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
+| Intel Ice Lake (2.0 GHz) | 7.6 | 3.4 | 2.2 x |
+| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.9 | 3.0 | 2.3 x |
 
 ## Results (SimdBase64 vs. string .NET functions)
 
diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs
index c74d044..873412e 100644
--- a/benchmark/Benchmark.cs
+++ b/benchmark/Benchmark.cs
@@ -2,13 +2,9 @@
 using BenchmarkDotNet.Running;
 using BenchmarkDotNet.Configs;
 using BenchmarkDotNet.Reports;
-using BenchmarkDotNet.Filters;
 using BenchmarkDotNet.Jobs;
 using System.Text;
-using System.Runtime.InteropServices;
 using BenchmarkDotNet.Columns;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
 
 namespace SimdUnicodeBenchmarks
 {
@@ -464,7 +460,7 @@ public unsafe void RunOurDecodingBenchmarkWithAllocUTF16(string[] data, int[] le
                 if (dataoutput.Length != lengths[i])
                 {
-                    Console.WriteLine($"Error: {dataoutput.Length } != {lengths[i]}");
+                    Console.WriteLine($"Error: {dataoutput.Length} != {lengths[i]}");
 #pragma warning disable CA2201
                     throw new Exception("Error");
                 }
diff --git a/src/Base64.cs b/src/Base64.cs
index 045e764..ec7b6c0 100644
--- a/src/Base64.cs
+++ b/src/Base64.cs
@@ -14,7 +14,8 @@ public static int MaximalBinaryLengthFromBase64(ReadOnlySpan<char> input)
         {
             return Scalar.Base64.MaximalBinaryLengthFromBase64Scalar(input);
         }
-        public static byte[] FromBase64String(string s) {
+        public static byte[] FromBase64String(string s)
+        {
             ReadOnlySpan<char> base64 = s.AsSpan();
             byte[] newBytes = new byte[SimdBase64.Base64.MaximalBinaryLengthFromBase64(base64)];
             int bytesConsumed = 0;
@@ -35,7 +36,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<byte> source,
             //if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
             //{
             //}
-            if (Avx2.IsSupported)
+            if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
             {
                 return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
             }
@@ -60,7 +61,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<char> source,
             //{
             //    return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
             //}
-            if (Avx2.IsSupported)
+            if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
            {
                 return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
            }
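A note on the tightened guard above: the AVX2 kernel calls `Popcnt.X64.PopCount` and `Bmi1.X64.TrailingZeroCount`, and a .NET hardware intrinsic invoked while its `IsSupported` property is false throws `PlatformNotSupportedException` rather than falling back to software. A minimal sketch of the guard-plus-fallback pattern (the `PopCount64` helper is illustrative, not part of this patch):

```csharp
using System.Numerics;
using System.Runtime.Intrinsics.X86;

static class CpuDispatch
{
    // Every intrinsic class used inside a kernel needs its own IsSupported
    // check; Avx2.IsSupported alone does not guarantee POPCNT or BMI1
    // (the runtime can disable them independently).
    public static int PopCount64(ulong v)
        => Popcnt.X64.IsSupported
            ? (int)Popcnt.X64.PopCount(v)  // hardware POPCNT
            : BitOperations.PopCount(v);   // portable software fallback
}
```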
diff --git a/src/Base64ARM.cs b/src/Base64ARM.cs
index 99ce7e9..dad3362 100644
--- a/src/Base64ARM.cs
+++ b/src/Base64ARM.cs
@@ -219,6 +219,65 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
     {
+        // if mask is a power of 2, we can use a simpler version
+        if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+        {
+            int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
+            int pos = pos64 & 0xf;
+            Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+            Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
+            switch (pos64 >> 4)
+            {
+                case 3:
+                {
+                    Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                    Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk0, sh);
+                    Vector128.Store(compressed, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16 - 1);
+                    Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 2:
+                {
+                    Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                    Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk1, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(compressed, output + 1 * 16);
+                    Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 1:
+                {
+                    Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                    Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16);
+                    Vector128.Store(compressed, output + 2 * 16);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 0:
+                {
+                    Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                    Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk3, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16);
+                    Vector128.Store(b.chunk2, output + 2 * 16);
+                    Vector128.Store(compressed, output + 3 * 16);
+                }
+                break;
+            }
+            return 63;
+        }
         ulong nmask = ~mask;
         Compress(b.chunk0, (ushort)mask, output, tablePtr);
         Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);
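All three fast paths implement the same trick: when exactly one byte of the 64-byte block is flagged for removal (the mask is a power of two), the affected 16-byte chunk can be compacted with a single table lookup instead of four popcount-driven `Compress` calls. The comparison yields 0xFF (i.e. -1) in every lane past the flagged position, and subtracting that from the identity permutation bumps those indices by one, so the lookup skips the flagged byte. A minimal cross-platform sketch of the index construction, assuming .NET 7+'s `Vector128.Shuffle`; `RemoveAt` is an illustrative name, not part of the library:

```csharp
using System.Runtime.Intrinsics;

static class SingleByteRemoval
{
    // Remove the byte at index `pos` (0..15) from a 16-byte block,
    // shifting the tail left by one lane.
    public static Vector128<byte> RemoveAt(Vector128<byte> block, byte pos)
    {
        Vector128<byte> identity = Vector128.Create(
            (byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        // Signed compare: lanes with index > pos - 1 become 0xFF (-1).
        Vector128<byte> past = Vector128.GreaterThan(
            identity.AsSByte(), Vector128.Create((sbyte)(pos - 1))).AsByte();
        // Subtracting -1 adds 1, so indices after `pos` skip one byte.
        Vector128<byte> indices = identity - past;
        // Lane 15 gets index 16; Vector128.Shuffle zeroes out-of-range lanes,
        // and the kernels overwrite that lane with the next overlapping store.
        return Vector128.Shuffle(block, indices);
    }
}
```

The ARM path computes the position via `LeadingZeroCount` (hence the reversed case order and the `0xe - pos` threshold), while the x86 paths use trailing-zero counts; the resulting shuffle vectors are identical.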
diff --git a/src/Base64AVX2UTF8.cs b/src/Base64AVX2UTF8.cs
index 4b4901b..e4a7f0b 100644
--- a/src/Base64AVX2UTF8.cs
+++ b/src/Base64AVX2UTF8.cs
@@ -167,6 +167,69 @@ private static UInt64 ToBase64Mask(bool base64Url, ref Vector256<byte> src, ref
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
     {
+        // if mask is a power of 2, we can use a simpler version
+        if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+        {
+            ulong pos64 = Bmi1.X64.TrailingZeroCount(mask);
+            ulong pos = pos64 & 0xf;
+            Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+            Vector128<byte> v0 = Vector128.Create((byte)(pos - 1));
+            switch (pos64 >> 4)
+            {
+                case 0:
+                {
+                    Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
+                    Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
+                    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                    Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
+                    Vector128.Store(compressed, output + 0 * 16);
+                    Vector128.Store(chunk1, output + 1 * 16 - 1);
+                    Vector256.Store(b.chunk1, output + 2 * 16 - 1);
+                }
+                break;
+
+                case 1:
+                {
+                    Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
+                    Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
+                    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                    Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
+                    Vector128.Store(chunk0, output + 0 * 16);
+                    Vector128.Store(compressed, output + 1 * 16);
+                    Vector256.Store(b.chunk1, output + 2 * 16 - 1);
+                }
+                break;
+
+                case 2:
+                {
+                    Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
+                    Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk1, 1);
+                    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                    Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
+                    Vector256.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(compressed, output + 2 * 16);
+                    Vector128.Store(chunk1, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 3:
+                {
+                    Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
+                    Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk1, 1);
+                    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                    Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
+                    Vector256.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(chunk0, output + 2 * 16);
+                    Vector128.Store(compressed, output + 3 * 16);
+                }
+                break;
+            }
+            return 63;
+        }
         ulong nmask = ~mask;
         Compress(b.chunk0, (UInt32)mask, output, tablePtr);
         Compress(b.chunk1, (UInt32)(mask >> 32), output + Popcnt.X64.PopCount(nmask & 0xFFFFFFFF), tablePtr);
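After the compacted chunk, every later chunk is stored one byte to the left (`output + n * 16 - 1`): each 16-byte store simply overwrites the single junk byte left in the last lane of its predecessor, which is cheaper than shifting data across registers. A sketch of that overlap for the "gap in the first chunk" case, under the same assumptions as above (illustrative helper, not the library's API):

```csharp
using System.Runtime.Intrinsics;

static unsafe class OverlappingStores
{
    // `head` holds 15 valid bytes plus one junk byte in lane 15 (left by the
    // shuffle). Storing the next block at output + 15 overwrites that junk
    // byte, and so on: exactly 63 valid bytes land in output[0..63).
    public static void StoreWithGapInFirstChunk(
        Vector128<byte> head, Vector128<byte> b1,
        Vector128<byte> b2, Vector128<byte> b3, byte* output)
    {
        Vector128.Store(head, output + 0 * 16);   // 15 valid bytes + 1 junk
        Vector128.Store(b1, output + 1 * 16 - 1); // overwrites the junk byte
        Vector128.Store(b2, output + 2 * 16 - 1);
        Vector128.Store(b3, output + 3 * 16 - 1);
    }
}
```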
diff --git a/src/Base64SSEUTF8.cs b/src/Base64SSEUTF8.cs
index 1214924..a666d4a 100644
--- a/src/Base64SSEUTF8.cs
+++ b/src/Base64SSEUTF8.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Numerics;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 using System.Runtime.CompilerServices;
@@ -131,6 +132,68 @@ private static ushort ToBase64Mask(bool base64Url, ref Vector128<byte> src, ref
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
     {
+        // if mask is a power of 2, we can use a simpler version
+        if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+        {
+            int pos64 = BitOperations.TrailingZeroCount(mask);
+            int pos = pos64 & 0xf;
+            Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+            Vector128<byte> v0 = Vector128.Create((byte)(pos - 1));
+            switch (pos64 >> 4)
+            {
+                case 0:
+                {
+                    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                    Vector128<byte> compressed = Ssse3.Shuffle(b.chunk0, sh);
+                    Vector128.Store(compressed, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16 - 1);
+                    Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 1:
+                {
+                    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                    Vector128<byte> compressed = Ssse3.Shuffle(b.chunk1, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(compressed, output + 1 * 16);
+                    Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 2:
+                {
+                    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                    Vector128<byte> compressed = Ssse3.Shuffle(b.chunk2, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16);
+                    Vector128.Store(compressed, output + 2 * 16);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 3:
+                {
+                    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                    Vector128<byte> compressed = Ssse3.Shuffle(b.chunk3, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16);
+                    Vector128.Store(b.chunk2, output + 2 * 16);
+                    Vector128.Store(compressed, output + 3 * 16);
+                }
+                break;
+            }
+            return 63;
+        }
         ulong nmask = ~mask;
         Compress(b.chunk0, (ushort)mask, output, tablePtr);
         Compress(b.chunk1, (ushort)(mask >> 16), output + Popcnt.X64.PopCount(nmask & 0xFFFF), tablePtr);
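For sanity-checking a change like this end to end, the SIMD decoder can be compared against the runtime's own decoder. A small harness, assuming `SimdBase64.Base64.FromBase64String` (added in src/Base64.cs above) mirrors `Convert.FromBase64String`'s contract, as the README's comparison implies:

```csharp
using System;
using System.Text;

class AgreementCheck
{
    static void Main()
    {
        // Round-trip an arbitrary payload through both decoders and compare.
        string base64 = Convert.ToBase64String(
            Encoding.UTF8.GetBytes("hello, base64 world"));
        byte[] expected = Convert.FromBase64String(base64);
        byte[] actual = SimdBase64.Base64.FromBase64String(base64);
        Console.WriteLine(actual.AsSpan().SequenceEqual(expected)
            ? "match" : "MISMATCH");
    }
}
```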