From 4f75a4cdae4050eae6435d1018bc3fcd537733f6 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Thu, 13 Mar 2025 18:42:29 -0400
Subject: [PATCH 1/4] improving the performance in the case where ignorable
 characters are uncommon

---
 README.md              |  2 +-
 benchmark/Benchmark.cs |  6 +---
 src/Base64.cs          |  3 +-
 src/Base64ARM.cs       | 68 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index ca44b93..990c794 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ fully reproducible.
 
 | processor and base freq. | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
 |:----------------|:------------------------|:-------------------|:-------------------|
-| Apple M2 processor (ARM, 3.5 Ghz) | 6.5 | 3.8 | 1.7 x |
+| Apple M2 processor (ARM, 3.5 Ghz) | 10 | 3.8 | 2.6 x |
 | AWS Graviton 3 (ARM, 2.6 GHz) | 3.6 | 2.0 | 1.8 x |
 | Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
 | AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |

diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs
index c74d044..873412e 100644
--- a/benchmark/Benchmark.cs
+++ b/benchmark/Benchmark.cs
@@ -2,13 +2,9 @@
 using BenchmarkDotNet.Running;
 using BenchmarkDotNet.Configs;
 using BenchmarkDotNet.Reports;
-using BenchmarkDotNet.Filters;
 using BenchmarkDotNet.Jobs;
 using System.Text;
-using System.Runtime.InteropServices;
 using BenchmarkDotNet.Columns;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
 
 namespace SimdUnicodeBenchmarks
 {
@@ -464,7 +460,7 @@ public unsafe void RunOurDecodingBenchmarkWithAllocUTF16(string[] data, int[] le
 
                 if (dataoutput.Length != lengths[i])
                 {
-                    Console.WriteLine($"Error: {dataoutput.Length } != {lengths[i]}");
+                    Console.WriteLine($"Error: {dataoutput.Length} != {lengths[i]}");
 #pragma warning disable CA2201
                     throw new Exception("Error");
                 }
diff --git a/src/Base64.cs b/src/Base64.cs
index 045e764..b67e504 100644
--- a/src/Base64.cs
+++ b/src/Base64.cs
@@ -14,7 +14,8 @@ public static int MaximalBinaryLengthFromBase64(ReadOnlySpan<byte> input)
         {
             return Scalar.Base64.MaximalBinaryLengthFromBase64Scalar(input);
         }
-        public static byte[] FromBase64String(string s) {
+        public static byte[] FromBase64String(string s)
+        {
             ReadOnlySpan<char> base64 = s.AsSpan();
             byte[] newBytes = new byte[SimdBase64.Base64.MaximalBinaryLengthFromBase64(base64)];
             int bytesConsumed = 0;
diff --git a/src/Base64ARM.cs b/src/Base64ARM.cs
index 99ce7e9..71fecb0 100644
--- a/src/Base64ARM.cs
+++ b/src/Base64ARM.cs
@@ -219,6 +219,74 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
         {
+
+            // if mask is a power of 2, we can use a simpler version
+            if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+            {
+                int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
+                int pos = pos64 & 0xf;
+                Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+
+                Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
+                switch (pos64 >> 4)
+                {
+                    case 3:
+                        {
+                            Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk0, sh);
+                            Vector128.Store(compressed, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16 - 1);
+                            Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 2:
+                        {
+                            Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk1, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(compressed, output + 1 * 16);
+                            Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 1:
+                        {
+                            Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16);
+                            Vector128.Store(compressed, output + 2 * 16);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 0:
+                        {
+                            Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16);
+                            Vector128.Store(b.chunk2, output + 2 * 16);
+                            Vector128.Store(compressed, output + 3 * 16);
+                        }
+                        break;
+                }
+
+
+                return 63;
+
+            }
             ulong nmask = ~mask;
             Compress(b.chunk0, (ushort)mask, output, tablePtr);
             Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);
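
A note on the fast path above: `mask` carries one bit per input byte, set when that byte is to be ignored (whitespace and other characters the decoder skips). The test `(mask & (mask - 1)) == 0` is the usual single-bit check: subtracting 1 flips the lowest set bit and everything below it, so the AND is zero exactly when at most one bit is set. With a single ignorable byte, the 64-byte block shrinks by one byte, hence `return 63`. A scalar model of the same transformation, as a minimal sketch (the helper name and the fallback are illustrative, not part of the patch):

    // Drop the single byte flagged in `mask` from a 64-byte block,
    // closing the gap; returns the number of bytes kept.
    static int CompressBlockScalar(ReadOnlySpan<byte> block, ulong mask, Span<byte> output)
    {
        if (mask != 0 && (mask & (mask - 1)) == 0) // exactly one bit set
        {
            int pos = System.Numerics.BitOperations.TrailingZeroCount(mask);
            block.Slice(0, pos).CopyTo(output);             // bytes before the gap
            block.Slice(pos + 1).CopyTo(output.Slice(pos)); // bytes after it, shifted left by one
            return 63;
        }
        throw new NotImplementedException("general case: table-driven per-chunk compression");
    }

The SIMD version performs the same shift with a table lookup inside the one affected 16-byte chunk and stores every later chunk at an offset of -1. Because the position comes from `LeadingZeroCount` here, `case 3` handles the lowest chunk and the broadcast constant is `0xe - pos`; the x64 ports in PATCH 3/4 use a trailing-zero count, so their case order runs the other way.
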
From 4992306b34da371bb51116d10df72e29f8f32900 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Thu, 13 Mar 2025 19:46:26 -0400
Subject: [PATCH 2/4] updating numbers

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 990c794..9941103 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ fully reproducible.
 | processor and base freq. | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
 |:----------------|:------------------------|:-------------------|:-------------------|
 | Apple M2 processor (ARM, 3.5 Ghz) | 10 | 3.8 | 2.6 x |
-| AWS Graviton 3 (ARM, 2.6 GHz) | 3.6 | 2.0 | 1.8 x |
+| AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
 | Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
 | AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
 

From 29d6bacc853eec1ca2d64f1b8599623f9ca435e4 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Fri, 14 Mar 2025 11:23:15 -0400
Subject: [PATCH 3/4] porting to AVX2

---
 README.md             |  2 +-
 src/Base64.cs         |  4 +--
 src/Base64ARM.cs      | 11 +-------
 src/Base64AVX2UTF8.cs | 63 +++++++++++++++++++++++++++++++++++++++++++
 src/Base64SSEUTF8.cs  | 63 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 130 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 9941103..a1bec7d 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ fully reproducible.
 |:----------------|:------------------------|:-------------------|:-------------------|
 | Apple M2 processor (ARM, 3.5 Ghz) | 10 | 3.8 | 2.6 x |
 | AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
-| Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
+| Intel Ice Lake (2.0 GHz) | 7.6 | 3.4 | 2.2 x |
 | AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
 
 ## Results (SimdBase64 vs. string .NET functions)
diff --git a/src/Base64.cs b/src/Base64.cs
index b67e504..ec7b6c0 100644
--- a/src/Base64.cs
+++ b/src/Base64.cs
@@ -36,7 +36,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<byte> source,
             //if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
             //{
             //}
-            if (Avx2.IsSupported)
+            if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
             {
                 return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
             }
@@ -61,7 +61,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<char> source,
             //{
             //    return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
             //}
-            if (Avx2.IsSupported)
+            if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
             {
                 return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
             }
diff --git a/src/Base64ARM.cs b/src/Base64ARM.cs
index 71fecb0..dad3362 100644
--- a/src/Base64ARM.cs
+++ b/src/Base64ARM.cs
@@ -219,15 +219,12 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
         {
-
             // if mask is a power of 2, we can use a simpler version
             if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
             {
                 int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
                 int pos = pos64 & 0xf;
                 Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-
                 Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
                 switch (pos64 >> 4)
                 {
@@ -240,7 +237,6 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                             Vector128.Store(compressed, output + 0 * 16);
                             Vector128.Store(b.chunk1, output + 1 * 16 - 1);
                             Vector128.Store(b.chunk2, output + 2 * 16 - 1);
                             Vector128.Store(b.chunk3, output + 3 * 16 - 1);
-
                         }
                         break;
@@ -253,7 +249,6 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                             Vector128.Store(b.chunk0, output + 0 * 16);
                             Vector128.Store(compressed, output + 1 * 16);
                             Vector128.Store(b.chunk2, output + 2 * 16 - 1);
                             Vector128.Store(b.chunk3, output + 3 * 16 - 1);
-
                         }
                         break;
@@ -266,7 +261,6 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                             Vector128.Store(b.chunk0, output + 0 * 16);
                             Vector128.Store(b.chunk1, output + 1 * 16);
                             Vector128.Store(compressed, output + 2 * 16);
                             Vector128.Store(b.chunk3, output + 3 * 16 - 1);
-
                         }
                         break;
@@ -274,7 +268,7 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                         {
                             Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
                             Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
-                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk3, sh);
                             Vector128.Store(b.chunk0, output + 0 * 16);
                             Vector128.Store(b.chunk1, output + 1 * 16);
                             Vector128.Store(b.chunk2, output + 2 * 16);
@@ -282,10 +276,7 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                         }
                         break;
                 }
-
-
                 return 63;
-
             }
             ulong nmask = ~mask;
             Compress(b.chunk0, (ushort)mask, output, tablePtr);
diff --git a/src/Base64AVX2UTF8.cs b/src/Base64AVX2UTF8.cs
index 4b4901b..e4a7f0b 100644
--- a/src/Base64AVX2UTF8.cs
+++ b/src/Base64AVX2UTF8.cs
@@ -167,6 +167,69 @@ private static UInt64 ToBase64Mask(bool base64Url, ref Vector256<byte> src, ref
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
         {
+            // if mask is a power of 2, we can use a simpler version
+            if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+            {
+                ulong pos64 = Bmi1.X64.TrailingZeroCount(mask);
+                ulong pos = pos64 & 0xf;
+                Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+                Vector128<byte> v0 = Vector128.Create((byte)(pos - 1));
+                switch (pos64 >> 4)
+                {
+                    case 0:
+                        {
+                            Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
+                            Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
+                            Vector128.Store(compressed, output + 0 * 16);
+                            Vector128.Store(chunk1, output + 1 * 16 - 1);
+                            Vector256.Store(b.chunk1, output + 2 * 16 - 1);
+                        }
+                        break;
+
+                    case 1:
+                        {
+                            Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
+                            Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
+                            Vector128.Store(chunk0, output + 0 * 16);
+                            Vector128.Store(compressed, output + 1 * 16);
+                            Vector256.Store(b.chunk1, output + 2 * 16 - 1);
+                        }
+                        break;
+
+                    case 2:
+                        {
+                            Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
+                            Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk1, 1);
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
+                            Vector256.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(compressed, output + 2 * 16);
+                            Vector128.Store(chunk1, output + 3 * 16 - 1);
+                        }
+                        break;
+
+                    case 3:
+                        {
+                            Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
+                            Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk1, 1);
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
+                            Vector256.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(chunk0, output + 2 * 16);
+                            Vector128.Store(compressed, output + 3 * 16);
+                        }
+                        break;
+                }
+                return 63;
+            }
             ulong nmask = ~mask;
             Compress(b.chunk0, (UInt32)mask, output, tablePtr);
             Compress(b.chunk1, (UInt32)(mask >> 32), output + Popcnt.X64.PopCount(nmask & 0xFFFFFFFF), tablePtr);
diff --git a/src/Base64SSEUTF8.cs b/src/Base64SSEUTF8.cs
index 1214924..a666d4a 100644
--- a/src/Base64SSEUTF8.cs
+++ b/src/Base64SSEUTF8.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Numerics;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 using System.Runtime.CompilerServices;
@@ -131,6 +132,68 @@ private static ushort ToBase64Mask(bool base64Url, ref Vector128<byte> src, ref
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
         {
+            // if mask is a power of 2, we can use a simpler version
+            if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+            {
+                int pos64 = BitOperations.TrailingZeroCount(mask);
+                int pos = pos64 & 0xf;
+                Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+                Vector128<byte> v0 = Vector128.Create((byte)(pos - 1));
+                switch (pos64 >> 4)
+                {
+                    case 0:
+                        {
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(b.chunk0, sh);
+                            Vector128.Store(compressed, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16 - 1);
+                            Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 1:
+                        {
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(b.chunk1, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(compressed, output + 1 * 16);
+                            Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 2:
+                        {
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(b.chunk2, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16);
+                            Vector128.Store(compressed, output + 2 * 16);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 3:
+                        {
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(b.chunk3, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16);
+                            Vector128.Store(b.chunk2, output + 2 * 16);
+                            Vector128.Store(compressed, output + 3 * 16);
+                        }
+                        break;
+                }
+                return 63;
+            }
             ulong nmask = ~mask;
             Compress(b.chunk0, (ushort)mask, output, tablePtr);
             Compress(b.chunk1, (ushort)(mask >> 16), output + Popcnt.X64.PopCount(nmask & 0xFFFF), tablePtr);
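
A note on the shuffle shared by all three fast paths in this patch: `pos` is the offset of the ignorable byte within its 16-byte chunk, `v0` broadcasts `pos - 1`, and the signed compare `v1 > v0` yields 0xFF (that is, -1) in every lane whose index is at least `pos`. Subtracting that mask from the identity permutation `v1` bumps those indices by one, so `Ssse3.Shuffle` (or `VectorTableLookup` on ARM) drops the byte at `pos` and pulls the rest left. A plain-array sketch of the index computation (the helper is illustrative, not part of the patch):

    // Shuffle indices that delete the byte at pos (0..15) within a chunk:
    // identity below pos, index + 1 from pos onward.
    static byte[] ShuffleIndices(int pos)
    {
        var sh = new byte[16];
        for (int i = 0; i < 16; i++)
        {
            bool bump = (sbyte)i > (sbyte)(pos - 1); // lanewise v1 > v0
            sh[i] = (byte)(bump ? i + 1 : i);        // subtracting -1 adds 1
        }
        return sh; // pos = 5 -> 0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16
    }

The last index can reach 16, one slot past the chunk; whatever byte that lane produces is either overwritten by the following store (placed one byte earlier) or lies beyond the 63 bytes the fast path reports, so it never reaches the decoded output.
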
From 3d73d8813b670e7d91eca1c982e1617250256171 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Fri, 14 Mar 2025 11:35:24 -0400
Subject: [PATCH 4/4] doc update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a1bec7d..b587f8e 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ fully reproducible.
 | Apple M2 processor (ARM, 3.5 Ghz) | 10 | 3.8 | 2.6 x |
 | AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
 | Intel Ice Lake (2.0 GHz) | 7.6 | 3.4 | 2.2 x |
-| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
+| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.9 | 3.0 | 2.3 x |
 
 ## Results (SimdBase64 vs. string .NET functions)
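
None of the four patches changes the public API. A minimal usage sketch, assuming the `Span<byte>` destination and the parameter order visible in the src/Base64.cs hunks above (the `Demo` wrapper is illustrative):

    using System;
    using System.Buffers;
    using System.Text;

    class Demo
    {
        static void Main()
        {
            // String helper from PATCH 1/4: sizes a buffer and decodes.
            byte[] decoded = SimdBase64.Base64.FromBase64String("aGVsbG8gd29ybGQ=");
            Console.WriteLine(Encoding.UTF8.GetString(decoded)); // hello world

            // Span-based path: bound the output with MaximalBinaryLengthFromBase64,
            // then decode; bytesWritten reports the actual decoded length.
            ReadOnlySpan<byte> source = Encoding.ASCII.GetBytes("aGVsbG8gd29ybGQ=");
            Span<byte> dest = new byte[SimdBase64.Base64.MaximalBinaryLengthFromBase64(source)];
            OperationStatus status = SimdBase64.Base64.DecodeFromBase64(
                source, dest, out int bytesConsumed, out int bytesWritten, false /* isUrl */);
            Console.WriteLine($"{status}: consumed {bytesConsumed} bytes, wrote {bytesWritten} bytes");
        }
    }

On inputs where ignorable characters are rare, the single-bit fast path covers almost every block that needs compression at all, which is what the updated README numbers reflect.
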