From 4f75a4cdae4050eae6435d1018bc3fcd537733f6 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Thu, 13 Mar 2025 18:42:29 -0400
Subject: [PATCH 1/4] improving the performance in the case where ignorable
 characters are uncommon

---
 README.md              |  2 +-
 benchmark/Benchmark.cs |  6 +---
 src/Base64.cs          |  3 +-
 src/Base64ARM.cs       | 68 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index ca44b93..990c794 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ fully reproducible.
 
 | processor and base freq. | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
 |:----------------|:------------------------|:-------------------|:-------------------|
-| Apple M2 processor (ARM, 3.5 Ghz) | 6.5 | 3.8 | 1.7 x |
+| Apple M2 processor (ARM, 3.5 Ghz) | 10 | 3.8 | 2.6 x |
 | AWS Graviton 3 (ARM, 2.6 GHz) | 3.6 | 2.0 | 1.8 x |
 | Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
 | AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |

diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs
index c74d044..873412e 100644
--- a/benchmark/Benchmark.cs
+++ b/benchmark/Benchmark.cs
@@ -2,13 +2,9 @@
 using BenchmarkDotNet.Running;
 using BenchmarkDotNet.Configs;
 using BenchmarkDotNet.Reports;
-using BenchmarkDotNet.Filters;
 using BenchmarkDotNet.Jobs;
 using System.Text;
-using System.Runtime.InteropServices;
 using BenchmarkDotNet.Columns;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
 
 namespace SimdUnicodeBenchmarks
 {
@@ -464,7 +460,7 @@ public unsafe void RunOurDecodingBenchmarkWithAllocUTF16(string[] data, int[] le
 
                 if (dataoutput.Length != lengths[i])
                 {
-                    Console.WriteLine($"Error: {dataoutput.Length } != {lengths[i]}");
+                    Console.WriteLine($"Error: {dataoutput.Length} != {lengths[i]}");
 #pragma warning disable CA2201
                     throw new Exception("Error");
                 }
diff --git a/src/Base64.cs b/src/Base64.cs
index 045e764..b67e504 100644
--- a/src/Base64.cs
+++ b/src/Base64.cs
@@ -14,7 +14,8 @@ public static int MaximalBinaryLengthFromBase64(ReadOnlySpan<byte> input)
         {
             return Scalar.Base64.MaximalBinaryLengthFromBase64Scalar(input);
         }
-        public static byte[] FromBase64String(string s) {
+        public static byte[] FromBase64String(string s)
+        {
             ReadOnlySpan<char> base64 = s.AsSpan();
             byte[] newBytes = new byte[SimdBase64.Base64.MaximalBinaryLengthFromBase64(base64)];
             int bytesConsumed = 0;
diff --git a/src/Base64ARM.cs b/src/Base64ARM.cs
index 99ce7e9..71fecb0 100644
--- a/src/Base64ARM.cs
+++ b/src/Base64ARM.cs
@@ -219,6 +219,74 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
         {
+
+            // if mask is a power of 2, we can use a simpler version
+            if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+            {
+                int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
+                int pos = pos64 & 0xf;
+                Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+
+                Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
+                switch (pos64 >> 4)
+                {
+                    case 3:
+                        {
+                            Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk0, sh);
+                            Vector128.Store(compressed, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16 - 1);
+                            Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 2:
+                        {
+                            Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk1, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(compressed, output + 1 * 16);
+                            Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 1:
+                        {
+                            Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16);
+                            Vector128.Store(compressed, output + 2 * 16);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 0:
+                        {
+                            Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16);
+                            Vector128.Store(b.chunk2, output + 2 * 16);
+                            Vector128.Store(compressed, output + 3 * 16);
+                        }
+                        break;
+                }
+
+
+                return 63;
+
+            }
             ulong nmask = ~mask;
             Compress(b.chunk0, (ushort)mask, output, tablePtr);
             Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);
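
A note on the fast path above: `mask` carries one bit per input byte, set when that byte is to be ignored (whitespace and other characters the decoder skips). The test `(mask & (mask - 1)) == 0` is the usual single-bit check: subtracting 1 flips the lowest set bit and everything below it, so the AND is zero exactly when at most one bit is set. With a single ignorable byte, the 64-byte block shrinks by one byte, hence `return 63`. A scalar model of the same transformation, as a minimal sketch (the helper name and the fallback are illustrative, not part of the patch):

    // Drop the single byte flagged in `mask` from a 64-byte block,
    // closing the gap; returns the number of bytes kept.
    static int CompressBlockScalar(ReadOnlySpan<byte> block, ulong mask, Span<byte> output)
    {
        if (mask != 0 && (mask & (mask - 1)) == 0) // exactly one bit set
        {
            int pos = System.Numerics.BitOperations.TrailingZeroCount(mask);
            block.Slice(0, pos).CopyTo(output);             // bytes before the gap
            block.Slice(pos + 1).CopyTo(output.Slice(pos)); // bytes after it, shifted left by one
            return 63;
        }
        throw new NotImplementedException("general case: table-driven per-chunk compression");
    }

The SIMD version performs the same shift with a table lookup inside the one affected 16-byte chunk and stores every later chunk at an offset of -1. Because the position comes from `LeadingZeroCount` here, `case 3` handles the lowest chunk and the broadcast constant is `0xe - pos`; the x64 ports in PATCH 3/4 use a trailing-zero count, so their case order runs the other way.
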
From 4992306b34da371bb51116d10df72e29f8f32900 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Thu, 13 Mar 2025 19:46:26 -0400
Subject: [PATCH 2/4] updating numbers

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 990c794..9941103 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ fully reproducible.
 | processor and base freq. | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
 |:----------------|:------------------------|:-------------------|:-------------------|
 | Apple M2 processor (ARM, 3.5 Ghz) | 10 | 3.8 | 2.6 x |
-| AWS Graviton 3 (ARM, 2.6 GHz) | 3.6 | 2.0 | 1.8 x |
+| AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
 | Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
 | AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
 

From 29d6bacc853eec1ca2d64f1b8599623f9ca435e4 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Fri, 14 Mar 2025 11:23:15 -0400
Subject: [PATCH 3/4] porting to AVX2

---
 README.md             |  2 +-
 src/Base64.cs         |  4 +--
 src/Base64ARM.cs      | 11 +-------
 src/Base64AVX2UTF8.cs | 63 +++++++++++++++++++++++++++++++++++++++++++
 src/Base64SSEUTF8.cs  | 63 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 130 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 9941103..a1bec7d 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ fully reproducible.
 |:----------------|:------------------------|:-------------------|:-------------------|
 | Apple M2 processor (ARM, 3.5 Ghz) | 10 | 3.8 | 2.6 x |
 | AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
-| Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
+| Intel Ice Lake (2.0 GHz) | 7.6 | 3.4 | 2.2 x |
 | AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
 
 ## Results (SimdBase64 vs. string .NET functions)
diff --git a/src/Base64.cs b/src/Base64.cs
index b67e504..ec7b6c0 100644
--- a/src/Base64.cs
+++ b/src/Base64.cs
@@ -36,7 +36,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<byte> source,
             //if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
             //{
             //}
-            if (Avx2.IsSupported)
+            if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
             {
                 return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
             }
@@ -61,7 +61,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<char> source,
             //{
             //    return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
             //}
-            if (Avx2.IsSupported)
+            if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
             {
                 return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
             }
diff --git a/src/Base64ARM.cs b/src/Base64ARM.cs
index 71fecb0..dad3362 100644
--- a/src/Base64ARM.cs
+++ b/src/Base64ARM.cs
@@ -219,15 +219,12 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
         {
-
             // if mask is a power of 2, we can use a simpler version
             if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
             {
                 int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
                 int pos = pos64 & 0xf;
                 Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-
                 Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
                 switch (pos64 >> 4)
                 {
@@ -240,7 +237,6 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                             Vector128.Store(compressed, output + 0 * 16);
                             Vector128.Store(b.chunk1, output + 1 * 16 - 1);
                             Vector128.Store(b.chunk2, output + 2 * 16 - 1);
                             Vector128.Store(b.chunk3, output + 3 * 16 - 1);
-
                         }
                         break;
@@ -253,7 +249,6 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                             Vector128.Store(b.chunk0, output + 0 * 16);
                             Vector128.Store(compressed, output + 1 * 16);
                             Vector128.Store(b.chunk2, output + 2 * 16 - 1);
                             Vector128.Store(b.chunk3, output + 3 * 16 - 1);
-
                         }
                         break;
@@ -266,7 +261,6 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                             Vector128.Store(b.chunk0, output + 0 * 16);
                             Vector128.Store(b.chunk1, output + 1 * 16);
                             Vector128.Store(compressed, output + 2 * 16);
                             Vector128.Store(b.chunk3, output + 3 * 16 - 1);
-
                         }
                         break;
@@ -274,7 +268,7 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                         {
                             Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
                             Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
-                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
+                            Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk3, sh);
                             Vector128.Store(b.chunk0, output + 0 * 16);
                             Vector128.Store(b.chunk1, output + 1 * 16);
                             Vector128.Store(b.chunk2, output + 2 * 16);
@@ -282,10 +276,7 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
                         }
                         break;
                 }
-
-
                 return 63;
-
             }
             ulong nmask = ~mask;
             Compress(b.chunk0, (ushort)mask, output, tablePtr);
diff --git a/src/Base64AVX2UTF8.cs b/src/Base64AVX2UTF8.cs
index 4b4901b..e4a7f0b 100644
--- a/src/Base64AVX2UTF8.cs
+++ b/src/Base64AVX2UTF8.cs
@@ -167,6 +167,69 @@ private static UInt64 ToBase64Mask(bool base64Url, ref Vector256<byte> src, ref
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
         {
+            // if mask is a power of 2, we can use a simpler version
+            if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+            {
+                ulong pos64 = Bmi1.X64.TrailingZeroCount(mask);
+                ulong pos = pos64 & 0xf;
+                Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+                Vector128<byte> v0 = Vector128.Create((byte)(pos - 1));
+                switch (pos64 >> 4)
+                {
+                    case 0:
+                        {
+                            Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
+                            Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
+                            Vector128.Store(compressed, output + 0 * 16);
+                            Vector128.Store(chunk1, output + 1 * 16 - 1);
+                            Vector256.Store(b.chunk1, output + 2 * 16 - 1);
+                        }
+                        break;
+
+                    case 1:
+                        {
+                            Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
+                            Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
+                            Vector128.Store(chunk0, output + 0 * 16);
+                            Vector128.Store(compressed, output + 1 * 16);
+                            Vector256.Store(b.chunk1, output + 2 * 16 - 1);
+                        }
+                        break;
+
+                    case 2:
+                        {
+                            Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
+                            Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk1, 1);
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
+                            Vector256.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(compressed, output + 2 * 16);
+                            Vector128.Store(chunk1, output + 3 * 16 - 1);
+                        }
+                        break;
+
+                    case 3:
+                        {
+                            Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
+                            Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk1, 1);
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
+                            Vector256.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(chunk0, output + 2 * 16);
+                            Vector128.Store(compressed, output + 3 * 16);
+                        }
+                        break;
+                }
+                return 63;
+            }
             ulong nmask = ~mask;
             Compress(b.chunk0, (UInt32)mask, output, tablePtr);
             Compress(b.chunk1, (UInt32)(mask >> 32), output + Popcnt.X64.PopCount(nmask & 0xFFFFFFFF), tablePtr);
diff --git a/src/Base64SSEUTF8.cs b/src/Base64SSEUTF8.cs
index 1214924..a666d4a 100644
--- a/src/Base64SSEUTF8.cs
+++ b/src/Base64SSEUTF8.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Numerics;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 using System.Runtime.CompilerServices;
@@ -131,6 +132,68 @@ private static ushort ToBase64Mask(bool base64Url, ref Vector128<byte> src, ref
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
         {
+            // if mask is a power of 2, we can use a simpler version
+            if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
+            {
+                int pos64 = BitOperations.TrailingZeroCount(mask);
+                int pos = pos64 & 0xf;
+                Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+                Vector128<byte> v0 = Vector128.Create((byte)(pos - 1));
+                switch (pos64 >> 4)
+                {
+                    case 0:
+                        {
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(b.chunk0, sh);
+                            Vector128.Store(compressed, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16 - 1);
+                            Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 1:
+                        {
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(b.chunk1, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(compressed, output + 1 * 16);
+                            Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 2:
+                        {
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(b.chunk2, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16);
+                            Vector128.Store(compressed, output + 2 * 16);
+                            Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+
+                        }
+                        break;
+
+                    case 3:
+                        {
+                            Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                            Vector128<byte> sh = Sse2.Subtract(v1, v2);
+                            Vector128<byte> compressed = Ssse3.Shuffle(b.chunk3, sh);
+                            Vector128.Store(b.chunk0, output + 0 * 16);
+                            Vector128.Store(b.chunk1, output + 1 * 16);
+                            Vector128.Store(b.chunk2, output + 2 * 16);
+                            Vector128.Store(compressed, output + 3 * 16);
+                        }
+                        break;
+                }
+                return 63;
+            }
             ulong nmask = ~mask;
             Compress(b.chunk0, (ushort)mask, output, tablePtr);
             Compress(b.chunk1, (ushort)(mask >> 16), output + Popcnt.X64.PopCount(nmask & 0xFFFF), tablePtr);
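
A note on the shuffle shared by all three fast paths in this patch: `pos` is the offset of the ignorable byte within its 16-byte chunk, `v0` broadcasts `pos - 1`, and the signed compare `v1 > v0` yields 0xFF (that is, -1) in every lane whose index is at least `pos`. Subtracting that mask from the identity permutation `v1` bumps those indices by one, so `Ssse3.Shuffle` (or `VectorTableLookup` on ARM) drops the byte at `pos` and pulls the rest left. A plain-array sketch of the index computation (the helper is illustrative, not part of the patch):

    // Shuffle indices that delete the byte at pos (0..15) within a chunk:
    // identity below pos, index + 1 from pos onward.
    static byte[] ShuffleIndices(int pos)
    {
        var sh = new byte[16];
        for (int i = 0; i < 16; i++)
        {
            bool bump = (sbyte)i > (sbyte)(pos - 1); // lanewise v1 > v0
            sh[i] = (byte)(bump ? i + 1 : i);        // subtracting -1 adds 1
        }
        return sh; // pos = 5 -> 0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16
    }

The last index can reach 16, one slot past the chunk; whatever byte that lane produces is either overwritten by the following store (placed one byte earlier) or lies beyond the 63 bytes the fast path reports, so it never reaches the decoded output.
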
From 3d73d8813b670e7d91eca1c982e1617250256171 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Fri, 14 Mar 2025 11:35:24 -0400
Subject: [PATCH 4/4] doc update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a1bec7d..b587f8e 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ fully reproducible.
 | Apple M2 processor (ARM, 3.5 Ghz) | 10 | 3.8 | 2.6 x |
 | AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
 | Intel Ice Lake (2.0 GHz) | 7.6 | 3.4 | 2.2 x |
-| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
+| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.9 | 3.0 | 2.3 x |
 
 ## Results (SimdBase64 vs. string .NET functions)
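
None of the four patches changes the public API. A minimal usage sketch, assuming the `Span<byte>` destination and the parameter order visible in the src/Base64.cs hunks above (the `Demo` wrapper is illustrative):

    using System;
    using System.Buffers;
    using System.Text;

    class Demo
    {
        static void Main()
        {
            // String helper from PATCH 1/4: sizes a buffer and decodes.
            byte[] decoded = SimdBase64.Base64.FromBase64String("aGVsbG8gd29ybGQ=");
            Console.WriteLine(Encoding.UTF8.GetString(decoded)); // hello world

            // Span-based path: bound the output with MaximalBinaryLengthFromBase64,
            // then decode; bytesWritten reports the actual decoded length.
            ReadOnlySpan<byte> source = Encoding.ASCII.GetBytes("aGVsbG8gd29ybGQ=");
            Span<byte> dest = new byte[SimdBase64.Base64.MaximalBinaryLengthFromBase64(source)];
            OperationStatus status = SimdBase64.Base64.DecodeFromBase64(
                source, dest, out int bytesConsumed, out int bytesWritten, false /* isUrl */);
            Console.WriteLine($"{status}: consumed {bytesConsumed} bytes, wrote {bytesWritten} bytes");
        }
    }

On inputs where ignorable characters are rare, the single-bit fast path covers almost every block that needs compression at all, which is what the updated README numbers reflect.
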