fix edge cases: lone surrogates and invalid UTF-8 bytes in the WASM UTF-8 codec

gfx · gfx · commit e48105c23150 · 2025-12-28T22:59:26.000+09:00
diff --git a/src/utils/utf8-wasm-binary.ts b/src/utils/utf8-wasm-binary.ts
@@ -6,18 +6,22 @@ AGFzbQEAAAABNQhedwFgAW8Bf2ACb38Bf2ADb2QAfwF/YANkAH9/AWRvYAJ/ZAABf2ABfwFkAGADZA
 B/fwFvAnsEDndhc206anMtc3RyaW5nBmxlbmd0aAABDndhc206anMtc3RyaW5nCmNoYXJDb2RlQXQA
 Ag53YXNtOmpzLXN0cmluZxFpbnRvQ2hhckNvZGVBcnJheQADDndhc206anMtc3RyaW5nEWZyb21DaG
 FyQ29kZUFycmF5AAQDBgUBAgUGBwUDAQABB1QGBm1lbW9yeQIACXV0ZjhDb3VudAAECnV0ZjhFbmNv
-ZGUABRF1dGY4RGVjb2RlVG9BcnJheQAGCmFsbG9jQXJyYXkABw1hcnJheVRvU3RyaW5nAAgK6AUFaw
-EEfyAAEAAhBANAIAEgBE9FBEAgACABEAEiA0GAAUkEfyACQQFqBSADQYAQSQR/IAJBAmoFIANB/7cD
-TSADQYCwA09xBH8gAUEBaiEBIAJBBGoFIAJBA2oLCwshAiABQQFqIQEMAQsLIAILswICBH8BZAAgAS
-ECIAAgABAAIgX7BwAiBkEAEAIaA0AgBCAFT0UEQCAGIAT7DQAiA0GAAUkEfyACIAM6AAAgAkEBagUg
-A0GAEEkEfyACIANBBnZBwAFyOgAAIAJBAWogA0E/cUGAAXI6AAAgAkECagUgA0H/twNNIANBgLADT3
-EEfyACIANBCnQgBiAEQQFqIgT7DQBqQYC4/xprIgNBEnZB8AFyOgAAIAJBAWogA0EMdkE/cUGAAXI6
-AAAgAkECaiADQQZ2QT9xQYABcjoAACACQQNqIANBP3FBgAFyOgAAIAJBBGoFIAIgA0EMdkHgAXI6AA
-AgAkEBaiADQQZ2QT9xQYABcjoAACACQQJqIANBP3FBgAFyOgAAIAJBA2oLCwshAiAEQQFqIQQMAQsL
-IAIgAWsLsQIBA38DQCAAIANNRQRAIAMtAAAiBEGAAXEEfyAEQeABcUHAAUYEfyABIAIgA0EBai0AAE
-E/cSAEQR9xQQZ0cvsOACACQQFqIQIgA0ECagUgBEHwAXFB4AFGBH8gASACIANBAmotAABBP3EgBEEP
-cUEMdCADQQFqLQAAQT9xQQZ0cnL7DgAgAkEBaiECIANBA2oFIARB+AFxQfABRgR/IAEgAiADQQNqLQ
-AAQT9xIARBB3FBEnQgA0EBai0AAEE/cUEMdHIgA0ECai0AAEE/cUEGdHJyQYCABGsiBEEKdkGAsANy
-+w4AIAEgAkEBaiICIARB/wdxQYC4A3L7DgAgAkEBaiECIANBBGoFIANBAWoLCwsFIAEgAiAE+w4AIA
-JBAWohAiADQQFqCyEDDAELCyACCwcAIAD7BwALCgAgACABIAIQAws=
+ZGUABRF1dGY4RGVjb2RlVG9BcnJheQAGCmFsbG9jQXJyYXkABw1hcnJheVRvU3RyaW5nAAgKsQcFlA
+EBBH8gABAAIQQDQCADIARPRQRAIAAgAxABIgJBgAFJBH8gAUEBagUgAkGAEEkEfyABQQJqBSACQf+3
+A00gAkGAsANPcQR/IANBAWoiAiAESQR/IAAgAhABQYD4A3FBgLgDRgR/IAIhAyABQQRqBSABQQNqCw
+UgAUEDagsFIAFBA2oLCwshASADQQFqIQMMAQsLIAELwgMCBn8BZAAgASECIAAgABAAIgX7BwAiCEEA
+EAIaA0AgBCAFT0UEQCAIIAT7DQAiA0GAAUkEfyACIAM6AAAgAkEBagUgA0GAEEkEfyACIANBBnZBwA
+FyOgAAIAJBAWogA0E/cUGAAXI6AAAgAkECagUgA0H/twNNIANBgLADT3EEfyAEQQFqIgYgBUkEfyAI
+IAb7DQAiB0GA+ANxQYC4A0YEfyAGIQQgAiADQQp0IAdqQYC4/xprIgNBEnZB8AFyOgAAIAJBAWogA0
+EMdkE/cUGAAXI6AAAgAkECaiADQQZ2QT9xQYABcjoAACACQQNqIANBP3FBgAFyOgAAIAJBBGoFIAIg
+A0EMdkHgAXI6AAAgAkEBaiADQQZ2QT9xQYABcjoAACACQQJqIANBP3FBgAFyOgAAIAJBA2oLBSACIA
+NBDHZB4AFyOgAAIAJBAWogA0EGdkE/cUGAAXI6AAAgAkECaiADQT9xQYABcjoAACACQQNqCwUgAiAD
+QQx2QeABcjoAACACQQFqIANBBnZBP3FBgAFyOgAAIAJBAmogA0E/cUGAAXI6AAAgAkEDagsLCyECIA
+RBAWohBAwBCwsgAiABawvBAgEDfwNAIAAgA01FBEAgAy0AACIEQYABcQR/IARB4AFxQcABRgR/IAEg
+AiADQQFqLQAAQT9xIARBH3FBBnRy+w4AIAJBAWohAiADQQJqBSAEQfABcUHgAUYEfyABIAIgA0ECai
+0AAEE/cSAEQQ9xQQx0IANBAWotAABBP3FBBnRycvsOACACQQFqIQIgA0EDagUgBEH4AXFB8AFGBH8g
+ASACIANBA2otAABBP3EgBEEHcUESdCADQQFqLQAAQT9xQQx0ciADQQJqLQAAQT9xQQZ0cnJBgIAEay
+IEQQp2QYCwA3L7DgAgASACQQFqIgIgBEH/B3FBgLgDcvsOACACQQFqIQIgA0EEagUgASACIAT7DgAg
+AkEBaiECIANBAWoLCwsFIAEgAiAE+w4AIAJBAWohAiADQQFqCyEDDAELCyACCwcAIAD7BwALCgAgAC
+ABIAIQAws=
 `;
diff --git a/test/utf8-wasm.test.ts b/test/utf8-wasm.test.ts
@@ -1,5 +1,5 @@
 import assert from "assert";
-import { WASM_AVAILABLE, getWasmError, getWasmExports } from "../src/utils/utf8-wasm.ts";
+import { WASM_AVAILABLE, getWasmError, getWasmExports, utf8CountWasm, utf8EncodeWasm, utf8DecodeWasm } from "../src/utils/utf8-wasm.ts";
 import { utf8Count, utf8CountJs, utf8Encode, utf8EncodeJs, utf8Decode, utf8DecodeJs } from "../src/utils/utf8.ts";
 
 describe("utf8-wasm", () => {
@@ -129,4 +129,236 @@ describe("utf8-wasm", () => {
       });
     }
   });
+
+  // Edge case tests for invalid/malformed data
+  // These tests ensure JS and WASM implementations behave identically
+  describe("edge cases: lone surrogates", () => {
+    // Lone high surrogate (0xD800-0xDBFF without following low surrogate)
+    const loneHighSurrogate = "\uD800"; // U+D800
+    const loneHighSurrogateAtEnd = "abc\uD800";
+    const loneHighSurrogateFollowedByAscii = "\uD800X";
+    const loneHighSurrogateFollowedByHighSurrogate = "\uD800\uD800";
+
+    // Lone low surrogate (0xDC00-0xDFFF without preceding high surrogate)
+    const loneLowSurrogate = "\uDC00";
+    const loneLowSurrogateAtStart = "\uDC00abc";
+    const loneLowSurrogateBetweenAscii = "a\uDC00b";
+
+    // Mixed valid and invalid surrogates
+    const validSurrogatePair = "\uD83D\uDE00"; // 😀
+    const validThenLoneHigh = "\uD83D\uDE00\uD800";
+    const loneLowThenValid = "\uDC00\uD83D\uDE00";
+
+    const surrogateTestCases = [
+      { str: loneHighSurrogate, description: "lone high surrogate" },
+      { str: loneHighSurrogateAtEnd, description: "lone high surrogate at end" },
+      { str: loneHighSurrogateFollowedByAscii, description: "lone high surrogate followed by ASCII" },
+      { str: loneHighSurrogateFollowedByHighSurrogate, description: "two lone high surrogates" },
+      { str: loneLowSurrogate, description: "lone low surrogate" },
+      { str: loneLowSurrogateAtStart, description: "lone low surrogate at start" },
+      { str: loneLowSurrogateBetweenAscii, description: "lone low surrogate between ASCII" },
+      { str: validSurrogatePair, description: "valid surrogate pair (emoji)" },
+      { str: validThenLoneHigh, description: "valid pair then lone high" },
+      { str: loneLowThenValid, description: "lone low then valid pair" },
+    ];
+
+    describe("utf8Count", () => {
+      for (const { str, description } of surrogateTestCases) {
+        it(`counts ${description} consistently`, () => {
+          const jsResult = utf8CountJs(str);
+
+          // JS implementation is the reference; this is only a sanity check (exact byte counts are asserted in the tests below)
+          assert.ok(jsResult > 0, `JS count should be positive for "${description}"`);
+
+          if (WASM_AVAILABLE) {
+            const wasmResult = utf8CountWasm(str);
+            assert.strictEqual(wasmResult, jsResult, `WASM count should match JS for "${description}"`);
+          }
+        });
+      }
+
+      it("lone high surrogate counts as 3 bytes", () => {
+        // A lone high surrogate (0xD800-0xDBFF) should be encoded as 3 bytes
+        // because it's in the 0x800-0xFFFF range
+        assert.strictEqual(utf8CountJs("\uD800"), 3);
+        if (WASM_AVAILABLE) {
+          assert.strictEqual(utf8CountWasm("\uD800"), 3);
+        }
+      });
+
+      it("lone low surrogate counts as 3 bytes", () => {
+        assert.strictEqual(utf8CountJs("\uDC00"), 3);
+        if (WASM_AVAILABLE) {
+          assert.strictEqual(utf8CountWasm("\uDC00"), 3);
+        }
+      });
+
+      it("valid surrogate pair counts as 4 bytes", () => {
+        assert.strictEqual(utf8CountJs("\uD83D\uDE00"), 4); // 😀
+        if (WASM_AVAILABLE) {
+          assert.strictEqual(utf8CountWasm("\uD83D\uDE00"), 4);
+        }
+      });
+    });
+
+    describe("utf8Encode", () => {
+      for (const { str, description } of surrogateTestCases) {
+        it(`encodes ${description} consistently`, () => {
+          const byteLength = utf8CountJs(str);
+          const jsBuffer = new Uint8Array(byteLength);
+          utf8EncodeJs(str, jsBuffer, 0);
+
+          if (WASM_AVAILABLE) {
+            const wasmBuffer = new Uint8Array(byteLength);
+            utf8EncodeWasm(str, wasmBuffer, 0);
+            assert.deepStrictEqual(wasmBuffer, jsBuffer, `WASM encode should match JS for "${description}"`);
+          }
+        });
+      }
+    });
+
+    describe("round-trip with lone surrogates", () => {
+      for (const { str, description } of surrogateTestCases) {
+        it(`round-trips ${description}`, () => {
+          const byteLength = utf8CountJs(str);
+          const buffer = new Uint8Array(byteLength);
+          utf8EncodeJs(str, buffer, 0);
+          const decoded = utf8DecodeJs(buffer, 0, byteLength);
+
+          assert.strictEqual(decoded, str, `JS round-trip failed for "${description}"`);
+
+          if (WASM_AVAILABLE) {
+            const wasmBuffer = new Uint8Array(byteLength);
+            utf8EncodeWasm(str, wasmBuffer, 0);
+            const wasmDecoded = utf8DecodeWasm(wasmBuffer, 0, byteLength);
+            assert.strictEqual(wasmDecoded, str, `WASM round-trip failed for "${description}"`);
+          }
+        });
+      }
+    });
+  });
+
+  describe("edge cases: invalid UTF-8 bytes in decode", () => {
+    // Invalid or malformed UTF-8: stray continuation bytes, invalid lead bytes, and overlong encodings
+    const invalidByteSequences = [
+      {
+        bytes: new Uint8Array([0x80]), // Continuation byte without leading byte
+        description: "lone continuation byte 0x80",
+      },
+      {
+        bytes: new Uint8Array([0xBF]), // Continuation byte without leading byte
+        description: "lone continuation byte 0xBF",
+      },
+      {
+        bytes: new Uint8Array([0xFE]), // Invalid byte (never valid in UTF-8)
+        description: "invalid byte 0xFE",
+      },
+      {
+        bytes: new Uint8Array([0xFF]), // Invalid byte (never valid in UTF-8)
+        description: "invalid byte 0xFF",
+      },
+      {
+        bytes: new Uint8Array([0xF8, 0x80, 0x80, 0x80, 0x80]), // 5-byte sequence (invalid)
+        description: "5-byte sequence (invalid)",
+      },
+      {
+        bytes: new Uint8Array([0x41, 0x80, 0x42]), // ASCII, invalid, ASCII
+        description: "invalid byte between ASCII",
+      },
+      {
+        bytes: new Uint8Array([0xC0, 0x80]), // Overlong encoding of NUL
+        description: "overlong encoding of NUL",
+      },
+      {
+        bytes: new Uint8Array([0xE0, 0x80, 0x80]), // Overlong 3-byte encoding of NUL
+        description: "overlong 3-byte encoding",
+      },
+    ];
+
+    describe("utf8Decode preserves invalid bytes", () => {
+      for (const { bytes, description } of invalidByteSequences) {
+        it(`preserves ${description}`, () => {
+          const jsResult = utf8DecodeJs(bytes, 0, bytes.length);
+
+          // The JS implementation should preserve invalid bytes as code units
+          // So the result length should be > 0
+          assert.ok(jsResult.length > 0, `JS decode should produce output for "${description}"`);
+
+          if (WASM_AVAILABLE) {
+            const wasmResult = utf8DecodeWasm(bytes, 0, bytes.length);
+            assert.strictEqual(
+              wasmResult,
+              jsResult,
+              `WASM decode should match JS for "${description}": got "${wasmResult}" vs "${jsResult}"`
+            );
+          }
+        });
+      }
+    });
+
+    describe("invalid bytes are not dropped", () => {
+      it("0x80 byte is preserved, not dropped", () => {
+        const bytes = new Uint8Array([0x80]);
+        const jsResult = utf8DecodeJs(bytes, 0, 1);
+        // Should be a single character with code point 0x80
+        assert.strictEqual(jsResult.length, 1);
+        assert.strictEqual(jsResult.charCodeAt(0), 0x80);
+
+        if (WASM_AVAILABLE) {
+          const wasmResult = utf8DecodeWasm(bytes, 0, 1);
+          assert.strictEqual(wasmResult.length, 1, "WASM should not drop the byte");
+          assert.strictEqual(wasmResult.charCodeAt(0), 0x80);
+        }
+      });
+
+      it("0xFF byte is preserved, not dropped", () => {
+        const bytes = new Uint8Array([0xFF]);
+        const jsResult = utf8DecodeJs(bytes, 0, 1);
+        assert.strictEqual(jsResult.length, 1);
+        assert.strictEqual(jsResult.charCodeAt(0), 0xFF);
+
+        if (WASM_AVAILABLE) {
+          const wasmResult = utf8DecodeWasm(bytes, 0, 1);
+          assert.strictEqual(wasmResult.length, 1, "WASM should not drop the byte");
+          assert.strictEqual(wasmResult.charCodeAt(0), 0xFF);
+        }
+      });
+
+      it("invalid bytes between valid UTF-8 are preserved", () => {
+        // "A" + invalid + "B"
+        const bytes = new Uint8Array([0x41, 0x80, 0x42]);
+        const jsResult = utf8DecodeJs(bytes, 0, 3);
+
+        // Should be 3 characters: 'A', char(0x80), 'B'
+        assert.strictEqual(jsResult.length, 3);
+        assert.strictEqual(jsResult.charCodeAt(0), 0x41); // 'A'
+        assert.strictEqual(jsResult.charCodeAt(1), 0x80); // invalid byte preserved
+        assert.strictEqual(jsResult.charCodeAt(2), 0x42); // 'B'
+
+        if (WASM_AVAILABLE) {
+          const wasmResult = utf8DecodeWasm(bytes, 0, 3);
+          assert.strictEqual(wasmResult.length, 3, "WASM should produce 3 chars");
+          assert.strictEqual(wasmResult, jsResult, "WASM should match JS");
+        }
+      });
+
+      it("multiple invalid bytes are all preserved", () => {
+        const bytes = new Uint8Array([0x80, 0x81, 0x82, 0xFE, 0xFF]);
+        const jsResult = utf8DecodeJs(bytes, 0, 5);
+
+        assert.strictEqual(jsResult.length, 5, "All 5 invalid bytes should produce 5 chars");
+        assert.strictEqual(jsResult.charCodeAt(0), 0x80);
+        assert.strictEqual(jsResult.charCodeAt(1), 0x81);
+        assert.strictEqual(jsResult.charCodeAt(2), 0x82);
+        assert.strictEqual(jsResult.charCodeAt(3), 0xFE);
+        assert.strictEqual(jsResult.charCodeAt(4), 0xFF);
+
+        if (WASM_AVAILABLE) {
+          const wasmResult = utf8DecodeWasm(bytes, 0, 5);
+          assert.strictEqual(wasmResult.length, 5, "WASM should produce 5 chars");
+          assert.strictEqual(wasmResult, jsResult, "WASM should match JS");
+        }
+      });
+    });
+  });
 });
diff --git a/wasm/utf8.wat b/wasm/utf8.wat