Skip to content

Commit e48105c

Browse files
committed
fix edge cases
1 parent 2e0c0fa commit e48105c

File tree

3 files changed

+322
-45
lines changed

3 files changed

+322
-45
lines changed

src/utils/utf8-wasm-binary.ts

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,22 @@ AGFzbQEAAAABNQhedwFgAW8Bf2ACb38Bf2ADb2QAfwF/YANkAH9/AWRvYAJ/ZAABf2ABfwFkAGADZA
66
B/fwFvAnsEDndhc206anMtc3RyaW5nBmxlbmd0aAABDndhc206anMtc3RyaW5nCmNoYXJDb2RlQXQA
77
Ag53YXNtOmpzLXN0cmluZxFpbnRvQ2hhckNvZGVBcnJheQADDndhc206anMtc3RyaW5nEWZyb21DaG
88
FyQ29kZUFycmF5AAQDBgUBAgUGBwUDAQABB1QGBm1lbW9yeQIACXV0ZjhDb3VudAAECnV0ZjhFbmNv
9-
ZGUABRF1dGY4RGVjb2RlVG9BcnJheQAGCmFsbG9jQXJyYXkABw1hcnJheVRvU3RyaW5nAAgK6AUFaw
10-
EEfyAAEAAhBANAIAEgBE9FBEAgACABEAEiA0GAAUkEfyACQQFqBSADQYAQSQR/IAJBAmoFIANB/7cD
11-
TSADQYCwA09xBH8gAUEBaiEBIAJBBGoFIAJBA2oLCwshAiABQQFqIQEMAQsLIAILswICBH8BZAAgAS
12-
ECIAAgABAAIgX7BwAiBkEAEAIaA0AgBCAFT0UEQCAGIAT7DQAiA0GAAUkEfyACIAM6AAAgAkEBagUg
13-
A0GAEEkEfyACIANBBnZBwAFyOgAAIAJBAWogA0E/cUGAAXI6AAAgAkECagUgA0H/twNNIANBgLADT3
14-
EEfyACIANBCnQgBiAEQQFqIgT7DQBqQYC4/xprIgNBEnZB8AFyOgAAIAJBAWogA0EMdkE/cUGAAXI6
15-
AAAgAkECaiADQQZ2QT9xQYABcjoAACACQQNqIANBP3FBgAFyOgAAIAJBBGoFIAIgA0EMdkHgAXI6AA
16-
AgAkEBaiADQQZ2QT9xQYABcjoAACACQQJqIANBP3FBgAFyOgAAIAJBA2oLCwshAiAEQQFqIQQMAQsL
17-
IAIgAWsLsQIBA38DQCAAIANNRQRAIAMtAAAiBEGAAXEEfyAEQeABcUHAAUYEfyABIAIgA0EBai0AAE
18-
E/cSAEQR9xQQZ0cvsOACACQQFqIQIgA0ECagUgBEHwAXFB4AFGBH8gASACIANBAmotAABBP3EgBEEP
19-
cUEMdCADQQFqLQAAQT9xQQZ0cnL7DgAgAkEBaiECIANBA2oFIARB+AFxQfABRgR/IAEgAiADQQNqLQ
20-
AAQT9xIARBB3FBEnQgA0EBai0AAEE/cUEMdHIgA0ECai0AAEE/cUEGdHJyQYCABGsiBEEKdkGAsANy
21-
+w4AIAEgAkEBaiICIARB/wdxQYC4A3L7DgAgAkEBaiECIANBBGoFIANBAWoLCwsFIAEgAiAE+w4AIA
22-
JBAWohAiADQQFqCyEDDAELCyACCwcAIAD7BwALCgAgACABIAIQAws=
9+
ZGUABRF1dGY4RGVjb2RlVG9BcnJheQAGCmFsbG9jQXJyYXkABw1hcnJheVRvU3RyaW5nAAgKsQcFlA
10+
EBBH8gABAAIQQDQCADIARPRQRAIAAgAxABIgJBgAFJBH8gAUEBagUgAkGAEEkEfyABQQJqBSACQf+3
11+
A00gAkGAsANPcQR/IANBAWoiAiAESQR/IAAgAhABQYD4A3FBgLgDRgR/IAIhAyABQQRqBSABQQNqCw
12+
UgAUEDagsFIAFBA2oLCwshASADQQFqIQMMAQsLIAELwgMCBn8BZAAgASECIAAgABAAIgX7BwAiCEEA
13+
EAIaA0AgBCAFT0UEQCAIIAT7DQAiA0GAAUkEfyACIAM6AAAgAkEBagUgA0GAEEkEfyACIANBBnZBwA
14+
FyOgAAIAJBAWogA0E/cUGAAXI6AAAgAkECagUgA0H/twNNIANBgLADT3EEfyAEQQFqIgYgBUkEfyAI
15+
IAb7DQAiB0GA+ANxQYC4A0YEfyAGIQQgAiADQQp0IAdqQYC4/xprIgNBEnZB8AFyOgAAIAJBAWogA0
16+
EMdkE/cUGAAXI6AAAgAkECaiADQQZ2QT9xQYABcjoAACACQQNqIANBP3FBgAFyOgAAIAJBBGoFIAIg
17+
A0EMdkHgAXI6AAAgAkEBaiADQQZ2QT9xQYABcjoAACACQQJqIANBP3FBgAFyOgAAIAJBA2oLBSACIA
18+
NBDHZB4AFyOgAAIAJBAWogA0EGdkE/cUGAAXI6AAAgAkECaiADQT9xQYABcjoAACACQQNqCwUgAiAD
19+
QQx2QeABcjoAACACQQFqIANBBnZBP3FBgAFyOgAAIAJBAmogA0E/cUGAAXI6AAAgAkEDagsLCyECIA
20+
RBAWohBAwBCwsgAiABawvBAgEDfwNAIAAgA01FBEAgAy0AACIEQYABcQR/IARB4AFxQcABRgR/IAEg
21+
AiADQQFqLQAAQT9xIARBH3FBBnRy+w4AIAJBAWohAiADQQJqBSAEQfABcUHgAUYEfyABIAIgA0ECai
22+
0AAEE/cSAEQQ9xQQx0IANBAWotAABBP3FBBnRycvsOACACQQFqIQIgA0EDagUgBEH4AXFB8AFGBH8g
23+
ASACIANBA2otAABBP3EgBEEHcUESdCADQQFqLQAAQT9xQQx0ciADQQJqLQAAQT9xQQZ0cnJBgIAEay
24+
IEQQp2QYCwA3L7DgAgASACQQFqIgIgBEH/B3FBgLgDcvsOACACQQFqIQIgA0EEagUgASACIAT7DgAg
25+
AkEBaiECIANBAWoLCwsFIAEgAiAE+w4AIAJBAWohAiADQQFqCyEDDAELCyACCwcAIAD7BwALCgAgAC
26+
ABIAIQAws=
2327
`;

test/utf8-wasm.test.ts

Lines changed: 233 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import assert from "assert";
2-
import { WASM_AVAILABLE, getWasmError, getWasmExports } from "../src/utils/utf8-wasm.ts";
2+
import { WASM_AVAILABLE, getWasmError, getWasmExports, utf8CountWasm, utf8EncodeWasm, utf8DecodeWasm } from "../src/utils/utf8-wasm.ts";
33
import { utf8Count, utf8CountJs, utf8Encode, utf8EncodeJs, utf8Decode, utf8DecodeJs } from "../src/utils/utf8.ts";
44

55
describe("utf8-wasm", () => {
@@ -129,4 +129,236 @@ describe("utf8-wasm", () => {
129129
});
130130
}
131131
});
132+
133+
// Edge case tests for invalid/malformed data
134+
// These tests ensure JS and WASM implementations behave identically
135+
describe("edge cases: lone surrogates", () => {
136+
// Lone high surrogate (0xD800-0xDBFF without following low surrogate)
137+
const loneHighSurrogate = "\uD800"; // U+D800
138+
const loneHighSurrogateAtEnd = "abc\uD800";
139+
const loneHighSurrogateFollowedByAscii = "\uD800X";
140+
const loneHighSurrogateFollowedByHighSurrogate = "\uD800\uD800";
141+
142+
// Lone low surrogate (0xDC00-0xDFFF without preceding high surrogate)
143+
const loneLowSurrogate = "\uDC00";
144+
const loneLowSurrogateAtStart = "\uDC00abc";
145+
const loneLowSurrogateBetweenAscii = "a\uDC00b";
146+
147+
// Mixed valid and invalid surrogates
148+
const validSurrogatePair = "\uD83D\uDE00"; // 😀
149+
const validThenLoneHigh = "\uD83D\uDE00\uD800";
150+
const loneLowThenValid = "\uDC00\uD83D\uDE00";
151+
152+
const surrogateTestCases = [
153+
{ str: loneHighSurrogate, description: "lone high surrogate" },
154+
{ str: loneHighSurrogateAtEnd, description: "lone high surrogate at end" },
155+
{ str: loneHighSurrogateFollowedByAscii, description: "lone high surrogate followed by ASCII" },
156+
{ str: loneHighSurrogateFollowedByHighSurrogate, description: "two lone high surrogates" },
157+
{ str: loneLowSurrogate, description: "lone low surrogate" },
158+
{ str: loneLowSurrogateAtStart, description: "lone low surrogate at start" },
159+
{ str: loneLowSurrogateBetweenAscii, description: "lone low surrogate between ASCII" },
160+
{ str: validSurrogatePair, description: "valid surrogate pair (emoji)" },
161+
{ str: validThenLoneHigh, description: "valid pair then lone high" },
162+
{ str: loneLowThenValid, description: "lone low then valid pair" },
163+
];
164+
165+
describe("utf8Count", () => {
166+
for (const { str, description } of surrogateTestCases) {
167+
it(`counts ${description} consistently`, () => {
168+
const jsResult = utf8CountJs(str);
169+
170+
// JS implementation is the reference - lone surrogates should be 3 bytes each
171+
assert.ok(jsResult > 0, `JS count should be positive for "${description}"`);
172+
173+
if (WASM_AVAILABLE) {
174+
const wasmResult = utf8CountWasm(str);
175+
assert.strictEqual(wasmResult, jsResult, `WASM count should match JS for "${description}"`);
176+
}
177+
});
178+
}
179+
180+
it("lone high surrogate counts as 3 bytes", () => {
181+
// A lone high surrogate (0xD800-0xDBFF) should be encoded as 3 bytes
182+
// because it's in the 0x800-0xFFFF range
183+
assert.strictEqual(utf8CountJs("\uD800"), 3);
184+
if (WASM_AVAILABLE) {
185+
assert.strictEqual(utf8CountWasm("\uD800"), 3);
186+
}
187+
});
188+
189+
it("lone low surrogate counts as 3 bytes", () => {
190+
assert.strictEqual(utf8CountJs("\uDC00"), 3);
191+
if (WASM_AVAILABLE) {
192+
assert.strictEqual(utf8CountWasm("\uDC00"), 3);
193+
}
194+
});
195+
196+
it("valid surrogate pair counts as 4 bytes", () => {
197+
assert.strictEqual(utf8CountJs("\uD83D\uDE00"), 4); // 😀
198+
if (WASM_AVAILABLE) {
199+
assert.strictEqual(utf8CountWasm("\uD83D\uDE00"), 4);
200+
}
201+
});
202+
});
203+
204+
describe("utf8Encode", () => {
205+
for (const { str, description } of surrogateTestCases) {
206+
it(`encodes ${description} consistently`, () => {
207+
const byteLength = utf8CountJs(str);
208+
const jsBuffer = new Uint8Array(byteLength);
209+
utf8EncodeJs(str, jsBuffer, 0);
210+
211+
if (WASM_AVAILABLE) {
212+
const wasmBuffer = new Uint8Array(byteLength);
213+
utf8EncodeWasm(str, wasmBuffer, 0);
214+
assert.deepStrictEqual(wasmBuffer, jsBuffer, `WASM encode should match JS for "${description}"`);
215+
}
216+
});
217+
}
218+
});
219+
220+
describe("round-trip with lone surrogates", () => {
221+
for (const { str, description } of surrogateTestCases) {
222+
it(`round-trips ${description}`, () => {
223+
const byteLength = utf8CountJs(str);
224+
const buffer = new Uint8Array(byteLength);
225+
utf8EncodeJs(str, buffer, 0);
226+
const decoded = utf8DecodeJs(buffer, 0, byteLength);
227+
228+
assert.strictEqual(decoded, str, `JS round-trip failed for "${description}"`);
229+
230+
if (WASM_AVAILABLE) {
231+
const wasmBuffer = new Uint8Array(byteLength);
232+
utf8EncodeWasm(str, wasmBuffer, 0);
233+
const wasmDecoded = utf8DecodeWasm(wasmBuffer, 0, byteLength);
234+
assert.strictEqual(wasmDecoded, str, `WASM round-trip failed for "${description}"`);
235+
}
236+
});
237+
}
238+
});
239+
});
240+
241+
describe("edge cases: invalid UTF-8 bytes in decode", () => {
242+
// Malformed UTF-8 inputs: stray continuation bytes, bytes that are never valid in UTF-8, and overlong encodings
243+
const invalidByteSequences = [
244+
{
245+
bytes: new Uint8Array([0x80]), // Continuation byte without leading byte
246+
description: "lone continuation byte 0x80",
247+
},
248+
{
249+
bytes: new Uint8Array([0xBF]), // Continuation byte without leading byte
250+
description: "lone continuation byte 0xBF",
251+
},
252+
{
253+
bytes: new Uint8Array([0xFE]), // Invalid byte (never valid in UTF-8)
254+
description: "invalid byte 0xFE",
255+
},
256+
{
257+
bytes: new Uint8Array([0xFF]), // Invalid byte (never valid in UTF-8)
258+
description: "invalid byte 0xFF",
259+
},
260+
{
261+
bytes: new Uint8Array([0xF8, 0x80, 0x80, 0x80, 0x80]), // 5-byte sequence (invalid)
262+
description: "5-byte sequence (invalid)",
263+
},
264+
{
265+
bytes: new Uint8Array([0x41, 0x80, 0x42]), // ASCII, invalid, ASCII
266+
description: "invalid byte between ASCII",
267+
},
268+
{
269+
bytes: new Uint8Array([0xC0, 0x80]), // Overlong encoding of NUL
270+
description: "overlong encoding of NUL",
271+
},
272+
{
273+
bytes: new Uint8Array([0xE0, 0x80, 0x80]), // Overlong encoding
274+
description: "overlong 3-byte encoding",
275+
},
276+
];
277+
278+
describe("utf8Decode preserves invalid bytes", () => {
279+
for (const { bytes, description } of invalidByteSequences) {
280+
it(`preserves ${description}`, () => {
281+
const jsResult = utf8DecodeJs(bytes, 0, bytes.length);
282+
283+
// The JS implementation should preserve invalid bytes as code units
284+
// So the result length should be > 0
285+
assert.ok(jsResult.length > 0, `JS decode should produce output for "${description}"`);
286+
287+
if (WASM_AVAILABLE) {
288+
const wasmResult = utf8DecodeWasm(bytes, 0, bytes.length);
289+
assert.strictEqual(
290+
wasmResult,
291+
jsResult,
292+
`WASM decode should match JS for "${description}": got "${wasmResult}" vs "${jsResult}"`
293+
);
294+
}
295+
});
296+
}
297+
});
298+
299+
describe("invalid bytes are not dropped", () => {
300+
it("0x80 byte is preserved, not dropped", () => {
301+
const bytes = new Uint8Array([0x80]);
302+
const jsResult = utf8DecodeJs(bytes, 0, 1);
303+
// Should be a single character with code point 0x80
304+
assert.strictEqual(jsResult.length, 1);
305+
assert.strictEqual(jsResult.charCodeAt(0), 0x80);
306+
307+
if (WASM_AVAILABLE) {
308+
const wasmResult = utf8DecodeWasm(bytes, 0, 1);
309+
assert.strictEqual(wasmResult.length, 1, "WASM should not drop the byte");
310+
assert.strictEqual(wasmResult.charCodeAt(0), 0x80);
311+
}
312+
});
313+
314+
it("0xFF byte is preserved, not dropped", () => {
315+
const bytes = new Uint8Array([0xFF]);
316+
const jsResult = utf8DecodeJs(bytes, 0, 1);
317+
assert.strictEqual(jsResult.length, 1);
318+
assert.strictEqual(jsResult.charCodeAt(0), 0xFF);
319+
320+
if (WASM_AVAILABLE) {
321+
const wasmResult = utf8DecodeWasm(bytes, 0, 1);
322+
assert.strictEqual(wasmResult.length, 1, "WASM should not drop the byte");
323+
assert.strictEqual(wasmResult.charCodeAt(0), 0xFF);
324+
}
325+
});
326+
327+
it("invalid bytes between valid UTF-8 are preserved", () => {
328+
// "A" + invalid + "B"
329+
const bytes = new Uint8Array([0x41, 0x80, 0x42]);
330+
const jsResult = utf8DecodeJs(bytes, 0, 3);
331+
332+
// Should be 3 characters: 'A', char(0x80), 'B'
333+
assert.strictEqual(jsResult.length, 3);
334+
assert.strictEqual(jsResult.charCodeAt(0), 0x41); // 'A'
335+
assert.strictEqual(jsResult.charCodeAt(1), 0x80); // invalid byte preserved
336+
assert.strictEqual(jsResult.charCodeAt(2), 0x42); // 'B'
337+
338+
if (WASM_AVAILABLE) {
339+
const wasmResult = utf8DecodeWasm(bytes, 0, 3);
340+
assert.strictEqual(wasmResult.length, 3, "WASM should produce 3 chars");
341+
assert.strictEqual(wasmResult, jsResult, "WASM should match JS");
342+
}
343+
});
344+
345+
it("multiple invalid bytes are all preserved", () => {
346+
const bytes = new Uint8Array([0x80, 0x81, 0x82, 0xFE, 0xFF]);
347+
const jsResult = utf8DecodeJs(bytes, 0, 5);
348+
349+
assert.strictEqual(jsResult.length, 5, "All 5 invalid bytes should produce 5 chars");
350+
assert.strictEqual(jsResult.charCodeAt(0), 0x80);
351+
assert.strictEqual(jsResult.charCodeAt(1), 0x81);
352+
assert.strictEqual(jsResult.charCodeAt(2), 0x82);
353+
assert.strictEqual(jsResult.charCodeAt(3), 0xFE);
354+
assert.strictEqual(jsResult.charCodeAt(4), 0xFF);
355+
356+
if (WASM_AVAILABLE) {
357+
const wasmResult = utf8DecodeWasm(bytes, 0, 5);
358+
assert.strictEqual(wasmResult.length, 5, "WASM should produce 5 chars");
359+
assert.strictEqual(wasmResult, jsResult, "WASM should match JS");
360+
}
361+
});
362+
});
363+
});
132364
});

0 commit comments

Comments
 (0)