From 3c1ac2a60318096dc6cd13dc37a6a5b98a190163 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 18 Feb 2026 00:13:37 -0800 Subject: [PATCH] fix: handle non-BMP Unicode codepoints in foldl, foldr, and %c format foldl/foldr iterated strings by UTF-16 code unit (for (char <- s.value)), splitting non-BMP characters like emoji into surrogate pair halves. Use codePointAt/codePointBefore with Character.charCount for correct codepoint iteration. The %c format conversion used s.toChar.toString which truncates codepoints above U+FFFF to 16 bits. Use Character.toString(s.toInt) instead. Co-Authored-By: Claude Opus 4.6 --- sjsonnet/src/sjsonnet/Format.scala | 2 +- .../src/sjsonnet/stdlib/ArrayModule.scala | 16 +++++++--- .../src/sjsonnet/UnicodeHandlingTests.scala | 29 +++++++++++++++++++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Format.scala b/sjsonnet/src/sjsonnet/Format.scala index 46a2f56e..3b2c9a2c 100644 --- a/sjsonnet/src/sjsonnet/Format.scala +++ b/sjsonnet/src/sjsonnet/Format.scala @@ -196,7 +196,7 @@ object Format { case 'f' | 'F' => formatFloat(formatted, s) case 'g' => formatGeneric(formatted, s).toLowerCase case 'G' => formatGeneric(formatted, s) - case 'c' => widenRaw(formatted, s.toChar.toString) + case 'c' => widenRaw(formatted, Character.toString(s.toInt)) case 's' => if (s.toLong == s) widenRaw(formatted, s.toLong.toString) else widenRaw(formatted, s.toString) diff --git a/sjsonnet/src/sjsonnet/stdlib/ArrayModule.scala b/sjsonnet/src/sjsonnet/stdlib/ArrayModule.scala index 300c7ca1..948ec7e0 100644 --- a/sjsonnet/src/sjsonnet/stdlib/ArrayModule.scala +++ b/sjsonnet/src/sjsonnet/stdlib/ArrayModule.scala @@ -296,12 +296,16 @@ object ArrayModule extends AbstractFunctionModule { case s: Val.Str => var current = init.force - for (char <- s.value) { + val str = s.value + var i = 0 + while (i < str.length) { val c = current - current = func.apply2(c, Val.Str(pos, new String(Array(char))), pos.noOffset)( + val codePoint = str.codePointAt(i) + current = func.apply2(c, Val.Str(pos, Character.toString(codePoint)), pos.noOffset)( ev, TailstrictModeDisabled ) + i += Character.charCount(codePoint) } current @@ -324,9 +328,13 @@ object ArrayModule extends AbstractFunctionModule { current case s: Val.Str => var current = init.force - for (char <- s.value.reverse) { + val str = s.value + var i = str.length + while (i > 0) { + val codePoint = str.codePointBefore(i) + i -= Character.charCount(codePoint) val c = current - current = func.apply2(Val.Str(pos, new String(Array(char))), c, pos.noOffset)( + current = func.apply2(Val.Str(pos, Character.toString(codePoint)), c, pos.noOffset)( ev, TailstrictModeDisabled ) diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index d7266f93..a1197f70 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -256,5 +256,34 @@ object UnicodeHandlingTests extends TestSuite { eval("""std.trim("🌍 ")""") ==> ujson.Str("🌍") eval("""std.trim(" 🌍 ")""") ==> ujson.Str("🌍") } + + test("foldl") { + // foldl must iterate by codepoint, not UTF-16 code unit + eval("""std.foldl(function(acc, c) acc + [c], "a😀b", [])""") ==> + ujson.Arr("a", "😀", "b") + eval("""std.foldl(function(acc, c) acc + 1, "a😀b", 0)""") ==> ujson.Num(3) + eval("""std.foldl(function(acc, c) acc + [c], "🎉🔥", [])""") ==> + ujson.Arr("🎉", "🔥") + // Round-trip concatenation + eval("""std.foldl(function(acc, c) acc + c, "a😀b", "")""") ==> ujson.Str("a😀b") + } + + test("foldr") { + // foldr must iterate by codepoint, not UTF-16 code unit + eval("""std.foldr(function(c, acc) acc + [c], "a😀b", [])""") ==> + ujson.Arr("b", "😀", "a") + eval("""std.foldr(function(c, acc) acc + [c], "🎉🔥", [])""") ==> + ujson.Arr("🔥", "🎉") + // Round-trip concatenation (right-to-left: 'b' then '😀' then 'a') + eval("""std.foldr(function(c, acc) acc + c, "a😀b", "")""") ==> ujson.Str("b😀a") + } + + test("formatPercentC") { + // %c must handle non-BMP codepoints + eval("""std.format("%c", [128512])""") ==> ujson.Str("😀") // U+1F600 + eval("""std.format("%c", [128293])""") ==> ujson.Str("🔥") // U+1F525 + eval("""std.format("%c", [127757])""") ==> ujson.Str("🌍") // U+1F30D + eval("""std.format("%c", [65])""") ==> ujson.Str("A") // BMP char + } } }