diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp index 72799445d..a05ce1ae9 100644 --- a/src/lib_json/json_writer.cpp +++ b/src/lib_json/json_writer.cpp @@ -131,9 +131,17 @@ static unsigned int utf8ToCodepoint(const char*& s, const char* e) { if (firstByte < 0x80) return firstByte; + // continuation bytes must be of the form 10xxxxxx + const auto isTrailingByte = [](char b) { + return (static_cast(b) & 0xC0) == 0x80; + }; + if (firstByte < 0xE0) { if (e - s < 2) return REPLACEMENT_CHARACTER; + // a malformed continuation byte does not belong to this sequence + if (!isTrailingByte(s[1])) + return REPLACEMENT_CHARACTER; unsigned int calculated = ((firstByte & 0x1F) << 6) | (static_cast(s[1]) & 0x3F); @@ -145,6 +153,8 @@ static unsigned int utf8ToCodepoint(const char*& s, const char* e) { if (firstByte < 0xF0) { if (e - s < 3) return REPLACEMENT_CHARACTER; + if (!isTrailingByte(s[1]) || !isTrailingByte(s[2])) + return REPLACEMENT_CHARACTER; unsigned int calculated = ((firstByte & 0x0F) << 12) | ((static_cast(s[1]) & 0x3F) << 6) | @@ -161,12 +171,17 @@ static unsigned int utf8ToCodepoint(const char*& s, const char* e) { if (firstByte < 0xF8) { if (e - s < 4) return REPLACEMENT_CHARACTER; + if (!isTrailingByte(s[1]) || !isTrailingByte(s[2]) || !isTrailingByte(s[3])) + return REPLACEMENT_CHARACTER; unsigned int calculated = ((firstByte & 0x07) << 18) | ((static_cast(s[1]) & 0x3F) << 12) | ((static_cast(s[2]) & 0x3F) << 6) | (static_cast(s[3]) & 0x3F); s += 3; + // codepoints beyond U+10FFFF are invalid + if (calculated > 0x10FFFF) + return REPLACEMENT_CHARACTER; // oversized encoded characters are invalid return calculated < 0x10000 ? REPLACEMENT_CHARACTER : calculated; } diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp index 0d1c33064..e9d980bf0 100644 --- a/src/test_lib_json/main.cpp +++ b/src/test_lib_json/main.cpp @@ -2889,6 +2889,31 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) { "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}"); } +// Malformed UTF-8 must not swallow the bytes that follow a broken sequence. +JSONTEST_FIXTURE_LOCAL(StreamWriterTest, invalidUtf8) { + Json::StreamWriterBuilder b; + b.settings_["indentation"] = ""; + + // 0xE0 announces a 3-byte sequence, but 'A'/'B' are not continuation bytes: + // only the lead byte is replaced, the ASCII must be preserved. + Json::Value bad3(std::string("\xE0" + "AB")); + JSONTEST_ASSERT_STRING_EQUAL("\"\\ufffdAB\"", Json::writeString(b, bad3)); + + // 0xF0 announces a 4-byte sequence with no valid continuation bytes. + Json::Value bad4(std::string("\xF0" + "XYZ")); + JSONTEST_ASSERT_STRING_EQUAL("\"\\ufffdXYZ\"", Json::writeString(b, bad4)); + + // A 4-byte sequence that decodes past U+10FFFF is not a valid codepoint. + Json::Value over(std::string("\xF7\xBF\xBF\xBF")); + JSONTEST_ASSERT_STRING_EQUAL("\"\\ufffd\"", Json::writeString(b, over)); + + // A valid multibyte sequence still round-trips unchanged. + Json::Value euro(std::string("\xE2\x82\xAC")); + JSONTEST_ASSERT_STRING_EQUAL("\"\\u20ac\"", Json::writeString(b, euro)); +} + // Control chars should be escaped regardless of UTF-8 input encoding. JSONTEST_FIXTURE_LOCAL(StreamWriterTest, escapeControlCharacters) { auto uEscape = [](unsigned ch) {