Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/lib_json/json_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,17 @@ static unsigned int utf8ToCodepoint(const char*& s, const char* e) {
if (firstByte < 0x80)
return firstByte;

// continuation bytes must be of the form 10xxxxxx
const auto isTrailingByte = [](char b) {
return (static_cast<unsigned char>(b) & 0xC0) == 0x80;
};

if (firstByte < 0xE0) {
if (e - s < 2)
return REPLACEMENT_CHARACTER;
// a malformed continuation byte does not belong to this sequence
if (!isTrailingByte(s[1]))
return REPLACEMENT_CHARACTER;

unsigned int calculated =
((firstByte & 0x1F) << 6) | (static_cast<unsigned int>(s[1]) & 0x3F);
Expand All @@ -145,6 +153,8 @@ static unsigned int utf8ToCodepoint(const char*& s, const char* e) {
if (firstByte < 0xF0) {
if (e - s < 3)
return REPLACEMENT_CHARACTER;
if (!isTrailingByte(s[1]) || !isTrailingByte(s[2]))
return REPLACEMENT_CHARACTER;

unsigned int calculated = ((firstByte & 0x0F) << 12) |
((static_cast<unsigned int>(s[1]) & 0x3F) << 6) |
Expand All @@ -161,12 +171,17 @@ static unsigned int utf8ToCodepoint(const char*& s, const char* e) {
if (firstByte < 0xF8) {
if (e - s < 4)
return REPLACEMENT_CHARACTER;
if (!isTrailingByte(s[1]) || !isTrailingByte(s[2]) || !isTrailingByte(s[3]))
return REPLACEMENT_CHARACTER;

unsigned int calculated = ((firstByte & 0x07) << 18) |
((static_cast<unsigned int>(s[1]) & 0x3F) << 12) |
((static_cast<unsigned int>(s[2]) & 0x3F) << 6) |
(static_cast<unsigned int>(s[3]) & 0x3F);
s += 3;
// codepoints beyond U+10FFFF are invalid
if (calculated > 0x10FFFF)
return REPLACEMENT_CHARACTER;
// overlong encodings (a 4-byte form for a codepoint below U+10000) are invalid
return calculated < 0x10000 ? REPLACEMENT_CHARACTER : calculated;
}
Expand Down
25 changes: 25 additions & 0 deletions src/test_lib_json/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2889,6 +2889,31 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
}

// Malformed UTF-8 must not swallow the bytes that follow a broken sequence.
// Each case wraps a raw byte string in a Json::Value, serializes it with
// Json::writeString, and compares the result against the expected escaped
// text; invalid input is expected to show up as the \ufffd escape (U+FFFD).
JSONTEST_FIXTURE_LOCAL(StreamWriterTest, invalidUtf8) {
Json::StreamWriterBuilder b;
// Empty indentation: the expected strings below contain no added whitespace.
b.settings_["indentation"] = "";

// 0xE0 announces a 3-byte sequence, but 'A'/'B' are not continuation bytes:
// only the lead byte is replaced, the ASCII must be preserved.
// The literal is split because 'A' and 'B' are hex digits and "\xE0AB" would
// otherwise be read as a single (out-of-range) hex escape.
Json::Value bad3(std::string("\xE0"
"AB"));
JSONTEST_ASSERT_STRING_EQUAL("\"\\ufffdAB\"", Json::writeString(b, bad3));

// 0xF0 announces a 4-byte sequence with no valid continuation bytes.
// As above, only the lead byte becomes \ufffd and "XYZ" is kept verbatim.
Json::Value bad4(std::string("\xF0"
"XYZ"));
JSONTEST_ASSERT_STRING_EQUAL("\"\\ufffdXYZ\"", Json::writeString(b, bad4));

// A 4-byte sequence that decodes past U+10FFFF is not a valid codepoint.
// F7 BF BF BF has well-formed continuation bytes but decodes to 0x1FFFFF, so
// the whole sequence is expected to collapse into a single \ufffd.
Json::Value over(std::string("\xF7\xBF\xBF\xBF"));
JSONTEST_ASSERT_STRING_EQUAL("\"\\ufffd\"", Json::writeString(b, over));

// A valid multibyte sequence still round-trips unchanged.
// E2 82 AC is U+20AC, expected in the output as the \u20ac escape.
Json::Value euro(std::string("\xE2\x82\xAC"));
JSONTEST_ASSERT_STRING_EQUAL("\"\\u20ac\"", Json::writeString(b, euro));
}

// Control chars should be escaped regardless of UTF-8 input encoding.
JSONTEST_FIXTURE_LOCAL(StreamWriterTest, escapeControlCharacters) {
auto uEscape = [](unsigned ch) {
Expand Down
Loading