From 5e597858974ec8977f9af5ae5f0fc939dd7211a2 Mon Sep 17 00:00:00 2001 From: SABITHSAHEB Date: Wed, 17 Jun 2026 21:20:39 +0530 Subject: [PATCH 1/2] validate utf-16 surrogate halves in decodeUnicodeCodePoint --- src/lib_json/json_reader.cpp | 16 ++++++++++++++++ src/test_lib_json/main.cpp | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/lib_json/json_reader.cpp b/src/lib_json/json_reader.cpp index 164d41d6f..18e3f845e 100644 --- a/src/lib_json/json_reader.cpp +++ b/src/lib_json/json_reader.cpp @@ -678,6 +678,10 @@ bool Reader::decodeUnicodeCodePoint(Token& token, Location& current, if (*(current++) == '\\' && *(current++) == 'u') { unsigned int surrogatePair; if (decodeUnicodeEscapeSequence(token, current, end, surrogatePair)) { + if (surrogatePair < 0xDC00 || surrogatePair > 0xDFFF) + return addError("expecting a low surrogate (DC00-DFFF) to complete " + "the unicode surrogate pair", + token, current); unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF); } else return false; @@ -685,6 +689,10 @@ bool Reader::decodeUnicodeCodePoint(Token& token, Location& current, return addError("expecting another \\u token to begin the second half of " "a unicode surrogate pair", token, current); + } else if (unicode >= 0xDC00 && unicode <= 0xDFFF) { + return addError("unexpected low surrogate (DC00-DFFF); a high surrogate " + "(D800-DBFF) must come first", + token, current); } return true; } @@ -1759,6 +1767,10 @@ bool OurReader::decodeUnicodeCodePoint(Token& token, Location& current, if (*(current++) == '\\' && *(current++) == 'u') { unsigned int surrogatePair; if (decodeUnicodeEscapeSequence(token, current, end, surrogatePair)) { + if (surrogatePair < 0xDC00 || surrogatePair > 0xDFFF) + return addError("expecting a low surrogate (DC00-DFFF) to complete " + "the unicode surrogate pair", + token, current); unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF); } else return false; @@ -1766,6 +1778,10 @@ bool OurReader::decodeUnicodeCodePoint(Token& token, Location& current, return addError("expecting another \\u token to begin the second half of " "a unicode surrogate pair", token, current); + } else if (unicode >= 0xDC00 && unicode <= 0xDFFF) { + return addError("unexpected low surrogate (DC00-DFFF); a high surrogate " + "(D800-DBFF) must come first", + token, current); } return true; } diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp index 90025b443..e2e4bbf1c 100644 --- a/src/test_lib_json/main.cpp +++ b/src/test_lib_json/main.cpp @@ -3322,6 +3322,24 @@ JSONTEST_FIXTURE_LOCAL(CharReaderTest, parseString) { "second half of a unicode surrogate pair\n" "See Line 1, Column 12 for detail.\n"); } + { + char const doc[] = R"([ "\uD801\u0041" ])"; + bool ok = reader->parse(doc, doc + std::strlen(doc), &root, &errs); + JSONTEST_ASSERT(!ok); + JSONTEST_ASSERT(errs == "* Line 1, Column 3\n" + " expecting a low surrogate (DC00-DFFF) to " + "complete the unicode surrogate pair\n" + "See Line 1, Column 16 for detail.\n"); + } + { + char const doc[] = R"([ "\uDC00" ])"; + bool ok = reader->parse(doc, doc + std::strlen(doc), &root, &errs); + JSONTEST_ASSERT(!ok); + JSONTEST_ASSERT(errs == "* Line 1, Column 3\n" + " unexpected low surrogate (DC00-DFFF); a high " + "surrogate (D800-DBFF) must come first\n" + "See Line 1, Column 10 for detail.\n"); + } { char const doc[] = R"([ "\ua3t@" ])"; bool ok = reader->parse(doc, doc + std::strlen(doc), &root, &errs); From fc1cef1c8b7241a4f7cb2bbff27d3ff792222dad Mon Sep 17 00:00:00 2001 From: Sabith Saheb Date: Sat, 4 Jul 2026 10:15:32 +0530 Subject: [PATCH 2/2] gate surrogate validation behind rejectInvalidSurrogates setting Route the escape validation through a CharReaderBuilder setting instead of failing unconditionally, matching failIfExtra/rejectDupKeys. Default on, on in strictMode, off in ecma404Mode so ECMA-404-conforming lone surrogates still parse. Revert the deprecated Json::Reader change so the legacy path keeps its prior behaviour. --- include/json/reader.h | 5 +++++ src/lib_json/json_reader.cpp | 21 +++++++++++---------- src/test_lib_json/main.cpp | 11 +++++++++++ 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/include/json/reader.h b/include/json/reader.h index 7aa227188..ddd4fb8a5 100644 --- a/include/json/reader.h +++ b/include/json/reader.h @@ -352,6 +352,11 @@ class JSON_API CharReaderBuilder : public CharReader::Factory { * - `"allowSpecialFloats": false or true` * - If true, special float values (NaNs and infinities) are allowed and * their values are lossfree restorable. + * - `"rejectInvalidSurrogates": false or true` + * - If true, `parse()` returns false when a `\u` escape is a lone or + * mismatched UTF-16 surrogate half (a high surrogate not followed by a + * low surrogate, or a low surrogate with no preceding high surrogate). + * - Disabled by ecma404Mode(), which treats such escapes as conforming. * - `"skipBom": false or true` * - If true, if the input starts with the Unicode byte order mark (BOM), * it is skipped. diff --git a/src/lib_json/json_reader.cpp b/src/lib_json/json_reader.cpp index 18e3f845e..3333629d9 100644 --- a/src/lib_json/json_reader.cpp +++ b/src/lib_json/json_reader.cpp @@ -678,10 +678,6 @@ bool Reader::decodeUnicodeCodePoint(Token& token, Location& current, if (*(current++) == '\\' && *(current++) == 'u') { unsigned int surrogatePair; if (decodeUnicodeEscapeSequence(token, current, end, surrogatePair)) { - if (surrogatePair < 0xDC00 || surrogatePair > 0xDFFF) - return addError("expecting a low surrogate (DC00-DFFF) to complete " - "the unicode surrogate pair", - token, current); unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF); } else return false; @@ -689,10 +685,6 @@ bool Reader::decodeUnicodeCodePoint(Token& token, Location& current, return addError("expecting another \\u token to begin the second half of " "a unicode surrogate pair", token, current); - } else if (unicode >= 0xDC00 && unicode <= 0xDFFF) { - return addError("unexpected low surrogate (DC00-DFFF); a high surrogate " - "(D800-DBFF) must come first", - token, current); } return true; } @@ -869,6 +861,7 @@ class OurFeatures { bool failIfExtra_; bool rejectDupKeys_; bool allowSpecialFloats_; + bool rejectInvalidSurrogates_; bool skipBom_; size_t stackLimit_; }; // OurFeatures @@ -1767,7 +1760,8 @@ bool OurReader::decodeUnicodeCodePoint(Token& token, Location& current, if (*(current++) == '\\' && *(current++) == 'u') { unsigned int surrogatePair; if (decodeUnicodeEscapeSequence(token, current, end, surrogatePair)) { - if (surrogatePair < 0xDC00 || surrogatePair > 0xDFFF) + if (features_.rejectInvalidSurrogates_ && + (surrogatePair < 0xDC00 || surrogatePair > 0xDFFF)) return addError("expecting a low surrogate (DC00-DFFF) to complete " "the unicode surrogate pair", token, current); @@ -1778,7 +1772,8 @@ bool OurReader::decodeUnicodeCodePoint(Token& token, Location& current, return addError("expecting another \\u token to begin the second half of " "a unicode surrogate pair", token, current); - } else if (unicode >= 0xDC00 && unicode <= 0xDFFF) { + } else if (features_.rejectInvalidSurrogates_ && unicode >= 0xDC00 && + unicode <= 0xDFFF) { return addError("unexpected low surrogate (DC00-DFFF); a high surrogate " "(D800-DBFF) must come first", token, current); @@ -1956,6 +1951,8 @@ CharReader* CharReaderBuilder::newCharReader() const { features.failIfExtra_ = settings_["failIfExtra"].asBool(); features.rejectDupKeys_ = settings_["rejectDupKeys"].asBool(); features.allowSpecialFloats_ = settings_["allowSpecialFloats"].asBool(); + features.rejectInvalidSurrogates_ = + settings_["rejectInvalidSurrogates"].asBool(); features.skipBom_ = settings_["skipBom"].asBool(); return new OurCharReader(collectComments, features); } @@ -1973,6 +1970,7 @@ bool CharReaderBuilder::validate(Json::Value* invalid) const { "failIfExtra", "rejectDupKeys", "allowSpecialFloats", + "rejectInvalidSurrogates", "skipBom", }; for (auto si = settings_.begin(); si != settings_.end(); ++si) { @@ -2003,6 +2001,7 @@ void CharReaderBuilder::strictMode(Json::Value* settings) { (*settings)["failIfExtra"] = true; (*settings)["rejectDupKeys"] = true; (*settings)["allowSpecialFloats"] = false; + (*settings)["rejectInvalidSurrogates"] = true; (*settings)["skipBom"] = true; //! [CharReaderBuilderStrictMode] } @@ -2020,6 +2019,7 @@ void CharReaderBuilder::setDefaults(Json::Value* settings) { (*settings)["failIfExtra"] = false; (*settings)["rejectDupKeys"] = false; (*settings)["allowSpecialFloats"] = false; + (*settings)["rejectInvalidSurrogates"] = true; (*settings)["skipBom"] = true; //! [CharReaderBuilderDefaults] } @@ -2036,6 +2036,7 @@ void CharReaderBuilder::ecma404Mode(Json::Value* settings) { (*settings)["failIfExtra"] = true; (*settings)["rejectDupKeys"] = false; (*settings)["allowSpecialFloats"] = false; + (*settings)["rejectInvalidSurrogates"] = false; (*settings)["skipBom"] = false; //! [CharReaderBuilderECMA404Mode] } diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp index e2e4bbf1c..f039f5e5e 100644 --- a/src/test_lib_json/main.cpp +++ b/src/test_lib_json/main.cpp @@ -3340,6 +3340,17 @@ JSONTEST_FIXTURE_LOCAL(CharReaderTest, parseString) { "surrogate (D800-DBFF) must come first\n" "See Line 1, Column 10 for detail.\n"); } + { + // The escape hatch: with rejectInvalidSurrogates off, the lenient path + // keeps the pre-existing behaviour of passing lone surrogates through. + Json::CharReaderBuilder lenient; + lenient["rejectInvalidSurrogates"] = false; + CharReaderPtr lenientReader(lenient.newCharReader()); + char const doc[] = R"([ "\uDC00" ])"; + bool ok = lenientReader->parse(doc, doc + std::strlen(doc), &root, &errs); + JSONTEST_ASSERT(ok); + JSONTEST_ASSERT(errs.empty()); + } { char const doc[] = R"([ "\ua3t@" ])"; bool ok = reader->parse(doc, doc + std::strlen(doc), &root, &errs);