From 06b1a3fc66cf1ad405c0f4d56331b18915b846df Mon Sep 17 00:00:00 2001 From: JSap0914 Date: Tue, 16 Jun 2026 13:02:05 +0900 Subject: [PATCH] fix: respect string literals when splitting JSON objects split_and_parse_json_objects counted every { and } to find object boundaries, including braces inside JSON string values. A value such as "x}y" terminated the object early, so the segment failed to parse and valid objects were silently dropped. Track string state (and backslash escapes) so braces inside string values no longer affect the depth count. Add regression tests covering closing/opening braces and escaped quotes inside string values. --- crawl4ai/utils.py | 15 +++++++ .../test_split_and_parse_json_objects.py | 42 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tests/general/test_split_and_parse_json_objects.py diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 89fb782d9..8e5b3694b 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -724,8 +724,23 @@ def split_and_parse_json_objects(json_string): segments = [] depth = 0 start_index = 0 + in_string = False + escape = False for i, char in enumerate(json_string): + # Skip the character escaped by a preceding backslash (e.g. \" or \\). + if escape: + escape = False + continue + if char == "\\": + escape = True + continue + if char == '"': + in_string = not in_string + continue + # Braces inside string values must not affect the depth count. + if in_string: + continue if char == "{": if depth == 0: start_index = i diff --git a/tests/general/test_split_and_parse_json_objects.py b/tests/general/test_split_and_parse_json_objects.py new file mode 100644 index 000000000..2edc27b2f --- /dev/null +++ b/tests/general/test_split_and_parse_json_objects.py @@ -0,0 +1,42 @@ +"""Regression tests for ``split_and_parse_json_objects``. + +The helper splits a JSON array string into individual object segments by +tracking brace depth. Brace characters that appear *inside* JSON string +values must not affect the depth count, otherwise objects get split at the +wrong position and fail to parse. +""" + +from crawl4ai.utils import split_and_parse_json_objects + + +def test_plain_objects_parse(): + parsed, unparsed = split_and_parse_json_objects('[{"a": 1}, {"b": 2}]') + assert parsed == [{"a": 1}, {"b": 2}] + assert unparsed == [] + + +def test_closing_brace_inside_string_value(): + # A lone "}" inside a string value must not terminate the object early. + parsed, unparsed = split_and_parse_json_objects('[{"a": "x}y"}, {"b": 2}]') + assert parsed == [{"a": "x}y"}, {"b": 2}] + assert unparsed == [] + + +def test_opening_brace_inside_string_value(): + parsed, unparsed = split_and_parse_json_objects('[{"a": "x{y"}, {"b": 2}]') + assert parsed == [{"a": "x{y"}, {"b": 2}] + assert unparsed == [] + + +def test_escaped_quote_inside_string_value(): + # An escaped quote must not be treated as the end of the string, so the + # "}" that follows still counts as being inside the string. + parsed, unparsed = split_and_parse_json_objects('[{"a": "he said \\"}\\""}]') + assert parsed == [{"a": 'he said "}"'}] + assert unparsed == [] + + +def test_balanced_braces_inside_string_still_work(): + parsed, unparsed = split_and_parse_json_objects('[{"text": "see {this}", "n": 1}]') + assert parsed == [{"text": "see {this}", "n": 1}] + assert unparsed == []