Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions crawl4ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -724,8 +724,23 @@ def split_and_parse_json_objects(json_string):
segments = []
depth = 0
start_index = 0
in_string = False
escape = False

for i, char in enumerate(json_string):
# Skip the character escaped by a preceding backslash (e.g. \" or \\).
if escape:
escape = False
continue
if char == "\\":
escape = True
continue
if char == '"':
in_string = not in_string
continue
Comment on lines +727 to +740
# Braces inside string values must not affect the depth count.
if in_string:
continue
if char == "{":
if depth == 0:
start_index = i
Expand Down
42 changes: 42 additions & 0 deletions tests/general/test_split_and_parse_json_objects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Regression tests for ``split_and_parse_json_objects``.

The helper splits a JSON array string into individual object segments by
tracking brace depth. Brace characters that appear *inside* JSON string
values must not affect the depth count, otherwise objects get split at the
wrong position and fail to parse.
"""

from crawl4ai.utils import split_and_parse_json_objects


def test_plain_objects_parse():
    """Baseline: an array of two simple objects splits into two parsed dicts."""
    payload = '[{"a": 1}, {"b": 2}]'
    expected_objects = [{"a": 1}, {"b": 2}]

    good, bad = split_and_parse_json_objects(payload)

    assert good == expected_objects
    # Nothing should be left over as an unparseable segment.
    assert bad == []


def test_closing_brace_inside_string_value():
    """A "}" that lives inside a string value must not close the object."""
    payload = '[{"a": "x}y"}, {"b": 2}]'
    expected_objects = [{"a": "x}y"}, {"b": 2}]

    good, bad = split_and_parse_json_objects(payload)

    # If the in-string brace were counted, the first object would be cut
    # short at `x}` and end up in the unparsed list instead.
    assert good == expected_objects
    assert bad == []


def test_opening_brace_inside_string_value():
    """A "{" that lives inside a string value must not open a nested level."""
    payload = '[{"a": "x{y"}, {"b": 2}]'
    expected_objects = [{"a": "x{y"}, {"b": 2}]

    good, bad = split_and_parse_json_objects(payload)

    assert good == expected_objects
    assert bad == []


def test_escaped_quote_inside_string_value():
    """An escaped quote keeps the scanner inside the string.

    The raw JSON value is ``"he said \\"}\\""``: the backslash-escaped
    quotes do not terminate the string, so the "}" between them is still
    string content rather than the end of the object.
    """
    payload = '[{"a": "he said \\"}\\""}]'
    expected_objects = [{"a": 'he said "}"'}]

    good, bad = split_and_parse_json_objects(payload)

    assert good == expected_objects
    assert bad == []


def test_balanced_braces_inside_string_still_work():
    """A balanced "{...}" pair inside a string value leaves the object intact."""
    payload = '[{"text": "see {this}", "n": 1}]'
    expected_objects = [{"text": "see {this}", "n": 1}]

    good, bad = split_and_parse_json_objects(payload)

    assert good == expected_objects
    assert bad == []