Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions src/semble/chunking/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@

logger = getLogger(__name__)

# Maximum recursion depth for _merge_node_inner; beyond it the node is returned
# as a single chunk and a warning is logged instead of recursing further.
_RECURSION_DEPTH = 500
# Nodes spanning fewer than this many bytes (end_byte - start_byte) are returned
# as a single chunk rather than being split further.
_MIN_CHUNK_SIZE = 50


def is_supported_language(language: str) -> bool:
"""Check if the language is supported by tree-sitter."""
Expand Down Expand Up @@ -69,12 +72,21 @@ def _merge_adjacent_chunks(
return merged


def _merge_node_inner(node: Node, desired_length: int) -> list[ChunkBoundary]:
def _merge_node_inner(node: Node, desired_length: int, i: int) -> list[ChunkBoundary]:
"""Recursively merge and split nodes."""
# If there are no child nodes, the only thing we can do is return the current node.
if not node.children:
return [ChunkBoundary(node.start_byte, node.end_byte)]

length = node.end_byte - node.start_byte
# Guard against runaway recursion: a depth above _RECURSION_DEPTH is unlikely in practice.
if i > _RECURSION_DEPTH:
logger.warning("Recursion depth exceeded in chunk.")
return [ChunkBoundary(node.start_byte, node.end_byte)]
# Prevent recursing into short chunks.
if length < _MIN_CHUNK_SIZE:
return [ChunkBoundary(node.start_byte, node.end_byte)]

groups: list[ChunkBoundary] = []
children = node.children
index = 0
Expand All @@ -90,7 +102,7 @@ def _merge_node_inner(node: Node, desired_length: int) -> list[ChunkBoundary]:
# If this single chunk is longer than the desired length
# we try to split it again.
if length > desired_length:
groups.extend(_merge_node_inner(child, desired_length))
groups.extend(_merge_node_inner(child, desired_length, i + 1))
continue

while index < len(children):
Expand All @@ -112,7 +124,7 @@ def _merge_node_inner(node: Node, desired_length: int) -> list[ChunkBoundary]:

def _merge_node(node: Node, desired_length: int) -> list[ChunkBoundary]:
    """Recursively turn nodes into chunks, then merge adjacent chunks.

    :param node: The node to split into chunk boundaries.
    :param desired_length: The desired chunk length, forwarded to both the
        splitting step and the merging step.
    :return: The chunk boundaries after adjacent chunks have been merged.
    """
    # Seed the recursion depth at 0 so the depth guard in _merge_node_inner
    # counts levels starting from this node.
    raw_chunks = _merge_node_inner(node, desired_length, 0)
    return _merge_adjacent_chunks(raw_chunks, desired_length)


Expand Down
12 changes: 12 additions & 0 deletions tests/test_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,15 @@ def test_download_error() -> None:
with patch("semble.chunking.core.get_parser", side_effect=DownloadError):
chunks = chunk("x = 1", "python", 10)
assert chunks is None


def test_chunker_deep_string(caplog: pytest.LogCaptureFixture) -> None:
    """Test that chunking works with a very deep string.

    The source is a single expression nested 10000 calls deep; chunking must
    still return a result and log the recursion-depth warning exactly once.
    """
    # Same text as wrapping "abs(0)\n" in "abs(...)\n" 10000 times, but built in
    # one pass instead of re-copying the growing string on every iteration.
    wraps = 10000
    deep_string = "abs(" * (wraps + 1) + "0" + ")\n" * (wraps + 1)
    with caplog.at_level(logging.WARNING, logger="semble.chunking.core"):
        chunks = chunk_source(deep_string, "deep_string.py", "python")
    assert chunks is not None
    assert len(caplog.records) == 1
    assert "Recursion depth exceeded in chunk." in caplog.records[0].message
Loading