Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion crawl4ai/chunking_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,10 @@ def chunk(self, text: str) -> list:
if len(words) <= self.window_size:
return [text]

# The stride must be positive so ``start`` always advances. Otherwise an
# overlap >= window_size leaves ``start`` unchanged (or moves it backwards),
# and the chunking loop never terminates.
stride = max(1, self.window_size - self.overlap)
Comment on lines +244 to +247
Comment on lines +244 to +247
start = 0
while start < len(words):
end = start + self.window_size
Expand All @@ -250,6 +254,6 @@ def chunk(self, text: str) -> list:
if end >= len(words):
break

start = end - self.overlap
start += stride

return chunks
39 changes: 39 additions & 0 deletions tests/general/test_chunking_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Unit tests for word-based chunking strategies."""

from crawl4ai.chunking_strategy import OverlappingWindowChunking


def _words(n):
return " ".join(str(i) for i in range(n))


def test_overlapping_window_basic_overlap():
    """A text longer than the window is split into several chunks.

    Uses a 100-word window with a 20-word overlap over 250 words and checks
    that the first chunk starts at the first word and the last chunk ends at
    the last word.
    """
    chunker = OverlappingWindowChunking(window_size=100, overlap=20)
    chunks = chunker.chunk(_words(250))
    assert len(chunks) > 1
    assert chunks[0].split()[0] == "0"
    # The chunks must cover the text up to and including the final word.
    assert chunks[-1].split()[-1] == "249"


def test_overlapping_window_no_overlap():
    """With ``overlap=0`` a 250-word text yields exactly three chunks."""
    chunker = OverlappingWindowChunking(window_size=100, overlap=0)
    chunks = chunker.chunk(_words(250))
    assert len(chunks) == 3  # 0-100, 100-200, 200-250


def test_overlapping_window_short_text_single_chunk():
    """A text shorter than the window comes back as a single chunk."""
    chunker = OverlappingWindowChunking(window_size=100, overlap=10)
    chunks = chunker.chunk(_words(50))
    assert len(chunks) == 1


def test_overlapping_window_overlap_equal_to_window_terminates():
    """``overlap == window_size`` must terminate and reach the last word."""
    # Regression: overlap >= window_size previously left `start` unchanged,
    # so chunk() looped forever. It must now terminate and still reach the end.
    chunker = OverlappingWindowChunking(window_size=100, overlap=100)
    chunks = chunker.chunk(_words(250))
    assert len(chunks) >= 1
    assert chunks[-1].split()[-1] == "249"


def test_overlapping_window_overlap_greater_than_window_terminates():
    """``overlap > window_size`` must terminate and reach the last word."""
    # An overlap larger than the window must not stall or rewind the window
    # position: chunk() has to return, and its last chunk has to end on the
    # final word of the input.
    chunker = OverlappingWindowChunking(window_size=50, overlap=80)
    chunks = chunker.chunk(_words(200))
    assert len(chunks) >= 1
    assert chunks[-1].split()[-1] == "199"