From 613c912be0f5e5e22f201d0d8b2c19f3794f4b57 Mon Sep 17 00:00:00 2001
From: Osamaali313 <86572800+Osamaali313@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:02:10 +0300
Subject: [PATCH] fix: prevent infinite loop in OverlappingWindowChunking when
 overlap >= window_size

OverlappingWindowChunking.chunk() advanced the window with
start = end - self.overlap. When overlap >= window_size this leaves start
unchanged (overlap == window_size) or moves it backwards
(overlap > window_size), so the while loop never terminates and the call
hangs while the chunk list grows without bound.

Advance by a guaranteed-positive stride of max(1, window_size - overlap).
For valid configurations (overlap < window_size) the stride equals
window_size - overlap, so the produced chunks are identical to before;
degenerate configurations now terminate instead of hanging.

Adds unit tests for OverlappingWindowChunking covering normal overlap,
zero overlap, short input, and the overlap >= window_size regression.
---
 crawl4ai/chunking_strategy.py           |  6 +++-
 tests/general/test_chunking_strategy.py | 39 +++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 tests/general/test_chunking_strategy.py

diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py
index a0bfe1bf4..74c24f681 100644
--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -241,6 +241,10 @@ def chunk(self, text: str) -> list:
         if len(words) <= self.window_size:
             return [text]
 
+        # The stride must be positive so ``start`` always advances. Otherwise an
+        # overlap >= window_size leaves start unchanged (or moves it backwards),
+        # and the while loop never terminates.
+        stride = max(1, self.window_size - self.overlap)
         start = 0
         while start < len(words):
             end = start + self.window_size
@@ -250,6 +254,6 @@ def chunk(self, text: str) -> list:
             if end >= len(words):
                 break
 
-            start = end - self.overlap
+            start += stride
 
         return chunks
diff --git a/tests/general/test_chunking_strategy.py b/tests/general/test_chunking_strategy.py
new file mode 100644
index 000000000..4450ef1c6
--- /dev/null
+++ b/tests/general/test_chunking_strategy.py
@@ -0,0 +1,39 @@
+"""Unit tests for word-based chunking strategies."""
+
+from crawl4ai.chunking_strategy import OverlappingWindowChunking
+
+
+def _words(n):
+    return " ".join(str(i) for i in range(n))
+
+
+def test_overlapping_window_basic_overlap():
+    chunks = OverlappingWindowChunking(window_size=100, overlap=20).chunk(_words(250))
+    assert len(chunks) > 1
+    assert chunks[0].split()[0] == "0"
+    # The chunks must cover the text up to and including the final word.
+    assert chunks[-1].split()[-1] == "249"
+
+
+def test_overlapping_window_no_overlap():
+    chunks = OverlappingWindowChunking(window_size=100, overlap=0).chunk(_words(250))
+    assert len(chunks) == 3  # 0-100, 100-200, 200-250
+
+
+def test_overlapping_window_short_text_single_chunk():
+    chunks = OverlappingWindowChunking(window_size=100, overlap=10).chunk(_words(50))
+    assert len(chunks) == 1
+
+
+def test_overlapping_window_overlap_equal_to_window_terminates():
+    # Regression: overlap >= window_size previously left `start` unchanged,
+    # so chunk() looped forever. It must now terminate and still reach the end.
+    chunks = OverlappingWindowChunking(window_size=100, overlap=100).chunk(_words(250))
+    assert len(chunks) >= 1
+    assert chunks[-1].split()[-1] == "249"
+
+
+def test_overlapping_window_overlap_greater_than_window_terminates():
+    chunks = OverlappingWindowChunking(window_size=50, overlap=80).chunk(_words(200))
+    assert len(chunks) >= 1
+    assert chunks[-1].split()[-1] == "199"