From ccac508a436e01c8b859d504be17f492cb5a3363 Mon Sep 17 00:00:00 2001
From: a
Date: Wed, 10 Jun 2026 12:13:43 +0800
Subject: [PATCH 1/2] preserve rowspan/colspan when bypass_tables is enabled

---
Review notes (text between the "---" line and the diffstat is ignored by git am):
* rowspan/colspan values are HTML-escaped before being written into the
  opening td/th tag, because they originate from the crawled page.
* Indentation of this hunk was reconstructed from the nesting order of the
  statements; confirm it matches crawl4ai/html2text/__init__.py, or apply
  with "git apply --ignore-whitespace".
* The closing-tag context line is assumed to emit "\n</{}>" (the call passes
  "tag" to format(), so a placeholder is expected); confirm against the file.
* The blob ids on the "index" lines predate these edits.

 crawl4ai/html2text/__init__.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
index efbe27018..c1e90a3f3 100644
--- a/crawl4ai/html2text/__init__.py
+++ b/crawl4ai/html2text/__init__.py
@@ -703,7 +703,16 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
                     self.soft_br()
                 if tag in ["td", "th"]:
                     if start:
-                        self.o("<{}>\n\n".format(tag))
+                        # Preserve rowspan/colspan. The values come from the
+                        # crawled page, so escape them before writing them
+                        # into a double-quoted attribute.
+                        from html import escape
+                        extra = "".join(
+                            ' {}="{}"'.format(name, escape(attrs[name], quote=True))
+                            for name in ("rowspan", "colspan")
+                            if attrs.get(name) is not None
+                        )
+                        self.o("<{}{}>\n\n".format(tag, extra))
                     else:
                         self.o("\n</{}>".format(tag))
                 else:

From a5c04beed09804a2fdc6f3da2a93190cba5a35e7 Mon Sep 17 00:00:00 2001
From: a
Date: Wed, 10 Jun 2026 12:16:04 +0800
Subject: [PATCH 2/2] add test for rowspan/colspan

---
Review notes (text between the "---" line and the diffstat is ignored by git am):
* The output file is opened with an explicit UTF-8 encoding so that writing
  non-ASCII page content does not depend on the platform default encoding.
* The script fetches a live page, prints the result and saves it; it contains
  no assertions and no test_* functions.
* The blob id on the "index" line predates these edits.

 tests/bypass_tables/test_add_span.py | 50 ++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 tests/bypass_tables/test_add_span.py

diff --git a/tests/bypass_tables/test_add_span.py b/tests/bypass_tables/test_add_span.py
new file mode 100644
index 000000000..1e790eaeb
--- /dev/null
+++ b/tests/bypass_tables/test_add_span.py
@@ -0,0 +1,50 @@
+import asyncio
+import pathlib
+
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai import DefaultTableExtraction
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.content_filter_strategy import PruningContentFilter
+
+target_url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
+md_file = pathlib.Path(__file__).parent.absolute().joinpath('test.md').absolute()
+
+
+# browser_config
+browser_config = BrowserConfig(
+    headless=True,
+    user_agent_mode='random',
+)
+
+
+# NOTE(review): prune_filter is not referenced anywhere below.
+prune_filter = PruningContentFilter(
+    threshold=0.8,
+    threshold_type="dynamic",
+)
+
+# CrawlerConfig
+run_config = CrawlerRunConfig(
+    magic=True,
+    markdown_generator=DefaultMarkdownGenerator(
+        content_source="cleaned_html",
+        options={
+            'bypass_tables': True,
+        }
+    ),
+    cache_mode=CacheMode.BYPASS,
+    css_selector='table.wikitable',
+    table_extraction=DefaultTableExtraction()
+)
+
+async def main():
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url=target_url, config=run_config)
+        print(result.markdown)
+        print(result.tables)
+        with open(md_file, 'w', encoding='utf-8') as f:
+            f.write(result.markdown)
+
+if __name__ == "__main__":
+    asyncio.run(main())