apify · Mantisus · Jun 14, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -50,14 +50,15 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
+all = ["crawlee[adaptive-crawler,ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
 adaptive-crawler = [
     "jaro-winkler>=2.0.3",
     "playwright>=1.27.0",
     "scikit-learn>=1.6.0",
     "apify_fingerprint_datapoints>=0.0.3",
     "browserforge>=1.2.4"
 ]
+ai = ["pydantic-ai-slim[openai]>=1.106.0", "parsel>=1.10.0", "lxml[html_clean]>=5.2.0"]
 beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
 cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
 curl-impersonate = ["curl-cffi>=0.9.0"]

diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py
@@ -65,6 +65,36 @@
         StagehandPreNavCrawlingContext,
     )
 
+with _try_import(
+    __name__,
+    'AiCleanHtmlDistiller',
+    'AiCrawler',
+    'AiCrawlingContext',
+    'AiDirectExtractor',
+    'AiHtmlDistiller',
+    'AiHtmlExtractor',
+    'AiSelectorExtractor',
+    'AiSkeletonDistiller',
+    'AiUsageStats',
+    'BaseAiHtmlDistiller',
+    'BaseAiHtmlExtractor',
+    'get_basic_ai_cleaner',
+):
+    from ._ai import (
+        AiCleanHtmlDistiller,
+        AiCrawler,
+        AiCrawlingContext,
+        AiDirectExtractor,
+        AiHtmlDistiller,
+        AiHtmlExtractor,
+        AiSelectorExtractor,
+        AiSkeletonDistiller,
+        AiUsageStats,
+        BaseAiHtmlDistiller,
+        BaseAiHtmlExtractor,
+        get_basic_ai_cleaner,
+    )
+
 
 __all__ = [
     'AbstractHttpCrawler',
@@ -74,6 +104,17 @@
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPostNavCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
+    'AiCleanHtmlDistiller',
+    'AiCrawler',
+    'AiCrawlingContext',
+    'AiDirectExtractor',
+    'AiHtmlDistiller',
+    'AiHtmlExtractor',
+    'AiSelectorExtractor',
+    'AiSkeletonDistiller',
+    'AiUsageStats',
+    'BaseAiHtmlDistiller',
+    'BaseAiHtmlExtractor',
     'BasicCrawler',
     'BasicCrawlerOptions',
     'BasicCrawlingContext',
@@ -99,4 +140,5 @@
     'StagehandCrawlingContext',
     'StagehandPostNavCrawlingContext',
     'StagehandPreNavCrawlingContext',
+    'get_basic_ai_cleaner',
 ]
diff --git a/src/crawlee/crawlers/_ai/__init__.py b/src/crawlee/crawlers/_ai/__init__.py
@@ -0,0 +1,42 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies (the `ai` extra),
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'AiCrawler'):
+    from ._ai_crawler import AiCrawler
+with _try_import(__name__, 'AiCrawlingContext'):
+    from ._ai_crawling_context import AiCrawlingContext
+with _try_import(__name__, 'BaseAiHtmlExtractor'):
+    from ._base_extractor import BaseAiHtmlExtractor
+with _try_import(__name__, 'AiDirectExtractor'):
+    from ._direct_extractor import AiDirectExtractor
+with _try_import(__name__, 'AiSelectorExtractor'):
+    from ._selector_extractor import AiSelectorExtractor
+with _try_import(__name__, 'BaseAiHtmlDistiller'):
+    from ._base_distiller import BaseAiHtmlDistiller
+with _try_import(__name__, 'AiCleanHtmlDistiller'):
+    from ._clean_html_distiller import AiCleanHtmlDistiller
+with _try_import(__name__, 'AiSkeletonDistiller'):
+    from ._skeleton_distiller import AiSkeletonDistiller
+with _try_import(__name__, 'AiHtmlDistiller', 'AiHtmlExtractor', 'AiUsageStats'):
+    from ._types import AiHtmlDistiller, AiHtmlExtractor, AiUsageStats
+with _try_import(__name__, 'get_basic_ai_cleaner'):
+    from ._utils import get_basic_ai_cleaner
+
+__all__ = [
+    'AiCleanHtmlDistiller',
+    'AiCrawler',
+    'AiCrawlingContext',
+    'AiDirectExtractor',
+    'AiHtmlDistiller',
+    'AiHtmlExtractor',
+    'AiSelectorExtractor',
+    'AiSkeletonDistiller',
+    'AiUsageStats',
+    'BaseAiHtmlDistiller',
+    'BaseAiHtmlExtractor',
+    'get_basic_ai_cleaner',
+]
diff --git a/src/crawlee/crawlers/_ai/_ai_crawler.py b/src/crawlee/crawlers/_ai/_ai_crawler.py
@@ -0,0 +1,173 @@
+from __future__ import annotations
+
+import warnings
+from contextlib import AbstractAsyncContextManager
+from logging import getLogger
+from typing import TYPE_CHECKING
+
+from parsel import Selector
+
+from crawlee._utils.docs import docs_group
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
+from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
+from crawlee.crawlers._parsel._parsel_parser import ParselParser
+
+from ._ai_crawling_context import AiCrawlingContext
+from ._direct_extractor import AiDirectExtractor
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
+    from pydantic_ai.models import Model
+    from typing_extensions import Unpack
+
+    from crawlee import Request
+    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+    from ._types import AiHtmlExtractor, AiUsageStats, ExtractFunction, TSchema
+
+
+logger = getLogger(__name__)
+
+
+@docs_group('Crawlers')
+class AiCrawler(AbstractHttpCrawler[AiCrawlingContext, Selector, Selector]):
+    """A web crawler that extracts structured data from pages using an AI model.
+
+    Builds on `AbstractHttpCrawler` and parses responses with Parsel, so the request handler has both the usual
+    Parsel `selector` and the AI-powered `extract` helper: pass a Pydantic model and get a validated instance back.
+
+    The model layer is Pydantic AI, so any provider it supports (OpenAI, Anthropic, Gemini, Ollama, ...) works
+    through the `model` argument. The default extractor is an `AiDirectExtractor`: each page is distilled and sent
+    to the model in one call. For cached CSS-selector extraction at near-zero LLM cost, pass an `AiSelectorExtractor`
+    through the `extractor` argument.
+
+    Warning:
+        This is an experimental crawler. Its public API may change in future versions.
+
+    ### Usage
+
+    ```python
+    from pydantic import BaseModel
+    from pydantic_ai.models.openai import OpenAIChatModel
+    from pydantic_ai.providers.openai import OpenAIProvider
+
+    from crawlee.crawlers import AiCrawler, AiCrawlingContext
+
+
+    class Article(BaseModel):
+        title: str
+        author: str | None
+
+
+    crawler = AiCrawler(model=OpenAIChatModel('gpt-5.4-nano', provider=OpenAIProvider(api_key='...')))
+
+
+    @crawler.router.default_handler
+    async def request_handler(context: AiCrawlingContext) -> None:
+        article = await context.extract(Article)
+        await context.push_data(article.model_dump())
+
+
+    await crawler.run(['https://crawlee.dev/'])
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str | Model | None = None,
+        extractor: AiHtmlExtractor | None = None,
+        **kwargs: Unpack[HttpCrawlerOptions[AiCrawlingContext]],
+    ) -> None:
+        """Initialize a new instance.
+
+        Args:
+            model: The model used for extraction, given to the default extractor (`AiDirectExtractor`). A
+                provider-prefixed name (e.g. `'openai:gpt-5.4-nano'`) or a Pydantic AI `Model` instance. When given
+                as a string, the provider reads credentials from its environment variable (e.g. `OPENAI_API_KEY`).
+                Pass a `Model` instance to supply them explicitly. Provide exactly one of `model` or `extractor`.
+            extractor: A pre-configured `AiHtmlExtractor`, for full control over the distiller, instructions,
+                caching, usage limits, and model fallback. Pass an `AiSelectorExtractor` here for cached-selector
+                extraction. Provide exactly one of `model` or `extractor`.
+            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
+        """
+        if (model is None) == (extractor is None):
+            raise ValueError('Provide exactly one of `model` or `extractor`.')
+
+        # Use identity checks, not truthiness: a valid extractor defining `__len__`/`__bool__` may evaluate as falsy.
+        if extractor is None:
+            if model is None:  # Unreachable after the check above; narrows `model` for type checkers.
+                raise ValueError('Provide exactly one of `model` or `extractor`.')
+            extractor = AiDirectExtractor(model)
+
+        # Emit the experimental-API warning; `stacklevel=2` attributes it to the code constructing the crawler.
+        warnings.warn(
+            'The AiCrawler is experimental and its public API may change in future releases.',
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+        self._ai_usage = extractor.ai_usage
+        self._extractor = extractor
+
+        async def final_step(
+            context: ParsedHttpCrawlingContext[Selector],
+        ) -> AsyncGenerator[AiCrawlingContext, None]:
+            """Enhance `ParsedHttpCrawlingContext[Selector]` with the `extract` helper and `ai_usage`."""
+            parsel_context = ParselCrawlingContext.from_parsed_http_crawling_context(context)
+            yield AiCrawlingContext.from_parsel_crawling_context(
+                parsel_context,
+                extract=self._create_extract_function(parsel_context.selector, parsel_context.request),
+                ai_usage=self._ai_usage,
+            )
+
+        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)
+
+        # If the extractor is an async context manager, register it with the crawler so it is entered and exited
+        # around the crawl. A missing or `None` `_additional_context_managers` value is treated as empty.
+        if isinstance(extractor, AbstractAsyncContextManager):
+            kwargs['_additional_context_managers'] = [
+                *(kwargs.get('_additional_context_managers') or []),
+                extractor,
+            ]
+        super().__init__(
+            parser=ParselParser(),
+            **kwargs,
+        )
+
+    @property
+    def extractor(self) -> AiHtmlExtractor:
+        """The extractor used to turn pages into structured data."""
+        return self._extractor
+
+    @property
+    def ai_usage(self) -> AiUsageStats:
+        """Accumulated token usage across extraction calls."""
+        return self._ai_usage
+
+    def _create_extract_function(self, selector: Selector, request: Request) -> ExtractFunction:
+        """Build an `extract` helper bound to the page's parsed tree.
+
+        When the caller omits `cache_tag`, it defaults to `request.label` so an `AiSelectorExtractor` buckets
+        selectors per route without extra wiring. An explicit `cache_tag` overrides this.
+        """
+
+        async def extract(
+            schema: type[TSchema],
+            *,
+            scope: str | None = None,
+            cache_tag: str | None = None,
+            additional_instructions: str | None = None,
+        ) -> TSchema:
+            # `AiHtmlExtractor.extract` accepts a Selector directly, so the already-parsed tree is handed over
+            # without a serialize round trip.
+            return await self._extractor.extract(
+                selector,
+                schema,
+                scope=scope,
+                cache_tag=cache_tag if cache_tag is not None else request.label,
+                additional_instructions=additional_instructions,
+            )
+
+        return extract
diff --git a/src/crawlee/crawlers/_ai/_ai_crawling_context.py b/src/crawlee/crawlers/_ai/_ai_crawling_context.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, fields
+from typing import TYPE_CHECKING
+
+from crawlee._utils.docs import docs_group
+from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+    from ._types import AiUsageStats, ExtractFunction
+
+
+@dataclass(frozen=True)
+@docs_group('Crawling contexts')
+class AiCrawlingContext(ParselCrawlingContext):
+    """Crawling context handed to `AiCrawler` request handlers.
+
+    Being a `ParselCrawlingContext` subclass, it keeps everything the Parsel context offers and adds two
+    AI-specific fields: the `extract` helper and the `ai_usage` stats. A handler can therefore combine
+    hand-written selectors with AI extraction while processing a single page.
+    """
+
+    extract: ExtractFunction
+    """Extract a structured Pydantic model from the page using the configured AI extractor."""
+
+    ai_usage: AiUsageStats
+    """The cumulative token usage stats of the extractor across calls in this crawl."""
+
+    @classmethod
+    def from_parsel_crawling_context(
+        cls,
+        context: ParselCrawlingContext,
+        *,
+        extract: ExtractFunction,
+        ai_usage: AiUsageStats,
+    ) -> Self:
+        """Build an instance from an existing `ParselCrawlingContext`.
+
+        Every dataclass field of `context` is carried over; `extract` and `ai_usage` fill the two new fields.
+        """
+        inherited = {spec.name: getattr(context, spec.name) for spec in fields(context)}
+        return cls(**inherited, extract=extract, ai_usage=ai_usage)