Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,15 @@ dependencies = [
]

[project.optional-dependencies]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
all = ["crawlee[adaptive-crawler,ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
"scikit-learn>=1.6.0",
"apify_fingerprint_datapoints>=0.0.3",
"browserforge>=1.2.4"
]
ai = ["pydantic-ai-slim[openai]>=1.106.0", "parsel>=1.10.0", "lxml[html_clean]>=5.2.0"]
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
Expand Down
42 changes: 42 additions & 0 deletions src/crawlee/crawlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,36 @@
StagehandPreNavCrawlingContext,
)

with _try_import(
__name__,
'AiCleanHtmlDistiller',
'AiCrawler',
'AiCrawlingContext',
'AiDirectExtractor',
'AiHtmlDistiller',
'AiHtmlExtractor',
'AiSelectorExtractor',
'AiSkeletonDistiller',
'AiUsageStats',
'BaseAiHtmlDistiller',
'BaseAiHtmlExtractor',
'get_basic_ai_cleaner',
):
from ._ai import (
AiCleanHtmlDistiller,
AiCrawler,
AiCrawlingContext,
AiDirectExtractor,
AiHtmlDistiller,
AiHtmlExtractor,
AiSelectorExtractor,
AiSkeletonDistiller,
AiUsageStats,
BaseAiHtmlDistiller,
BaseAiHtmlExtractor,
get_basic_ai_cleaner,
)


__all__ = [
'AbstractHttpCrawler',
Expand All @@ -74,6 +104,17 @@
'AdaptivePlaywrightCrawlingContext',
'AdaptivePlaywrightPostNavCrawlingContext',
'AdaptivePlaywrightPreNavCrawlingContext',
'AiCleanHtmlDistiller',
'AiCrawler',
'AiCrawlingContext',
'AiDirectExtractor',
'AiHtmlDistiller',
'AiHtmlExtractor',
'AiSelectorExtractor',
'AiSkeletonDistiller',
'AiUsageStats',
'BaseAiHtmlDistiller',
'BaseAiHtmlExtractor',
'BasicCrawler',
'BasicCrawlerOptions',
'BasicCrawlingContext',
Expand All @@ -99,4 +140,5 @@
'StagehandCrawlingContext',
'StagehandPostNavCrawlingContext',
'StagehandPreNavCrawlingContext',
'get_basic_ai_cleaner',
]
42 changes: 42 additions & 0 deletions src/crawlee/crawlers/_ai/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

_install_import_hook(__name__)

# The following imports are wrapped in try_import to handle optional dependencies (the `ai` extra),
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'AiCrawler'):
from ._ai_crawler import AiCrawler
with _try_import(__name__, 'AiCrawlingContext'):
from ._ai_crawling_context import AiCrawlingContext
with _try_import(__name__, 'BaseAiHtmlExtractor'):
from ._base_extractor import BaseAiHtmlExtractor
with _try_import(__name__, 'AiDirectExtractor'):
from ._direct_extractor import AiDirectExtractor
with _try_import(__name__, 'AiSelectorExtractor'):
from ._selector_extractor import AiSelectorExtractor
with _try_import(__name__, 'BaseAiHtmlDistiller'):
from ._base_distiller import BaseAiHtmlDistiller
with _try_import(__name__, 'AiCleanHtmlDistiller'):
from ._clean_html_distiller import AiCleanHtmlDistiller
with _try_import(__name__, 'AiSkeletonDistiller'):
from ._skeleton_distiller import AiSkeletonDistiller
with _try_import(__name__, 'AiHtmlDistiller', 'AiHtmlExtractor', 'AiUsageStats'):
from ._types import AiHtmlDistiller, AiHtmlExtractor, AiUsageStats
with _try_import(__name__, 'get_basic_ai_cleaner'):
from ._utils import get_basic_ai_cleaner

__all__ = [
'AiCleanHtmlDistiller',
'AiCrawler',
'AiCrawlingContext',
'AiDirectExtractor',
'AiHtmlDistiller',
'AiHtmlExtractor',
'AiSelectorExtractor',
'AiSkeletonDistiller',
'AiUsageStats',
'BaseAiHtmlDistiller',
'BaseAiHtmlExtractor',
'get_basic_ai_cleaner',
]
173 changes: 173 additions & 0 deletions src/crawlee/crawlers/_ai/_ai_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
from __future__ import annotations

import warnings
from contextlib import AbstractAsyncContextManager
from logging import getLogger
from typing import TYPE_CHECKING

from parsel import Selector

from crawlee._utils.docs import docs_group
from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
from crawlee.crawlers._parsel._parsel_parser import ParselParser

from ._ai_crawling_context import AiCrawlingContext
from ._direct_extractor import AiDirectExtractor

if TYPE_CHECKING:
from collections.abc import AsyncGenerator

from pydantic_ai.models import Model
from typing_extensions import Unpack

from crawlee import Request
from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext

from ._types import AiHtmlExtractor, AiUsageStats, ExtractFunction, TSchema


logger = getLogger(__name__)


@docs_group('Crawlers')
class AiCrawler(AbstractHttpCrawler[AiCrawlingContext, Selector, Selector]):
    """A web crawler that extracts structured data from pages using an AI model.

    Builds on `AbstractHttpCrawler` and parses responses with Parsel, so the request handler has both the usual
    Parsel `selector` and the AI-powered `extract` helper: pass a Pydantic model and get a validated instance back.

    The model layer is Pydantic AI, so any provider it supports (OpenAI, Anthropic, Gemini, Ollama, ...) works
    through the `model` argument. The default extractor is an `AiDirectExtractor`: each page is distilled and sent
    to the model in one call. For cached CSS-selector extraction at near-zero LLM cost, pass an `AiSelectorExtractor`
    through the `extractor` argument.

    Warning:
        This is an experimental crawler. Its public API may change in future versions.

    ### Usage

    ```python
    from pydantic import BaseModel
    from pydantic_ai.models.openai import OpenAIChatModel
    from pydantic_ai.providers.openai import OpenAIProvider

    from crawlee.crawlers import AiCrawler, AiCrawlingContext


    class Article(BaseModel):
        title: str
        author: str | None


    crawler = AiCrawler(model=OpenAIChatModel('gpt-5.4-nano', provider=OpenAIProvider(api_key='...')))


    @crawler.router.default_handler
    async def request_handler(context: AiCrawlingContext) -> None:
        article = await context.extract(Article)
        await context.push_data(article.model_dump())


    await crawler.run(['https://crawlee.dev/'])
    ```
    """

    def __init__(
        self,
        *,
        model: str | Model | None = None,
        extractor: AiHtmlExtractor | None = None,
        **kwargs: Unpack[HttpCrawlerOptions[AiCrawlingContext]],
    ) -> None:
        """Initialize a new instance.

        Args:
            model: The model used for extraction, given to the default extractor (`AiDirectExtractor`). A
                provider-prefixed name (e.g. `'openai:gpt-5.4-nano'`) or a Pydantic AI `Model` instance. When given
                as a string, the provider reads credentials from its environment variable (e.g. `OPENAI_API_KEY`).
                Pass a `Model` instance to supply them explicitly. Provide exactly one of `model` or `extractor`.
            extractor: A pre-configured `AiHtmlExtractor`, for full control over the distiller, instructions,
                caching, usage limits, and model fallback. Pass an `AiSelectorExtractor` here for cached-selector
                extraction. Provide exactly one of `model` or `extractor`.
            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.

        Raises:
            ValueError: If both or neither of `model` and `extractor` are provided.
        """
        if (model is None) == (extractor is None):
            raise ValueError('Provide exactly one of `model` or `extractor`.')

        if extractor is None and model is not None:
            extractor = AiDirectExtractor(model)

        # Not reachable after the two checks above; it exists so type checkers can narrow `extractor`. It must be
        # an identity check: a truthiness test would reject a valid extractor whose class defines `__len__` or
        # `__bool__` and happens to evaluate as falsy.
        if extractor is None:
            raise ValueError('Extractor initialization failed; check the provided model or extractor configuration.')

        # Emitted on every construction; `stacklevel=2` attributes the warning to the code creating the crawler.
        warnings.warn(
            'The AiCrawler is experimental and its public API may change in future releases.',
            category=UserWarning,
            stacklevel=2,
        )

        self._ai_usage = extractor.ai_usage
        self._extractor = extractor

        async def final_step(
            context: ParsedHttpCrawlingContext[Selector],
        ) -> AsyncGenerator[AiCrawlingContext, None]:
            """Enhance `ParsedHttpCrawlingContext[Selector]` with the `extract` helper and `ai_usage`."""
            parsel_context = ParselCrawlingContext.from_parsed_http_crawling_context(context)
            yield AiCrawlingContext.from_parsel_crawling_context(
                parsel_context,
                extract=self._create_extract_function(parsel_context.selector, parsel_context.request),
                ai_usage=self._ai_usage,
            )

        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)

        # If the extractor is an async context manager, add it to the crawler's additional context managers so it's
        # properly entered and exited around the crawl. Managers supplied by the caller are kept and come first;
        # `or ()` tolerates the key being present with a `None` value.
        if isinstance(extractor, AbstractAsyncContextManager):
            kwargs['_additional_context_managers'] = [
                *(kwargs.get('_additional_context_managers') or ()),
                extractor,
            ]

        super().__init__(
            parser=ParselParser(),
            **kwargs,
        )

    @property
    def extractor(self) -> AiHtmlExtractor:
        """The extractor used to turn pages into structured data."""
        return self._extractor

    @property
    def ai_usage(self) -> AiUsageStats:
        """Accumulated token usage across extraction calls."""
        return self._ai_usage

    def _create_extract_function(self, selector: Selector, request: Request) -> ExtractFunction:
        """Build an `extract` helper bound to the page's parsed tree.

        When the caller omits `cache_tag`, it defaults to `request.label` so an `AiSelectorExtractor` buckets
        selectors per route without extra wiring. An explicit `cache_tag` overrides this.

        Args:
            selector: The parsed page that every call of the returned helper extracts from.
            request: The request being handled; its `label` is the fallback `cache_tag`.

        Returns:
            An async callable that forwards to the configured extractor's `extract` method.
        """

        async def extract(
            schema: type[TSchema],
            *,
            scope: str | None = None,
            cache_tag: str | None = None,
            additional_instructions: str | None = None,
        ) -> TSchema:
            # `AiHtmlExtractor.extract` accepts a Selector directly, so the already-parsed tree is handed over
            # without a serialize round trip.
            return await self._extractor.extract(
                selector,
                schema,
                scope=scope,
                cache_tag=cache_tag if cache_tag is not None else request.label,
                additional_instructions=additional_instructions,
            )

        return extract
44 changes: 44 additions & 0 deletions src/crawlee/crawlers/_ai/_ai_crawling_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import annotations

from dataclasses import dataclass, fields
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group
from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext

if TYPE_CHECKING:
from typing_extensions import Self

from ._types import AiUsageStats, ExtractFunction


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class AiCrawlingContext(ParselCrawlingContext):
    """Crawling context handed to request handlers of the `AiCrawler`.

    Being a `ParselCrawlingContext` subclass, it keeps the Parsel `selector` (and `enqueue_links`) available and
    adds the AI-powered `extract` helper, so a handler can combine inexpensive hand-written selectors with AI
    extraction on one page.
    """

    extract: ExtractFunction
    """Extract a structured Pydantic model from the page using the configured AI extractor."""

    ai_usage: AiUsageStats
    """The cumulative token usage stats of the extractor across calls in this crawl."""

    @classmethod
    def from_parsel_crawling_context(
        cls,
        context: ParselCrawlingContext,
        *,
        extract: ExtractFunction,
        ai_usage: AiUsageStats,
    ) -> Self:
        """Build an instance by copying every dataclass field of `context` and adding the AI members.

        Args:
            context: The Parsel context whose field values are carried over.
            extract: The extraction helper to expose as `extract`.
            ai_usage: The usage stats object to expose as `ai_usage`.

        Returns:
            A new context holding the copied fields plus `extract` and `ai_usage`.
        """
        # Snapshot the values of all dataclass fields declared on the source context.
        inherited_values = {spec.name: getattr(context, spec.name) for spec in fields(context)}
        return cls(**inherited_values, extract=extract, ai_usage=ai_usage)
Loading
Loading