forked from CloakHQ/CloakBrowser
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawlee_example.py
More file actions
72 lines (55 loc) · 2.45 KB
/
crawlee_example.py
File metadata and controls
72 lines (55 loc) · 2.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""Crawlee + CloakBrowser: stealth web crawling with PlaywrightCrawler.
Uses a custom BrowserPlugin to swap Crawlee's default Chromium
for CloakBrowser's patched binary with source-level fingerprint patches.
Requires: pip install cloakbrowser "crawlee[playwright]"
"""
import asyncio
from cloakbrowser.config import IGNORE_DEFAULT_ARGS, get_default_stealth_args
from cloakbrowser.download import ensure_binary
from typing_extensions import override
from crawlee.browsers import (
BrowserPool,
PlaywrightBrowserController,
PlaywrightBrowserPlugin,
)
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
class CloakBrowserPlugin(PlaywrightBrowserPlugin):
    """Browser plugin that launches CloakBrowser's patched Chromium.

    Only ``new_browser`` is overridden; everything else is inherited from
    ``PlaywrightBrowserPlugin``.
    """

    @override
    async def new_browser(self) -> PlaywrightBrowserController:
        """Launch the CloakBrowser binary and wrap it in a browser controller.

        The plugin's launch options are copied and adjusted before launching:
        ``executable_path`` and ``chromium_sandbox`` are dropped, the
        CloakBrowser stealth args are appended after any configured ``args``,
        and a configured ``ignore_default_args`` is combined with
        CloakBrowser's own value instead of being passed twice.

        Returns:
            A ``PlaywrightBrowserController`` around the launched browser,
            created with ``max_open_pages_per_browser=1`` and
            ``header_generator=None``.

        Raises:
            RuntimeError: If the plugin's Playwright instance is not set.
        """
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        binary_path = ensure_binary()
        stealth_args = get_default_stealth_args()

        # Work on a copy so the plugin's stored launch options are not mutated.
        launch_options = dict(self._browser_launch_options)
        # The executable is always the CloakBrowser binary, passed explicitly
        # below, so any configured path must not be forwarded as well.
        launch_options.pop('executable_path', None)
        launch_options.pop('chromium_sandbox', None)

        # `or ()` tolerates an explicit `args=None` in the launch options.
        existing_args = list(launch_options.pop('args', None) or ())
        # Stealth args go last, after any user-provided args.
        launch_options['args'] = [*existing_args, *stealth_args]

        # Remove a configured `ignore_default_args` from the forwarded options;
        # otherwise it would collide with the explicit keyword below and raise
        # "got multiple values for keyword argument".
        configured_ignore = launch_options.pop('ignore_default_args', None)
        if configured_ignore is True:
            # True asks Playwright to drop all of its default args.
            ignore_default_args = True
        elif isinstance(configured_ignore, (list, tuple)):
            # NOTE(review): assumes IGNORE_DEFAULT_ARGS is a sequence of flag
            # strings - confirm against cloakbrowser.config.
            ignore_default_args = [*IGNORE_DEFAULT_ARGS, *configured_ignore]
        else:
            ignore_default_args = IGNORE_DEFAULT_ARGS

        return PlaywrightBrowserController(
            browser=await self._playwright.chromium.launch(
                executable_path=binary_path,
                ignore_default_args=ignore_default_args,
                **launch_options,
            ),
            max_open_pages_per_browser=1,
            # CloakBrowser handles fingerprints at the binary level.
            header_generator=None,
        )
async def main() -> None:
    """Run a small crawl of https://example.com through CloakBrowserPlugin.

    Builds a ``PlaywrightCrawler`` capped at 10 requests whose browser pool
    uses ``CloakBrowserPlugin``, registers a default handler that stores each
    page's URL and title and then calls ``enqueue_links()``, and runs the
    crawler from the single start URL.
    """
    stealth_pool = BrowserPool(plugins=[CloakBrowserPlugin()])
    crawler = PlaywrightCrawler(
        browser_pool=stealth_pool,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handle_page(context: PlaywrightCrawlingContext) -> None:
        """Log the request, store the page's URL and title, enqueue links."""
        context.log.info(f'Processing {context.request.url} ...')
        page_title = await context.page.title()
        record = {'url': context.request.url, 'title': page_title}
        await context.push_data(record)
        await context.enqueue_links()

    start_urls = ['https://example.com']
    await crawler.run(start_urls)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    asyncio.run(main())