-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbasic-crawl.ts
More file actions
103 lines (89 loc) · 2.86 KB
/
basic-crawl.ts
File metadata and controls
103 lines (89 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/**
* Crawl Example
*
* This example demonstrates:
* - Using the Crawl SDK to discover and scrape multiple pages
* - Configuring crawl scope (domain, subdomain, or path)
* - Setting limits and depth for crawling
* - Using sitemap discovery
* - Filtering pages with include/exclude paths
*
 * Site: Y Combinator jobs page (https://www.ycombinator.com/jobs)
*/
import 'dotenv/config';
import { Crawl } from 'maxun-sdk';
/**
 * Creates a crawl robot, runs it, and prints a summary of the crawled pages.
 *
 * Reads MAXUN_API_KEY and MAXUN_BASE_URL from the environment to configure
 * the SDK client. On any failure, logs the error (and details when present)
 * and exits the process with code 1.
 */
async function main(): Promise<void> {
  const crawler = new Crawl({
    apiKey: process.env.MAXUN_API_KEY!,
    baseUrl: process.env.MAXUN_BASE_URL!
  });

  try {
    // Crawl the whole domain, capped at 10 pages and depth 3, honouring
    // robots.txt and using sitemap discovery when available.
    const robot = await crawler.create(
      'YC Companies Crawler',
      'https://www.ycombinator.com/jobs',
      {
        mode: 'domain',
        limit: 10,
        maxDepth: 3,
        includePaths: [],
        excludePaths: [],
        useSitemap: true,
        followLinks: true,
        respectRobots: true
      }
    );
    console.log(`Crawl robot created: ${robot.id}`);

    console.log('Starting crawl...');
    const result = await robot.run();

    console.log('\n=== Crawl Completed ===');
    console.log('Status:', result.status);
    console.log('Run ID:', result.runId);

    if (result.data.crawlData) {
      const crawlData = result.data.crawlData;

      if (typeof crawlData === 'object' && !Array.isArray(crawlData)) {
        // crawlData appears to be keyed by group; flatten every array value
        // into a single page list. (Exact schema comes from the SDK —
        // TODO confirm against maxun-sdk docs.)
        const allPages: any[] = [];
        for (const value of Object.values(crawlData)) {
          if (Array.isArray(value)) {
            allPages.push(...value);
          }
        }

        console.log('Pages crawled:', allPages.length);
        console.log('\nCrawled URLs:');
        allPages.forEach((page: any, i: number) => {
          const url = page?.metadata?.url || page?.url || `Page ${i + 1}`;
          console.log(`  ${i + 1}. ${url}`);
          // Fix: guard `page` itself with ?. — the url line above already
          // treats page as possibly nullish, so these accesses must too.
          if (page?.metadata?.title) {
            console.log(`     Title: ${page.metadata.title}`);
          }
          if (page?.wordCount) {
            console.log(`     Words: ${page.wordCount}`);
          }
        });
      } else if (Array.isArray(crawlData)) {
        console.log('Pages crawled:', crawlData.length);
        console.log('\nCrawled URLs:');
        crawlData.forEach((page: any, i: number) => {
          const url = page?.metadata?.url || page?.url || `Page ${i + 1}`;
          console.log(`  ${i + 1}. ${url}`);
        });
      } else {
        // Unexpected shape — dump it verbatim so the schema can be inspected.
        console.log('Crawl data format:', typeof crawlData);
        console.log('Crawl data:', JSON.stringify(crawlData, null, 2));
      }
    } else {
      console.log('No crawl data found');
      console.log('Result data keys:', Object.keys(result.data));
    }
  } catch (error: unknown) {
    // Fix: catch as `unknown` (strict-mode idiom) and narrow before use
    // instead of `error: any`.
    const message = error instanceof Error ? error.message : String(error);
    console.error('Error:', message);
    const details = (error as { details?: unknown } | null)?.details;
    if (details) {
      console.error('Details:', details);
    }
    process.exit(1);
  }
}
// Fail fast on missing configuration before any SDK call. Fix: the client
// also dereferences MAXUN_BASE_URL with a non-null assertion, so validate
// it here too instead of failing later inside the SDK.
if (!process.env.MAXUN_API_KEY) {
  console.error('Error: MAXUN_API_KEY environment variable is required');
  process.exit(1);
}
if (!process.env.MAXUN_BASE_URL) {
  console.error('Error: MAXUN_BASE_URL environment variable is required');
  process.exit(1);
}

// `void` marks the promise as intentionally un-awaited; main() handles its
// own errors and sets the exit code.
void main();