-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbasic-crawl.ts
More file actions
103 lines (89 loc) · 2.86 KB
/
basic-crawl.ts
File metadata and controls
103 lines (89 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/**
* Crawl Example
*
* This example demonstrates:
* - Using the Crawl SDK to discover and scrape multiple pages
* - Configuring crawl scope (domain, subdomain, or path)
* - Setting limits and depth for crawling
* - Using sitemap discovery
* - Filtering pages with include/exclude paths
*
 * Site: Y Combinator jobs page (https://www.ycombinator.com/jobs)
*/
import 'dotenv/config';
import { Crawl } from 'maxun-sdk';
/**
 * Creates a crawl robot, runs it, and prints a summary of the crawled pages.
 *
 * Reads MAXUN_API_KEY and MAXUN_BASE_URL from the environment to configure
 * the SDK client. On any failure, logs the error (and details when present)
 * and exits the process with code 1.
 */
async function main(): Promise<void> {
  const crawler = new Crawl({
    apiKey: process.env.MAXUN_API_KEY!,
    baseUrl: process.env.MAXUN_BASE_URL!
  });

  try {
    // Crawl the whole domain, capped at 10 pages and depth 3, honouring
    // robots.txt and using sitemap discovery when available.
    const robot = await crawler.create(
      'YC Companies Crawler',
      'https://www.ycombinator.com/jobs',
      {
        mode: 'domain',
        limit: 10,
        maxDepth: 3,
        includePaths: [],
        excludePaths: [],
        useSitemap: true,
        followLinks: true,
        respectRobots: true
      }
    );
    console.log(`Crawl robot created: ${robot.id}`);

    console.log('Starting crawl...');
    const result = await robot.run();

    console.log('\n=== Crawl Completed ===');
    console.log('Status:', result.status);
    console.log('Run ID:', result.runId);

    if (result.data.crawlData) {
      const crawlData = result.data.crawlData;

      if (typeof crawlData === 'object' && !Array.isArray(crawlData)) {
        // crawlData appears to be keyed by group; flatten every array value
        // into a single page list. (Exact schema comes from the SDK —
        // TODO confirm against maxun-sdk docs.)
        const allPages: any[] = [];
        for (const value of Object.values(crawlData)) {
          if (Array.isArray(value)) {
            allPages.push(...value);
          }
        }

        console.log('Pages crawled:', allPages.length);
        console.log('\nCrawled URLs:');
        allPages.forEach((page: any, i: number) => {
          const url = page?.metadata?.url || page?.url || `Page ${i + 1}`;
          console.log(`  ${i + 1}. ${url}`);
          // Fix: guard `page` itself with ?. — the url line above already
          // treats page as possibly nullish, so these accesses must too.
          if (page?.metadata?.title) {
            console.log(`     Title: ${page.metadata.title}`);
          }
          if (page?.wordCount) {
            console.log(`     Words: ${page.wordCount}`);
          }
        });
      } else if (Array.isArray(crawlData)) {
        console.log('Pages crawled:', crawlData.length);
        console.log('\nCrawled URLs:');
        crawlData.forEach((page: any, i: number) => {
          const url = page?.metadata?.url || page?.url || `Page ${i + 1}`;
          console.log(`  ${i + 1}. ${url}`);
        });
      } else {
        // Unexpected shape — dump it verbatim so the schema can be inspected.
        console.log('Crawl data format:', typeof crawlData);
        console.log('Crawl data:', JSON.stringify(crawlData, null, 2));
      }
    } else {
      console.log('No crawl data found');
      console.log('Result data keys:', Object.keys(result.data));
    }
  } catch (error: unknown) {
    // Fix: catch as `unknown` (strict-mode idiom) and narrow before use
    // instead of `error: any`.
    const message = error instanceof Error ? error.message : String(error);
    console.error('Error:', message);
    const details = (error as { details?: unknown } | null)?.details;
    if (details) {
      console.error('Details:', details);
    }
    process.exit(1);
  }
}
// Fail fast on missing configuration before any SDK call. Fix: the client
// also dereferences MAXUN_BASE_URL with a non-null assertion, so validate
// it here too instead of failing later inside the SDK.
if (!process.env.MAXUN_API_KEY) {
  console.error('Error: MAXUN_API_KEY environment variable is required');
  process.exit(1);
}
if (!process.env.MAXUN_BASE_URL) {
  console.error('Error: MAXUN_BASE_URL environment variable is required');
  process.exit(1);
}

// `void` marks the promise as intentionally un-awaited; main() handles its
// own errors and sets the exit code.
void main();