Skip to content

Commit f9220f1

Browse files
committed
feat(google): add search, suggest, news, and trends adapters
Four new commands under `google`: - search: browser-based DOM extraction from google.com/search - suggest: public JSON API (suggestqueries.google.com) - news: public RSS feed (top stories + keyword search) - trends: public RSS feed (daily trending searches by region) Shared RSS parser in utils.ts with attribute/CDATA support. Unit tests for parseRssItems, E2E tests with network skip guards.
1 parent 0b71c6c commit f9220f1

8 files changed

Lines changed: 464 additions & 0 deletions

File tree

src/clis/google/news.ts

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/**
2+
* Google News via public RSS feed.
3+
* Supports top stories (no keyword) and search (with keyword).
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
import { parseRssItems } from './utils.js';
9+
10+
cli({
11+
site: 'google',
12+
name: 'news',
13+
description: 'Get Google News headlines',
14+
strategy: Strategy.PUBLIC,
15+
browser: false,
16+
args: [
17+
{ name: 'keyword', positional: true, help: 'Search query (omit for top stories)' },
18+
{ name: 'limit', type: 'int', default: 10, help: 'Number of results' },
19+
{ name: 'lang', default: 'en', help: 'Language short code (e.g. en, zh)' },
20+
{ name: 'region', default: 'US', help: 'Region code (e.g. US, CN)' },
21+
],
22+
columns: ['title', 'source', 'date', 'url'],
23+
func: async (_page, args) => {
24+
const limit = Math.max(1, Math.min(Number(args.limit), 100));
25+
const lang = encodeURIComponent(args.lang);
26+
const region = encodeURIComponent(args.region);
27+
const ceid = `${args.region}:${args.lang}`;
28+
29+
// Top stories or search
30+
const base = args.keyword
31+
? `https://news.google.com/rss/search?q=${encodeURIComponent(args.keyword)}&hl=${lang}&gl=${region}&ceid=${ceid}`
32+
: `https://news.google.com/rss?hl=${lang}&gl=${region}&ceid=${ceid}`;
33+
34+
const resp = await fetch(base);
35+
if (!resp.ok) {
36+
throw new CliError('FETCH_ERROR', `HTTP ${resp.status}`, 'Check your network connection');
37+
}
38+
39+
const xml = await resp.text();
40+
const items = parseRssItems(xml, ['title', 'link', 'pubDate', 'source']);
41+
42+
if (!items.length) {
43+
throw new CliError('NOT_FOUND', 'No news articles found', 'Try a different keyword or region');
44+
}
45+
46+
return items.slice(0, limit).map(item => {
47+
// Extract source: prefer <source> element, fallback to parsing title
48+
let title = item['title'] || '';
49+
let source = item['source'] || '';
50+
if (!source) {
51+
const idx = title.lastIndexOf(' - ');
52+
if (idx !== -1) {
53+
source = title.slice(idx + 3);
54+
title = title.slice(0, idx);
55+
}
56+
}
57+
58+
return {
59+
title,
60+
source,
61+
date: item['pubDate'] || '',
62+
url: item['link'] || '',
63+
};
64+
});
65+
},
66+
});

src/clis/google/search.ts

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
/**
 * Google Web Search via browser DOM extraction.
 * Uses browser mode to navigate google.com and extract results from the DOM.
 *
 * Extraction strategy (2026-03): Google no longer uses `.g` class containers.
 * Instead, we find all `a` tags containing `h3` within `#rso`, then walk up
 * to the result container (`div.tF2Cxc` or closest `div[data-hveid]`) to find
 * snippets. This approach is resilient to class name changes.
 *
 * Returns a mix of row types: 'snippet' (featured snippet), 'result'
 * (standard organic result), and 'paa' (People Also Ask question).
 */

import { cli, Strategy } from '../../registry.js';
import { CliError } from '../../errors.js';

cli({
  site: 'google',
  name: 'search',
  description: 'Search Google',
  domain: 'google.com',
  strategy: Strategy.COOKIE,
  browser: true,
  args: [
    { name: 'keyword', positional: true, required: true, help: 'Search query' },
    { name: 'limit', type: 'int', default: 10, help: 'Number of results (1-100)' },
    { name: 'lang', default: 'en', help: 'Language short code (e.g. en, zh)' },
  ],
  columns: ['type', 'title', 'url', 'snippet'],
  func: async (page, args) => {
    // Clamp to [1, 100]; `num` also asks Google for that many results per page.
    const limit = Math.max(1, Math.min(Number(args.limit), 100));
    const keyword = encodeURIComponent(args.keyword);
    const lang = encodeURIComponent(args.lang);
    const url = `https://www.google.com/search?q=${keyword}&hl=${lang}&num=${limit}`;

    await page.goto(url);
    // Fixed settle delay for async-rendered result blocks.
    // NOTE(review): presumably seconds — confirm against the page API.
    await page.wait(2);

    // Extraction runs as a string in the page context (hence var/ES5 style
    // and the double-escaped regex below). The IIFE returns an array of
    // { type, title, url, snippet } plain objects.
    const results = await page.evaluate(`
      (function() {
        var results = [];
        var seenUrls = {};
        var rso = document.querySelector('#rso');
        if (!rso) return results;

        // -- Featured snippet (scoped to #rso to avoid matching unrelated elements) --
        var featuredEl = rso.querySelector('.xpdopen .hgKElc')
          || rso.querySelector('.IZ6rdc');
        if (featuredEl) {
          var parentBlock = featuredEl.closest('[data-hveid]') || featuredEl.parentElement;
          var fLink = parentBlock ? parentBlock.querySelector('a[href]') : null;
          var fUrl = fLink ? fLink.href : '';
          if (fUrl) seenUrls[fUrl] = true;
          results.push({
            type: 'snippet',
            title: featuredEl.textContent.trim().slice(0, 200),
            url: fUrl,
            snippet: '',
          });
        }

        // -- Standard search results --
        // Strategy: find all links containing h3 within #rso
        var allLinks = rso.querySelectorAll('a');
        for (var i = 0; i < allLinks.length; i++) {
          var link = allLinks[i];
          var h3 = link.querySelector('h3');
          if (!h3) continue;

          var href = link.href || '';
          // Skip non-http, Google internal links, and duplicates
          if (!href.match(/^https?:\\/\\//)) continue;
          if (href.indexOf('google.com/search') !== -1) continue;
          if (seenUrls[href]) continue;
          seenUrls[href] = true;

          // Walk up to find result container for snippet extraction
          var container = link;
          for (var j = 0; j < 6; j++) {
            if (container.parentElement && container.parentElement !== rso) {
              container = container.parentElement;
            }
            // Stop at a known result boundary
            if (container.getAttribute && container.getAttribute('data-hveid')) break;
          }

          // Find snippet: look for descriptive text, skip breadcrumbs and metadata
          var snippetText = '';
          var titleText = h3.textContent.trim();
          var candidates = container.querySelectorAll('span, div');
          for (var k = 0; k < candidates.length; k++) {
            var el = candidates[k];
            if (el.querySelector('h3') || el.querySelector('a[href]')) continue;
            var text = el.textContent.trim();
            if (text.length < 40 || text.length > 500) continue;
            if (text === titleText) continue;
            // Skip URL breadcrumbs (e.g. "https://example.com › path..." or "Site Namehttps://...")
            if (text.indexOf('\u203A') !== -1) continue;
            if (new RegExp('https?://').test(text.slice(0, 60))) continue;
            snippetText = text;
            break;
          }

          results.push({
            type: 'result',
            title: h3.textContent.trim(),
            url: href,
            snippet: snippetText.slice(0, 300),
          });
        }

        // -- People Also Ask --
        var paaContainers = document.querySelectorAll('[data-sgrd="true"]');
        for (var i = 0; i < paaContainers.length; i++) {
          var questionEl = paaContainers[i].querySelector('span.CSkcDe');
          if (questionEl) {
            results.push({
              type: 'paa',
              title: questionEl.textContent.trim(),
              url: '',
              snippet: '',
            });
          }
        }

        return results;
      })()
    `);

    // An empty array here usually means a CAPTCHA/consent page replaced #rso.
    if (!Array.isArray(results) || results.length === 0) {
      throw new CliError('NOT_FOUND', 'No search results found', 'Try a different keyword or check for CAPTCHA');
    }

    return results;
  },
});

src/clis/google/suggest.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/**
2+
* Google Search Suggestions via public JSON API.
3+
* Uses suggestqueries.google.com with client=firefox for pure JSON (not JSONP).
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
9+
cli({
10+
site: 'google',
11+
name: 'suggest',
12+
description: 'Get Google search suggestions',
13+
strategy: Strategy.PUBLIC,
14+
browser: false,
15+
args: [
16+
{ name: 'keyword', positional: true, required: true, help: 'Search query' },
17+
{ name: 'lang', default: 'zh-CN', help: 'Language code' },
18+
],
19+
columns: ['suggestion'],
20+
func: async (_page, args) => {
21+
const keyword = encodeURIComponent(args.keyword);
22+
const lang = encodeURIComponent(args.lang);
23+
const url = `https://suggestqueries.google.com/complete/search?client=firefox&q=${keyword}&hl=${lang}`;
24+
25+
const resp = await fetch(url);
26+
if (!resp.ok) {
27+
throw new CliError('FETCH_ERROR', `HTTP ${resp.status}`, 'Check your network connection');
28+
}
29+
30+
const data = await resp.json();
31+
// Response format: ["query", ["suggestion1", "suggestion2", ...]]
32+
const suggestions: string[] = Array.isArray(data) && Array.isArray(data[1]) ? data[1] : [];
33+
34+
if (!suggestions.length) {
35+
throw new CliError('NOT_FOUND', 'No suggestions found', 'Try a different keyword');
36+
}
37+
38+
return suggestions.map(s => ({ suggestion: s }));
39+
},
40+
});

src/clis/google/trends.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/**
2+
* Google Trends via public RSS feed.
3+
* Shows daily trending searches for a given region.
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
import { parseRssItems } from './utils.js';
9+
10+
cli({
11+
site: 'google',
12+
name: 'trends',
13+
description: 'Get Google Trends daily trending searches',
14+
strategy: Strategy.PUBLIC,
15+
browser: false,
16+
args: [
17+
{ name: 'region', default: 'US', help: 'Region code (e.g. US, CN, JP)' },
18+
{ name: 'limit', type: 'int', default: 20, help: 'Number of results' },
19+
],
20+
columns: ['title', 'traffic', 'date'],
21+
func: async (_page, args) => {
22+
const limit = Math.max(1, Math.min(Number(args.limit), 100));
23+
const region = encodeURIComponent(args.region);
24+
const url = `https://trends.google.com/trending/rss?geo=${region}`;
25+
26+
const resp = await fetch(url);
27+
if (!resp.ok) {
28+
throw new CliError('FETCH_ERROR', `HTTP ${resp.status}`, 'Check your network connection or region code');
29+
}
30+
31+
const xml = await resp.text();
32+
const items = parseRssItems(xml, ['title', 'pubDate', 'ht:approx_traffic']);
33+
34+
if (!items.length) {
35+
throw new CliError('NOT_FOUND', 'No trending data found', 'Try a different region code');
36+
}
37+
38+
return items.slice(0, limit).map(item => ({
39+
title: item['title'],
40+
traffic: item['ht:approx_traffic'], // raw string e.g. "1,000,000+", no numeric conversion
41+
date: item['pubDate'],
42+
}));
43+
},
44+
});

src/clis/google/utils.test.ts

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
// Unit tests for the shared RSS <item> parser (src/clis/google/utils.ts).
// Pure string fixtures only — no network access required.
import { describe, it, expect } from 'vitest';
import { parseRssItems } from './utils.js';

describe('parseRssItems', () => {
  it('extracts plain text fields', () => {
    const xml = `
      <channel>
        <item><title>Hello</title><link>https://example.com</link></item>
        <item><title>World</title><link>https://test.com</link></item>
      </channel>
    `;
    const items = parseRssItems(xml, ['title', 'link']);
    expect(items).toEqual([
      { title: 'Hello', link: 'https://example.com' },
      { title: 'World', link: 'https://test.com' },
    ]);
  });

  it('handles CDATA-wrapped content', () => {
    // CDATA markers must be stripped, leaving only the inner text.
    const xml = `
      <item><title><![CDATA[Breaking News]]></title><link>https://news.com</link></item>
    `;
    const items = parseRssItems(xml, ['title', 'link']);
    expect(items).toEqual([
      { title: 'Breaking News', link: 'https://news.com' },
    ]);
  });

  it('handles namespaced fields like ht:approx_traffic', () => {
    // The ':' in a namespaced tag name must survive the field-name escaping.
    const xml = `
      <item>
        <title>AI</title>
        <ht:approx_traffic>500,000+</ht:approx_traffic>
        <pubDate>Mon, 20 Mar 2026</pubDate>
      </item>
    `;
    const items = parseRssItems(xml, ['title', 'ht:approx_traffic', 'pubDate']);
    expect(items).toEqual([
      { title: 'AI', 'ht:approx_traffic': '500,000+', pubDate: 'Mon, 20 Mar 2026' },
    ]);
  });

  it('returns empty string for missing fields', () => {
    // Contract: every requested key is present in the record, '' if absent.
    const xml = `<item><title>Test</title></item>`;
    const items = parseRssItems(xml, ['title', 'missing']);
    expect(items).toEqual([{ title: 'Test', missing: '' }]);
  });

  it('handles tags with attributes (e.g. <source url="...">)', () => {
    // Attributes on the opening tag must not prevent the match (news feeds
    // publish <source url="...">Publisher</source>).
    const xml = `
      <item>
        <title><![CDATA[AI reshapes everything - Reuters]]></title>
        <source url="https://reuters.com">Reuters</source>
        <link>https://news.google.com/123</link>
      </item>
    `;
    const items = parseRssItems(xml, ['title', 'source', 'link']);
    expect(items).toEqual([
      { title: 'AI reshapes everything - Reuters', source: 'Reuters', link: 'https://news.google.com/123' },
    ]);
  });

  it('handles mixed CDATA and plain text in the same item', () => {
    const xml = `
      <item>
        <title><![CDATA[Breaking: Major event]]></title>
        <link>https://example.com/article</link>
        <pubDate>Fri, 21 Mar 2026</pubDate>
      </item>
    `;
    const items = parseRssItems(xml, ['title', 'link', 'pubDate']);
    expect(items).toEqual([
      { title: 'Breaking: Major event', link: 'https://example.com/article', pubDate: 'Fri, 21 Mar 2026' },
    ]);
  });

  it('returns empty array for no items', () => {
    const xml = `<channel><title>Empty</title></channel>`;
    const items = parseRssItems(xml, ['title']);
    expect(items).toEqual([]);
  });
});

src/clis/google/utils.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
2+
* Google adapter utilities.
3+
* Shared RSS parser for news and trends commands.
4+
*/
5+
6+
/**
7+
* Parse RSS XML by splitting into <item> blocks, then extracting fields per block.
8+
* Handles both plain text and CDATA-wrapped content.
9+
*/
10+
export function parseRssItems(xml: string, fields: string[]): Record<string, string>[] {
11+
const items = xml.match(/<item>([\s\S]*?)<\/item>/g) || [];
12+
return items.map(block => {
13+
const record: Record<string, string> = {};
14+
for (const field of fields) {
15+
// Escape regex special characters in field name (e.g. ht:approx_traffic is safe, but defensive)
16+
const escaped = field.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
17+
// Handle tags with attributes (e.g. <source url="...">text</source>) and CDATA wrapping
18+
// (?:\s[^>]*)? ensures we don't match prefix tags (e.g. <sourceUrl> when looking for <source>)
19+
const match = block.match(new RegExp(`<${escaped}(?:\\s[^>]*)?>(?:<!\\[CDATA\\[)?([\\s\\S]*?)(?:\\]\\]>)?</${escaped}>`));
20+
record[field] = match ? match[1].trim() : '';
21+
}
22+
return record;
23+
});
24+
}

0 commit comments

Comments (0)