Skip to content

Commit f9220f1

Browse files
committed
feat(google): add search, suggest, news, and trends adapters
Four new commands under `google`: - search: browser-based DOM extraction from google.com/search - suggest: public JSON API (suggestqueries.google.com) - news: public RSS feed (top stories + keyword search) - trends: public RSS feed (daily trending searches by region) Shared RSS parser in utils.ts with attribute/CDATA support. Unit tests for parseRssItems, E2E tests with network skip guards.
1 parent 0b71c6c commit f9220f1

8 files changed

Lines changed: 464 additions & 0 deletions

File tree

src/clis/google/news.ts

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/**
2+
* Google News via public RSS feed.
3+
* Supports top stories (no keyword) and search (with keyword).
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
import { parseRssItems } from './utils.js';
9+
10+
cli({
11+
site: 'google',
12+
name: 'news',
13+
description: 'Get Google News headlines',
14+
strategy: Strategy.PUBLIC,
15+
browser: false,
16+
args: [
17+
{ name: 'keyword', positional: true, help: 'Search query (omit for top stories)' },
18+
{ name: 'limit', type: 'int', default: 10, help: 'Number of results' },
19+
{ name: 'lang', default: 'en', help: 'Language short code (e.g. en, zh)' },
20+
{ name: 'region', default: 'US', help: 'Region code (e.g. US, CN)' },
21+
],
22+
columns: ['title', 'source', 'date', 'url'],
23+
func: async (_page, args) => {
24+
const limit = Math.max(1, Math.min(Number(args.limit), 100));
25+
const lang = encodeURIComponent(args.lang);
26+
const region = encodeURIComponent(args.region);
27+
const ceid = `${args.region}:${args.lang}`;
28+
29+
// Top stories or search
30+
const base = args.keyword
31+
? `https://news.google.com/rss/search?q=${encodeURIComponent(args.keyword)}&hl=${lang}&gl=${region}&ceid=${ceid}`
32+
: `https://news.google.com/rss?hl=${lang}&gl=${region}&ceid=${ceid}`;
33+
34+
const resp = await fetch(base);
35+
if (!resp.ok) {
36+
throw new CliError('FETCH_ERROR', `HTTP ${resp.status}`, 'Check your network connection');
37+
}
38+
39+
const xml = await resp.text();
40+
const items = parseRssItems(xml, ['title', 'link', 'pubDate', 'source']);
41+
42+
if (!items.length) {
43+
throw new CliError('NOT_FOUND', 'No news articles found', 'Try a different keyword or region');
44+
}
45+
46+
return items.slice(0, limit).map(item => {
47+
// Extract source: prefer <source> element, fallback to parsing title
48+
let title = item['title'] || '';
49+
let source = item['source'] || '';
50+
if (!source) {
51+
const idx = title.lastIndexOf(' - ');
52+
if (idx !== -1) {
53+
source = title.slice(idx + 3);
54+
title = title.slice(0, idx);
55+
}
56+
}
57+
58+
return {
59+
title,
60+
source,
61+
date: item['pubDate'] || '',
62+
url: item['link'] || '',
63+
};
64+
});
65+
},
66+
});

src/clis/google/search.ts

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
/**
 * Google Web Search via browser DOM extraction.
 * Uses browser mode to navigate google.com and extract results from the DOM.
 *
 * Extraction strategy (2026-03): Google no longer uses `.g` class containers.
 * Instead, we find all `a` tags containing `h3` within `#rso`, then walk up
 * to the result container (`div.tF2Cxc` or closest `div[data-hveid]`) to find
 * snippets. This approach is resilient to class name changes.
 *
 * Returns a mix of row types: 'snippet' (featured snippet), 'result'
 * (standard organic result), and 'paa' (People Also Ask question).
 */

import { cli, Strategy } from '../../registry.js';
import { CliError } from '../../errors.js';

cli({
  site: 'google',
  name: 'search',
  description: 'Search Google',
  domain: 'google.com',
  strategy: Strategy.COOKIE,
  browser: true,
  args: [
    { name: 'keyword', positional: true, required: true, help: 'Search query' },
    { name: 'limit', type: 'int', default: 10, help: 'Number of results (1-100)' },
    { name: 'lang', default: 'en', help: 'Language short code (e.g. en, zh)' },
  ],
  columns: ['type', 'title', 'url', 'snippet'],
  func: async (page, args) => {
    // Clamp to [1, 100]; `num` also asks Google for that many results per page.
    const limit = Math.max(1, Math.min(Number(args.limit), 100));
    const keyword = encodeURIComponent(args.keyword);
    const lang = encodeURIComponent(args.lang);
    const url = `https://www.google.com/search?q=${keyword}&hl=${lang}&num=${limit}`;

    await page.goto(url);
    // Fixed settle delay for async-rendered result blocks.
    // NOTE(review): presumably seconds — confirm against the page API.
    await page.wait(2);

    // Extraction runs as a string in the page context (hence var/ES5 style
    // and the double-escaped regex below). The IIFE returns an array of
    // { type, title, url, snippet } plain objects.
    const results = await page.evaluate(`
      (function() {
        var results = [];
        var seenUrls = {};
        var rso = document.querySelector('#rso');
        if (!rso) return results;

        // -- Featured snippet (scoped to #rso to avoid matching unrelated elements) --
        var featuredEl = rso.querySelector('.xpdopen .hgKElc')
          || rso.querySelector('.IZ6rdc');
        if (featuredEl) {
          var parentBlock = featuredEl.closest('[data-hveid]') || featuredEl.parentElement;
          var fLink = parentBlock ? parentBlock.querySelector('a[href]') : null;
          var fUrl = fLink ? fLink.href : '';
          if (fUrl) seenUrls[fUrl] = true;
          results.push({
            type: 'snippet',
            title: featuredEl.textContent.trim().slice(0, 200),
            url: fUrl,
            snippet: '',
          });
        }

        // -- Standard search results --
        // Strategy: find all links containing h3 within #rso
        var allLinks = rso.querySelectorAll('a');
        for (var i = 0; i < allLinks.length; i++) {
          var link = allLinks[i];
          var h3 = link.querySelector('h3');
          if (!h3) continue;

          var href = link.href || '';
          // Skip non-http, Google internal links, and duplicates
          if (!href.match(/^https?:\\/\\//)) continue;
          if (href.indexOf('google.com/search') !== -1) continue;
          if (seenUrls[href]) continue;
          seenUrls[href] = true;

          // Walk up to find result container for snippet extraction
          var container = link;
          for (var j = 0; j < 6; j++) {
            if (container.parentElement && container.parentElement !== rso) {
              container = container.parentElement;
            }
            // Stop at a known result boundary
            if (container.getAttribute && container.getAttribute('data-hveid')) break;
          }

          // Find snippet: look for descriptive text, skip breadcrumbs and metadata
          var snippetText = '';
          var titleText = h3.textContent.trim();
          var candidates = container.querySelectorAll('span, div');
          for (var k = 0; k < candidates.length; k++) {
            var el = candidates[k];
            if (el.querySelector('h3') || el.querySelector('a[href]')) continue;
            var text = el.textContent.trim();
            if (text.length < 40 || text.length > 500) continue;
            if (text === titleText) continue;
            // Skip URL breadcrumbs (e.g. "https://example.com › path..." or "Site Namehttps://...")
            if (text.indexOf('\u203A') !== -1) continue;
            if (new RegExp('https?://').test(text.slice(0, 60))) continue;
            snippetText = text;
            break;
          }

          results.push({
            type: 'result',
            title: h3.textContent.trim(),
            url: href,
            snippet: snippetText.slice(0, 300),
          });
        }

        // -- People Also Ask --
        var paaContainers = document.querySelectorAll('[data-sgrd="true"]');
        for (var i = 0; i < paaContainers.length; i++) {
          var questionEl = paaContainers[i].querySelector('span.CSkcDe');
          if (questionEl) {
            results.push({
              type: 'paa',
              title: questionEl.textContent.trim(),
              url: '',
              snippet: '',
            });
          }
        }

        return results;
      })()
    `);

    // An empty array here usually means a CAPTCHA/consent page replaced #rso.
    if (!Array.isArray(results) || results.length === 0) {
      throw new CliError('NOT_FOUND', 'No search results found', 'Try a different keyword or check for CAPTCHA');
    }

    return results;
  },
});

src/clis/google/suggest.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/**
2+
* Google Search Suggestions via public JSON API.
3+
* Uses suggestqueries.google.com with client=firefox for pure JSON (not JSONP).
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
9+
cli({
10+
site: 'google',
11+
name: 'suggest',
12+
description: 'Get Google search suggestions',
13+
strategy: Strategy.PUBLIC,
14+
browser: false,
15+
args: [
16+
{ name: 'keyword', positional: true, required: true, help: 'Search query' },
17+
{ name: 'lang', default: 'zh-CN', help: 'Language code' },
18+
],
19+
columns: ['suggestion'],
20+
func: async (_page, args) => {
21+
const keyword = encodeURIComponent(args.keyword);
22+
const lang = encodeURIComponent(args.lang);
23+
const url = `https://suggestqueries.google.com/complete/search?client=firefox&q=${keyword}&hl=${lang}`;
24+
25+
const resp = await fetch(url);
26+
if (!resp.ok) {
27+
throw new CliError('FETCH_ERROR', `HTTP ${resp.status}`, 'Check your network connection');
28+
}
29+
30+
const data = await resp.json();
31+
// Response format: ["query", ["suggestion1", "suggestion2", ...]]
32+
const suggestions: string[] = Array.isArray(data) && Array.isArray(data[1]) ? data[1] : [];
33+
34+
if (!suggestions.length) {
35+
throw new CliError('NOT_FOUND', 'No suggestions found', 'Try a different keyword');
36+
}
37+
38+
return suggestions.map(s => ({ suggestion: s }));
39+
},
40+
});

src/clis/google/trends.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/**
2+
* Google Trends via public RSS feed.
3+
* Shows daily trending searches for a given region.
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
import { parseRssItems } from './utils.js';
9+
10+
cli({
11+
site: 'google',
12+
name: 'trends',
13+
description: 'Get Google Trends daily trending searches',
14+
strategy: Strategy.PUBLIC,
15+
browser: false,
16+
args: [
17+
{ name: 'region', default: 'US', help: 'Region code (e.g. US, CN, JP)' },
18+
{ name: 'limit', type: 'int', default: 20, help: 'Number of results' },
19+
],
20+
columns: ['title', 'traffic', 'date'],
21+
func: async (_page, args) => {
22+
const limit = Math.max(1, Math.min(Number(args.limit), 100));
23+
const region = encodeURIComponent(args.region);
24+
const url = `https://trends.google.com/trending/rss?geo=${region}`;
25+
26+
const resp = await fetch(url);
27+
if (!resp.ok) {
28+
throw new CliError('FETCH_ERROR', `HTTP ${resp.status}`, 'Check your network connection or region code');
29+
}
30+
31+
const xml = await resp.text();
32+
const items = parseRssItems(xml, ['title', 'pubDate', 'ht:approx_traffic']);
33+
34+
if (!items.length) {
35+
throw new CliError('NOT_FOUND', 'No trending data found', 'Try a different region code');
36+
}
37+
38+
return items.slice(0, limit).map(item => ({
39+
title: item['title'],
40+
traffic: item['ht:approx_traffic'], // raw string e.g. "1,000,000+", no numeric conversion
41+
date: item['pubDate'],
42+
}));
43+
},
44+
});

src/clis/google/utils.test.ts

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
// Unit tests for the shared RSS <item> parser (src/clis/google/utils.ts).
// Pure string fixtures only — no network access required.
import { describe, it, expect } from 'vitest';
import { parseRssItems } from './utils.js';

describe('parseRssItems', () => {
  it('extracts plain text fields', () => {
    const xml = `
      <channel>
        <item><title>Hello</title><link>https://example.com</link></item>
        <item><title>World</title><link>https://test.com</link></item>
      </channel>
    `;
    const items = parseRssItems(xml, ['title', 'link']);
    expect(items).toEqual([
      { title: 'Hello', link: 'https://example.com' },
      { title: 'World', link: 'https://test.com' },
    ]);
  });

  it('handles CDATA-wrapped content', () => {
    // CDATA markers must be stripped, leaving only the inner text.
    const xml = `
      <item><title><![CDATA[Breaking News]]></title><link>https://news.com</link></item>
    `;
    const items = parseRssItems(xml, ['title', 'link']);
    expect(items).toEqual([
      { title: 'Breaking News', link: 'https://news.com' },
    ]);
  });

  it('handles namespaced fields like ht:approx_traffic', () => {
    // The ':' in a namespaced tag name must survive the field-name escaping.
    const xml = `
      <item>
        <title>AI</title>
        <ht:approx_traffic>500,000+</ht:approx_traffic>
        <pubDate>Mon, 20 Mar 2026</pubDate>
      </item>
    `;
    const items = parseRssItems(xml, ['title', 'ht:approx_traffic', 'pubDate']);
    expect(items).toEqual([
      { title: 'AI', 'ht:approx_traffic': '500,000+', pubDate: 'Mon, 20 Mar 2026' },
    ]);
  });

  it('returns empty string for missing fields', () => {
    // Contract: every requested key is present in the record, '' if absent.
    const xml = `<item><title>Test</title></item>`;
    const items = parseRssItems(xml, ['title', 'missing']);
    expect(items).toEqual([{ title: 'Test', missing: '' }]);
  });

  it('handles tags with attributes (e.g. <source url="...">)', () => {
    // Attributes on the opening tag must not prevent the match (news feeds
    // publish <source url="...">Publisher</source>).
    const xml = `
      <item>
        <title><![CDATA[AI reshapes everything - Reuters]]></title>
        <source url="https://reuters.com">Reuters</source>
        <link>https://news.google.com/123</link>
      </item>
    `;
    const items = parseRssItems(xml, ['title', 'source', 'link']);
    expect(items).toEqual([
      { title: 'AI reshapes everything - Reuters', source: 'Reuters', link: 'https://news.google.com/123' },
    ]);
  });

  it('handles mixed CDATA and plain text in the same item', () => {
    const xml = `
      <item>
        <title><![CDATA[Breaking: Major event]]></title>
        <link>https://example.com/article</link>
        <pubDate>Fri, 21 Mar 2026</pubDate>
      </item>
    `;
    const items = parseRssItems(xml, ['title', 'link', 'pubDate']);
    expect(items).toEqual([
      { title: 'Breaking: Major event', link: 'https://example.com/article', pubDate: 'Fri, 21 Mar 2026' },
    ]);
  });

  it('returns empty array for no items', () => {
    const xml = `<channel><title>Empty</title></channel>`;
    const items = parseRssItems(xml, ['title']);
    expect(items).toEqual([]);
  });
});

src/clis/google/utils.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
2+
* Google adapter utilities.
3+
* Shared RSS parser for news and trends commands.
4+
*/
5+
6+
/**
7+
* Parse RSS XML by splitting into <item> blocks, then extracting fields per block.
8+
* Handles both plain text and CDATA-wrapped content.
9+
*/
10+
export function parseRssItems(xml: string, fields: string[]): Record<string, string>[] {
11+
const items = xml.match(/<item>([\s\S]*?)<\/item>/g) || [];
12+
return items.map(block => {
13+
const record: Record<string, string> = {};
14+
for (const field of fields) {
15+
// Escape regex special characters in field name (e.g. ht:approx_traffic is safe, but defensive)
16+
const escaped = field.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
17+
// Handle tags with attributes (e.g. <source url="...">text</source>) and CDATA wrapping
18+
// (?:\s[^>]*)? ensures we don't match prefix tags (e.g. <sourceUrl> when looking for <source>)
19+
const match = block.match(new RegExp(`<${escaped}(?:\\s[^>]*)?>(?:<!\\[CDATA\\[)?([\\s\\S]*?)(?:\\]\\]>)?</${escaped}>`));
20+
record[field] = match ? match[1].trim() : '';
21+
}
22+
return record;
23+
});
24+
}

0 commit comments

Comments (0)