From 815aeb00e0b16f19df97fa7706054a9a5008e1f1 Mon Sep 17 00:00:00 2001 From: jackwener Date: Sun, 22 Mar 2026 01:00:07 +0800 Subject: [PATCH] feat: add douban, sinablog, substack adapters; upgrade medium to TS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New adapters: - douban: book-hot, movie-hot, search (browser/cookie) - sinablog: hot, search, article, user (search uses public API) - substack: feed, publication, search (search uses public API) Medium upgrade (YAML → TS): - Replace tag.yaml/user.yaml/publication.yaml with TS adapters - feed.ts (tag feed by topic), search.ts, user.ts with browser scraping - Richer data: readTime, claps, description Core pipeline improvements: - template.ts: trim template before matching (supports multiline expressions) - template.ts: evalJsExpr fallback for JS expressions in YAML templates - template.ts: add urlencode/urldecode filters - transform.ts: inline select inside map params - build-manifest.ts: TS-over-YAML dedup with warning log - build-manifest.ts: export scanTs/shouldReplaceManifestEntry for testing Co-authored-by: Yuan --- src/build-manifest.test.ts | 72 ++++++++++- src/build-manifest.ts | 37 +++++- src/clis/douban/book-hot.ts | 15 +++ src/clis/douban/movie-hot.ts | 15 +++ src/clis/douban/search.ts | 17 +++ src/clis/douban/shared.ts | 165 ++++++++++++++++++++++++++ src/clis/medium/feed.ts | 16 +++ src/clis/medium/publication.yaml | 32 ----- src/clis/medium/search.ts | 16 +++ src/clis/medium/shared.ts | 83 +++++++++++++ src/clis/medium/tag.yaml | 32 ----- src/clis/medium/user.ts | 16 +++ src/clis/medium/user.yaml | 31 ----- src/clis/sinablog/article.ts | 15 +++ src/clis/sinablog/hot.ts | 15 +++ src/clis/sinablog/search.ts | 56 +++++++++ src/clis/sinablog/shared.ts | 198 +++++++++++++++++++++++++++++++ src/clis/sinablog/user.ts | 16 +++ src/clis/substack/feed.ts | 16 +++ src/clis/substack/publication.ts | 16 +++ src/clis/substack/search.ts | 91 ++++++++++++++ 
src/clis/substack/shared.ts | 132 +++++++++++++++++++++ src/pipeline/executor.test.ts | 30 ++++- src/pipeline/steps/transform.ts | 18 ++- src/pipeline/template.test.ts | 18 +++ src/pipeline/template.ts | 76 +++++++++++- src/pipeline/transform.test.ts | 13 ++ 27 files changed, 1146 insertions(+), 111 deletions(-) create mode 100644 src/clis/douban/book-hot.ts create mode 100644 src/clis/douban/movie-hot.ts create mode 100644 src/clis/douban/search.ts create mode 100644 src/clis/douban/shared.ts create mode 100644 src/clis/medium/feed.ts delete mode 100644 src/clis/medium/publication.yaml create mode 100644 src/clis/medium/search.ts create mode 100644 src/clis/medium/shared.ts delete mode 100644 src/clis/medium/tag.yaml create mode 100644 src/clis/medium/user.ts delete mode 100644 src/clis/medium/user.yaml create mode 100644 src/clis/sinablog/article.ts create mode 100644 src/clis/sinablog/hot.ts create mode 100644 src/clis/sinablog/search.ts create mode 100644 src/clis/sinablog/shared.ts create mode 100644 src/clis/sinablog/user.ts create mode 100644 src/clis/substack/feed.ts create mode 100644 src/clis/substack/publication.ts create mode 100644 src/clis/substack/search.ts create mode 100644 src/clis/substack/shared.ts diff --git a/src/build-manifest.test.ts b/src/build-manifest.test.ts index b4eabfea..935f9e48 100644 --- a/src/build-manifest.test.ts +++ b/src/build-manifest.test.ts @@ -1,5 +1,8 @@ -import { describe, expect, it } from 'vitest'; -import { parseTsArgsBlock } from './build-manifest.js'; +import { afterEach, describe, expect, it } from 'vitest'; +import * as fs from 'node:fs'; +import * as os from 'node:os'; +import * as path from 'node:path'; +import { parseTsArgsBlock, scanTs, shouldReplaceManifestEntry } from './build-manifest.js'; describe('parseTsArgsBlock', () => { it('keeps args with nested choices arrays', () => { @@ -62,3 +65,68 @@ describe('parseTsArgsBlock', () => { ]); }); }); + +describe('manifest helper rules', () => { + const tempDirs: 
string[] = []; + + afterEach(() => { + for (const dir of tempDirs.splice(0)) { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('prefers TS adapters over duplicate YAML adapters', () => { + expect(shouldReplaceManifestEntry( + { + site: 'demo', + name: 'search', + description: 'yaml', + strategy: 'public', + browser: false, + args: [], + type: 'yaml', + }, + { + site: 'demo', + name: 'search', + description: 'ts', + strategy: 'public', + browser: false, + args: [], + type: 'ts', + modulePath: 'demo/search.js', + }, + )).toBe(true); + + expect(shouldReplaceManifestEntry( + { + site: 'demo', + name: 'search', + description: 'ts', + strategy: 'public', + browser: false, + args: [], + type: 'ts', + modulePath: 'demo/search.js', + }, + { + site: 'demo', + name: 'search', + description: 'yaml', + strategy: 'public', + browser: false, + args: [], + type: 'yaml', + }, + )).toBe(false); + }); + + it('skips TS files that do not register a cli', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'opencli-manifest-')); + tempDirs.push(dir); + const file = path.join(dir, 'utils.ts'); + fs.writeFileSync(file, `export function helper() { return 'noop'; }`); + + expect(scanTs(file, 'demo')).toBeNull(); + }); +}); diff --git a/src/build-manifest.ts b/src/build-manifest.ts index ec32f417..46ece7f2 100644 --- a/src/build-manifest.ts +++ b/src/build-manifest.ts @@ -199,7 +199,7 @@ function scanYaml(filePath: string, site: string): ManifestEntry | null { } } -function scanTs(filePath: string, site: string): ManifestEntry | null { +export function scanTs(filePath: string, site: string): ManifestEntry | null { // TS adapters self-register via cli() at import time. // We statically parse the source to extract metadata for the manifest stub. 
const baseName = path.basename(filePath, path.extname(filePath)); @@ -263,8 +263,17 @@ function scanTs(filePath: string, site: string): ManifestEntry | null { } } +/** + * When both YAML and TS adapters exist for the same site/name, + * prefer the TS version (it self-registers and typically has richer logic). + */ +export function shouldReplaceManifestEntry(current: ManifestEntry, next: ManifestEntry): boolean { + if (current.type === next.type) return true; + return current.type === 'yaml' && next.type === 'ts'; +} + export function buildManifest(): ManifestEntry[] { - const manifest: ManifestEntry[] = []; + const manifest = new Map(); if (fs.existsSync(CLIS_DIR)) { for (const site of fs.readdirSync(CLIS_DIR)) { @@ -274,19 +283,37 @@ export function buildManifest(): ManifestEntry[] { const filePath = path.join(siteDir, file); if (file.endsWith('.yaml') || file.endsWith('.yml')) { const entry = scanYaml(filePath, site); - if (entry) manifest.push(entry); + if (entry) { + const key = `${entry.site}/${entry.name}`; + const existing = manifest.get(key); + if (!existing || shouldReplaceManifestEntry(existing, entry)) { + if (existing && existing.type !== entry.type) { + process.stderr.write(`⚠️ Duplicate adapter ${key}: ${existing.type} superseded by ${entry.type}\n`); + } + manifest.set(key, entry); + } + } } else if ( (file.endsWith('.ts') && !file.endsWith('.d.ts') && !file.endsWith('.test.ts') && file !== 'index.ts') || (file.endsWith('.js') && !file.endsWith('.d.js') && !file.endsWith('.test.js') && file !== 'index.js') ) { const entry = scanTs(filePath, site); - if (entry) manifest.push(entry); + if (entry) { + const key = `${entry.site}/${entry.name}`; + const existing = manifest.get(key); + if (!existing || shouldReplaceManifestEntry(existing, entry)) { + if (existing && existing.type !== entry.type) { + process.stderr.write(`⚠️ Duplicate adapter ${key}: ${existing.type} superseded by ${entry.type}\n`); + } + manifest.set(key, entry); + } + } } } } } - return 
manifest; + return [...manifest.values()]; } function main(): void { diff --git a/src/clis/douban/book-hot.ts b/src/clis/douban/book-hot.ts new file mode 100644 index 00000000..9605401a --- /dev/null +++ b/src/clis/douban/book-hot.ts @@ -0,0 +1,15 @@ +import { cli, Strategy } from '../../registry.js'; +import { loadDoubanBookHot } from './shared.js'; + +cli({ + site: 'douban', + name: 'book-hot', + description: '豆瓣图书热门榜单', + domain: 'book.douban.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'limit', type: 'int', default: 20, help: '返回的图书数量' }, + ], + columns: ['rank', 'title', 'rating', 'quote', 'author', 'publisher', 'year', 'url'], + func: async (page, args) => loadDoubanBookHot(page, Number(args.limit) || 20), +}); diff --git a/src/clis/douban/movie-hot.ts b/src/clis/douban/movie-hot.ts new file mode 100644 index 00000000..8f72886c --- /dev/null +++ b/src/clis/douban/movie-hot.ts @@ -0,0 +1,15 @@ +import { cli, Strategy } from '../../registry.js'; +import { loadDoubanMovieHot } from './shared.js'; + +cli({ + site: 'douban', + name: 'movie-hot', + description: '豆瓣电影热门榜单', + domain: 'movie.douban.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'limit', type: 'int', default: 20, help: '返回的电影数量' }, + ], + columns: ['rank', 'title', 'rating', 'quote', 'director', 'year', 'region', 'url'], + func: async (page, args) => loadDoubanMovieHot(page, Number(args.limit) || 20), +}); diff --git a/src/clis/douban/search.ts b/src/clis/douban/search.ts new file mode 100644 index 00000000..3b7fc458 --- /dev/null +++ b/src/clis/douban/search.ts @@ -0,0 +1,17 @@ +import { cli, Strategy } from '../../registry.js'; +import { searchDouban } from './shared.js'; + +cli({ + site: 'douban', + name: 'search', + description: '搜索豆瓣电影、图书或音乐', + domain: 'search.douban.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'type', default: 'movie', choices: ['movie', 'book', 'music'], help: '搜索类型(movie=电影, book=图书, music=音乐)' }, + { name: 'keyword', required: true, help: '搜索关键词' 
}, + { name: 'limit', type: 'int', default: 20, help: '返回结果数量' }, + ], + columns: ['rank', 'title', 'rating', 'abstract', 'url'], + func: async (page, args) => searchDouban(page, args.type, args.keyword, Number(args.limit) || 20), +}); diff --git a/src/clis/douban/shared.ts b/src/clis/douban/shared.ts new file mode 100644 index 00000000..227debcb --- /dev/null +++ b/src/clis/douban/shared.ts @@ -0,0 +1,165 @@ +import { CliError } from '../../errors.js'; +import type { IPage } from '../../types.js'; + +function clampLimit(limit: number): number { + return Math.max(1, Math.min(limit || 20, 50)); +} + +async function ensureDoubanReady(page: IPage): Promise { + const state = await page.evaluate(` + (() => { + const title = (document.title || '').trim(); + const href = (location.href || '').trim(); + const blocked = href.includes('sec.douban.com') || /登录跳转/.test(title) || /异常请求/.test(document.body?.innerText || ''); + return { blocked, title, href }; + })() + `); + if (state?.blocked) { + throw new CliError( + 'AUTH_REQUIRED', + 'Douban requires a logged-in browser session before these commands can load data.', + 'Please sign in to douban.com in the browser that opencli reuses, then rerun the command.', + ); + } +} + +export async function loadDoubanBookHot(page: IPage, limit: number): Promise { + const safeLimit = clampLimit(limit); + await page.goto('https://book.douban.com/chart'); + await page.wait(4); + await ensureDoubanReady(page); + const data = await page.evaluate(` + (() => { + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const books = []; + for (const el of Array.from(document.querySelectorAll('.media.clearfix'))) { + try { + const titleEl = el.querySelector('h2 a[href*="/subject/"]'); + const title = normalize(titleEl?.textContent); + let url = titleEl?.getAttribute('href') || ''; + if (!title || !url) continue; + if (!url.startsWith('http')) url = 'https://book.douban.com' + url; + + const info = 
normalize(el.querySelector('.subject-abstract, .pl, .pub')?.textContent); + const infoParts = info.split('/').map((part) => part.trim()).filter(Boolean); + const ratingText = normalize(el.querySelector('.subject-rating .font-small, .rating_nums, .rating')?.textContent); + const quote = Array.from(el.querySelectorAll('.subject-tags .tag')) + .map((node) => normalize(node.textContent)) + .filter(Boolean) + .join(' / '); + + books.push({ + rank: parseInt(normalize(el.querySelector('.green-num-box')?.textContent), 10) || books.length + 1, + title, + rating: parseFloat(ratingText) || 0, + quote, + author: infoParts[0] || '', + publisher: infoParts.find((part) => /出版社|出版公司|Press/i.test(part)) || infoParts[2] || '', + year: infoParts.find((part) => /\\d{4}(?:-\\d{1,2})?/.test(part))?.match(/\\d{4}/)?.[0] || '', + price: infoParts.find((part) => /元|USD|\\$|¥/.test(part)) || '', + url, + cover: el.querySelector('img')?.getAttribute('src') || '', + }); + } catch {} + } + return books.slice(0, ${safeLimit}); + })() + `); + return Array.isArray(data) ? 
data : []; +} + +export async function loadDoubanMovieHot(page: IPage, limit: number): Promise { + const safeLimit = clampLimit(limit); + await page.goto('https://movie.douban.com/chart'); + await page.wait(4); + await ensureDoubanReady(page); + const data = await page.evaluate(` + (() => { + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const results = []; + for (const el of Array.from(document.querySelectorAll('.item'))) { + const titleEl = el.querySelector('.pl2 a'); + const title = normalize(titleEl?.textContent); + let url = titleEl?.getAttribute('href') || ''; + if (!title || !url) continue; + if (!url.startsWith('http')) url = 'https://movie.douban.com' + url; + + const info = normalize(el.querySelector('.pl2 p')?.textContent); + const infoParts = info.split('/').map((part) => part.trim()).filter(Boolean); + const releaseIndex = (() => { + for (let i = infoParts.length - 1; i >= 0; i -= 1) { + if (/\\d{4}-\\d{2}-\\d{2}|\\d{4}\\/\\d{2}\\/\\d{2}/.test(infoParts[i])) return i; + } + return -1; + })(); + const directorPart = releaseIndex >= 1 ? infoParts[releaseIndex - 1] : ''; + const regionPart = releaseIndex >= 2 ? infoParts[releaseIndex - 2] : ''; + const yearMatch = info.match(/\\b(19|20)\\d{2}\\b/); + results.push({ + rank: results.length + 1, + title, + rating: parseFloat(normalize(el.querySelector('.rating_nums')?.textContent)) || 0, + quote: normalize(el.querySelector('.inq')?.textContent), + director: directorPart.replace(/^导演:\\s*/, ''), + year: yearMatch?.[0] || '', + region: regionPart, + url, + cover: el.querySelector('img')?.getAttribute('src') || '', + }); + if (results.length >= ${safeLimit}) break; + } + return results; + })() + `); + return Array.isArray(data) ? 
data : []; +} + +export async function searchDouban(page: IPage, type: string, keyword: string, limit: number): Promise { + const safeLimit = clampLimit(limit); + await page.goto(`https://search.douban.com/${encodeURIComponent(type)}/subject_search?search_text=${encodeURIComponent(keyword)}`); + await page.wait(2); + await ensureDoubanReady(page); + const data = await page.evaluate(` + (async () => { + const type = ${JSON.stringify(type)}; + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const seen = new Set(); + const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + + for (let i = 0; i < 20; i += 1) { + if (document.querySelector('.item-root .title-text, .item-root .title a')) break; + await sleep(300); + } + + const items = Array.from(document.querySelectorAll('.item-root')); + + const results = []; + for (const el of items) { + const titleEl = el.querySelector('.title-text, .title a, a[title]'); + const title = normalize(titleEl?.textContent) || normalize(titleEl?.getAttribute('title')); + let url = titleEl?.getAttribute('href') || ''; + if (!title || !url) continue; + if (!url.startsWith('http')) url = 'https://search.douban.com' + url; + if (!url.includes('/subject/') || seen.has(url)) continue; + seen.add(url); + const ratingText = normalize(el.querySelector('.rating_nums')?.textContent); + const abstract = normalize( + el.querySelector('.meta.abstract, .meta, .abstract, p')?.textContent, + ); + results.push({ + rank: results.length + 1, + id: url.match(/subject\\/(\\d+)/)?.[1] || '', + type, + title, + rating: ratingText.includes('.') ? parseFloat(ratingText) : 0, + abstract: abstract.slice(0, 100) + (abstract.length > 100 ? '...' : ''), + url, + cover: el.querySelector('img')?.getAttribute('src') || '', + }); + if (results.length >= ${safeLimit}) break; + } + return results; + })() + `); + return Array.isArray(data) ? 
data : []; +} diff --git a/src/clis/medium/feed.ts b/src/clis/medium/feed.ts new file mode 100644 index 00000000..b4a5178f --- /dev/null +++ b/src/clis/medium/feed.ts @@ -0,0 +1,16 @@ +import { cli, Strategy } from '../../registry.js'; +import { buildMediumTagUrl, loadMediumPosts } from './shared.js'; + +cli({ + site: 'medium', + name: 'feed', + description: 'Medium 热门文章 Feed', + domain: 'medium.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'topic', default: '', help: '话题标签(如 technology, programming, ai)' }, + { name: 'limit', type: 'int', default: 20, help: '返回的文章数量' }, + ], + columns: ['rank', 'title', 'author', 'date', 'readTime', 'claps'], + func: async (page, args) => loadMediumPosts(page, buildMediumTagUrl(args.topic), Number(args.limit) || 20), +}); diff --git a/src/clis/medium/publication.yaml b/src/clis/medium/publication.yaml deleted file mode 100644 index c772ae91..00000000 --- a/src/clis/medium/publication.yaml +++ /dev/null @@ -1,32 +0,0 @@ -site: medium -name: publication -description: Get recent articles from a Medium publication -domain: medium.com -strategy: public -browser: false - -args: - name: - type: string - required: true - description: The publication name/slug (e.g. 
netflix-techblog) - limit: - type: int - default: 10 - description: Max number of stories - -pipeline: - - fetch: - url: https://api.rss2json.com/v1/api.json?rss_url=https://medium.com/feed/${{ args.name }} - - - select: items - - - map: - title: "${{ item.title }}" - author: "${{ item.author }}" - date: "${{ item.pubDate }}" - url: "${{ item.link }}" - - - limit: ${{ args.limit }} - -columns: [title, author, date, url] diff --git a/src/clis/medium/search.ts b/src/clis/medium/search.ts new file mode 100644 index 00000000..fa93adad --- /dev/null +++ b/src/clis/medium/search.ts @@ -0,0 +1,16 @@ +import { cli, Strategy } from '../../registry.js'; +import { buildMediumSearchUrl, loadMediumPosts } from './shared.js'; + +cli({ + site: 'medium', + name: 'search', + description: '搜索 Medium 文章', + domain: 'medium.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'keyword', required: true, help: '搜索关键词' }, + { name: 'limit', type: 'int', default: 20, help: '返回的文章数量' }, + ], + columns: ['rank', 'title', 'author', 'date', 'readTime', 'claps'], + func: async (page, args) => loadMediumPosts(page, buildMediumSearchUrl(args.keyword), Number(args.limit) || 20), +}); diff --git a/src/clis/medium/shared.ts b/src/clis/medium/shared.ts new file mode 100644 index 00000000..9d35a44e --- /dev/null +++ b/src/clis/medium/shared.ts @@ -0,0 +1,83 @@ +import type { IPage } from '../../types.js'; + +export function buildMediumTagUrl(topic?: string): string { + return topic ? `https://medium.com/tag/${encodeURIComponent(topic)}` : 'https://medium.com/tag/technology'; +} + +export function buildMediumSearchUrl(keyword: string): string { + return `https://medium.com/search?q=${encodeURIComponent(keyword)}`; +} + +export function buildMediumUserUrl(username: string): string { + return username.startsWith('@') ? 
`https://medium.com/${username}` : `https://medium.com/@${username}`; +} + +export async function loadMediumPosts(page: IPage, url: string, limit: number): Promise { + if (!page) throw new Error('Requires browser session'); + await page.goto(url); + await page.wait(5); + const data = await page.evaluate(` + (async () => { + await new Promise((resolve) => setTimeout(resolve, 3000)); + + const limit = ${Math.max(1, Math.min(limit, 50))}; + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const posts = []; + const seen = new Set(); + + for (const article of Array.from(document.querySelectorAll('article'))) { + try { + const titleEl = article.querySelector('h2, h3, h1'); + const title = normalize(titleEl?.textContent); + if (!title) continue; + + const linkEl = titleEl?.closest('a') || article.querySelector('a[href*="/@"], a[href*="/p/"]'); + let url = linkEl?.getAttribute('href') || ''; + if (!url) continue; + if (!url.startsWith('http')) url = 'https://medium.com' + url; + if (seen.has(url)) continue; + + const author = normalize( + Array.from(article.querySelectorAll('a[href^="/@"]')) + .map((node) => normalize(node.textContent)) + .find((text) => text && text !== title), + ); + + const allText = normalize(article.textContent); + const dateEl = article.querySelector('time'); + const date = normalize(dateEl?.textContent) || + dateEl?.getAttribute('datetime') || + allText.match(/\\b(?:[A-Z][a-z]{2}\\s+\\d{1,2}|\\d+[dhmw]\\s+ago)\\b/)?.[0] || + ''; + + const readTime = allText.match(/(\\d+)\\s*min\\s*read/i)?.[0] || ''; + const claps = allText.match(/\\b(\\d+(?:\\.\\d+)?[KkMm]?)\\s*claps?\\b/i)?.[1] || ''; + + const description = normalize( + Array.from(article.querySelectorAll('h3, p')) + .map((node) => normalize(node.textContent)) + .find((text) => text && text !== title && text !== author && !/member-only story|response icon/i.test(text)), + ); + + seen.add(url); + posts.push({ + rank: posts.length + 1, + title, + author, + date, + 
readTime, + claps, + description: description ? description.slice(0, 150) : '', + url, + }); + + if (posts.length >= limit) break; + } catch {} + } + + return posts; + })() + `); + + return Array.isArray(data) ? data : []; +} diff --git a/src/clis/medium/tag.yaml b/src/clis/medium/tag.yaml deleted file mode 100644 index 8168c247..00000000 --- a/src/clis/medium/tag.yaml +++ /dev/null @@ -1,32 +0,0 @@ -site: medium -name: tag -description: Get top articles for a Medium tag -domain: medium.com -strategy: public -browser: false - -args: - tag: - type: string - required: true - description: The tag to search for (e.g. programming) - limit: - type: int - default: 10 - description: Max number of stories - -pipeline: - - fetch: - url: https://api.rss2json.com/v1/api.json?rss_url=https://medium.com/feed/tag/${{ args.tag }} - - - select: items - - - map: - title: "${{ item.title }}" - author: "${{ item.author }}" - date: "${{ item.pubDate }}" - url: "${{ item.link }}" - - - limit: ${{ args.limit }} - -columns: [title, author, date, url] diff --git a/src/clis/medium/user.ts b/src/clis/medium/user.ts new file mode 100644 index 00000000..ae59874c --- /dev/null +++ b/src/clis/medium/user.ts @@ -0,0 +1,16 @@ +import { cli, Strategy } from '../../registry.js'; +import { buildMediumUserUrl, loadMediumPosts } from './shared.js'; + +cli({ + site: 'medium', + name: 'user', + description: '获取 Medium 用户的文章列表', + domain: 'medium.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'username', required: true, help: 'Medium 用户名(如 @username 或 username)' }, + { name: 'limit', type: 'int', default: 20, help: '返回的文章数量' }, + ], + columns: ['rank', 'title', 'date', 'readTime', 'claps', 'url'], + func: async (page, args) => loadMediumPosts(page, buildMediumUserUrl(args.username), Number(args.limit) || 20), +}); diff --git a/src/clis/medium/user.yaml b/src/clis/medium/user.yaml deleted file mode 100644 index 4a2aeed1..00000000 --- a/src/clis/medium/user.yaml +++ /dev/null @@ -1,31 +0,0 @@ -site: 
medium -name: user -description: Get recent articles by a Medium user -domain: medium.com -strategy: public -browser: false - -args: - username: - type: string - required: true - description: The medium username (without the @ symbol) - limit: - type: int - default: 10 - description: Max number of stories - -pipeline: - - fetch: - url: https://api.rss2json.com/v1/api.json?rss_url=https://medium.com/feed/@${{ args.username }} - - - select: items - - - map: - title: "${{ item.title }}" - date: "${{ item.pubDate }}" - url: "${{ item.link }}" - - - limit: ${{ args.limit }} - -columns: [title, date, url] diff --git a/src/clis/sinablog/article.ts b/src/clis/sinablog/article.ts new file mode 100644 index 00000000..d3260cc1 --- /dev/null +++ b/src/clis/sinablog/article.ts @@ -0,0 +1,15 @@ +import { cli, Strategy } from '../../registry.js'; +import { loadSinaBlogArticle } from './shared.js'; + +cli({ + site: 'sinablog', + name: 'article', + description: '获取新浪博客单篇文章详情', + domain: 'blog.sina.com.cn', + strategy: Strategy.COOKIE, + args: [ + { name: 'url', required: true, help: '文章URL(如 https://blog.sina.com.cn/s/blog_xxx.html)' }, + ], + columns: ['title', 'author', 'date', 'category', 'readCount', 'commentCount'], + func: async (page, args) => loadSinaBlogArticle(page, args.url), +}); diff --git a/src/clis/sinablog/hot.ts b/src/clis/sinablog/hot.ts new file mode 100644 index 00000000..4648ce3f --- /dev/null +++ b/src/clis/sinablog/hot.ts @@ -0,0 +1,15 @@ +import { cli, Strategy } from '../../registry.js'; +import { loadSinaBlogHot } from './shared.js'; + +cli({ + site: 'sinablog', + name: 'hot', + description: '获取新浪博客热门文章/推荐', + domain: 'blog.sina.com.cn', + strategy: Strategy.COOKIE, + args: [ + { name: 'limit', type: 'int', default: 20, help: '返回的文章数量' }, + ], + columns: ['rank', 'title', 'author', 'date', 'readCount', 'url'], + func: async (page, args) => loadSinaBlogHot(page, Number(args.limit) || 20), +}); diff --git a/src/clis/sinablog/search.ts 
b/src/clis/sinablog/search.ts new file mode 100644 index 00000000..59f05ae3 --- /dev/null +++ b/src/clis/sinablog/search.ts @@ -0,0 +1,56 @@ +import { cli, Strategy } from '../../registry.js'; + +function normalize(value: unknown): string { + return typeof value === 'string' ? value.replace(/\s+/g, ' ').trim() : ''; +} + +function stripHtml(value: string): string { + return value.replace(/<[^>]+>/g, ''); +} + +async function searchSinaBlog(keyword: string, limit: number): Promise { + const url = new URL('https://search.sina.com.cn/api/search'); + url.searchParams.set('q', keyword); + url.searchParams.set('tp', 'mix'); + url.searchParams.set('sort', '0'); + url.searchParams.set('page', '1'); + url.searchParams.set('size', String(Math.max(limit, 10))); + url.searchParams.set('from', 'search_result'); + + const resp = await fetch(url, { + headers: { + 'User-Agent': 'Mozilla/5.0', + Accept: 'application/json', + }, + }); + if (!resp.ok) throw new Error(`Sina blog search failed: HTTP ${resp.status}`); + + const data = await resp.json() as { data?: { list?: any[] } }; + const list = Array.isArray(data?.data?.list) ? 
data.data.list : []; + return list + .filter((item) => normalize(item?.url).includes('blog.sina.com.cn/s/blog_')) + .slice(0, limit) + .map((item, index) => ({ + rank: index + 1, + title: normalize(stripHtml(item?.title || '')), + author: normalize(item?.media_show || item?.author), + date: normalize(item?.time || item?.dataTime), + description: normalize(item?.intro || item?.searchSummary).slice(0, 150), + url: normalize(item?.url), + })); +} + +cli({ + site: 'sinablog', + name: 'search', + description: '搜索新浪博客文章(通过新浪搜索)', + domain: 'blog.sina.com.cn', + strategy: Strategy.PUBLIC, + browser: false, + args: [ + { name: 'keyword', required: true, help: '搜索关键词' }, + { name: 'limit', type: 'int', default: 20, help: '返回的文章数量' }, + ], + columns: ['rank', 'title', 'author', 'date', 'description', 'url'], + func: async (_page, args) => searchSinaBlog(args.keyword, Math.max(1, Math.min(Number(args.limit) || 20, 50))), +}); diff --git a/src/clis/sinablog/shared.ts b/src/clis/sinablog/shared.ts new file mode 100644 index 00000000..3e2b4f54 --- /dev/null +++ b/src/clis/sinablog/shared.ts @@ -0,0 +1,198 @@ +import type { IPage } from '../../types.js'; + +function clampLimit(limit: number): number { + return Math.max(1, Math.min(limit || 20, 50)); +} + +export function buildSinaBlogSearchUrl(keyword: string): string { + return `https://search.sina.com.cn/search?q=${encodeURIComponent(keyword)}&tp=mix`; +} + +export function buildSinaBlogUserUrl(uid: string): string { + return `https://blog.sina.com.cn/s/articlelist_${encodeURIComponent(uid)}_0_1.html`; +} + +export async function loadSinaBlogArticle(page: IPage, url: string): Promise { + await page.goto(url); + await page.wait(3); + return page.evaluate(` + (async () => { + await new Promise((resolve) => setTimeout(resolve, 1500)); + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const title = normalize(document.querySelector('.articalTitle h2, .title h2, h1, h2.titName')?.textContent); + const 
titleParts = normalize(document.title).split('_').map((part) => normalize(part)).filter(Boolean); + const author = titleParts[1] || title.split(/[::]/)[0] || ''; + const timeText = normalize(document.querySelector('.time, .articalInfo .time')?.textContent).replace(/[()]/g, ''); + const date = timeText || normalize(document.body.innerText.match(/\\b\\d{4}-\\d{2}-\\d{2}(?:\\s+\\d{2}:\\d{2}:\\d{2})?\\b/)?.[0]); + const category = normalize(document.querySelector('.articalTag .blog_class a, .blog_class a')?.textContent); + const tags = Array.from(document.querySelectorAll('.blog_tag h3, .blog_tag a, .tag a, .artical_tag a')) + .map((node) => normalize(node.textContent)) + .filter(Boolean); + const content = normalize(document.querySelector('.articalContent, .blog_content, .content, #sina_keyword_ad_area2')?.textContent).slice(0, 500); + const images = Array.from(document.querySelectorAll('.articalContent img, .blog_content img, .content img')) + .map((img) => img.getAttribute('src') || img.getAttribute('real_src') || '') + .filter((src) => src && !src.includes('icon')) + .slice(0, 5); + return { + title, + author, + date, + category, + tags: tags.join(', '), + readCount: '', + commentCount: '', + content: content + (content.length >= 500 ? '...' : ''), + images: images.join(', '), + url: ${JSON.stringify(url)}, + }; + })() + `); +} + +export async function loadSinaBlogHot(page: IPage, limit: number): Promise { + const safeLimit = clampLimit(limit); + await page.goto('https://blog.sina.com.cn/'); + await page.wait(3); + const data = await page.evaluate(` + (async () => { + await new Promise((resolve) => setTimeout(resolve, 1500)); + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const limit = ${safeLimit}; + const abs = (href) => { + if (!href) return ''; + if (href.startsWith('//')) return 'https:' + href; + if (href.startsWith('http')) return href; + return 'https://blog.sina.com.cn' + (href.startsWith('/') ? 
'' : '/') + href; + }; + const parseArticle = (doc, fallback) => { + const title = normalize(doc.querySelector('.articalTitle h2, .title h2, h1, h2.titName')?.textContent) || fallback.title; + const titleParts = normalize(doc.title).split('_').map((part) => normalize(part)).filter(Boolean); + const timeText = normalize(doc.querySelector('.time, .articalInfo .time')?.textContent).replace(/[()]/g, ''); + const articleId = fallback.url.match(/blog_([a-zA-Z0-9]+)\\.html/)?.[1] || ''; + return { + articleId, + title, + author: titleParts[1] || title.split(/[::]/)[0] || '', + date: timeText || '', + readCount: '', + description: normalize(doc.querySelector('.articalContent, .blog_content, .content, #sina_keyword_ad_area2')?.textContent).slice(0, 150), + }; + }; + + const seeds = []; + const seen = new Set(); + for (const link of Array.from(document.querySelectorAll('.day-hot-rank .art-list a[href*="/s/blog_"], .hot-rank .art-list a[href*="/s/blog_"]'))) { + const title = normalize(link.textContent); + const url = abs(link.getAttribute('href') || ''); + if (!title || !url || seen.has(url)) continue; + seen.add(url); + seeds.push({ rank: seeds.length + 1, title, url }); + if (seeds.length >= limit) break; + } + + const results = []; + for (const item of seeds) { + let merged = { + rank: item.rank, + articleId: item.url.match(/blog_([a-zA-Z0-9]+)\\.html/)?.[1] || '', + title: item.title, + author: '', + date: '', + readCount: '', + description: '', + url: item.url, + }; + try { + const resp = await fetch(item.url, { credentials: 'include' }); + if (resp.ok) { + const html = await resp.text(); + const doc = new DOMParser().parseFromString(html, 'text/html'); + merged = Object.assign(merged, parseArticle(doc, item)); + } + } catch {} + results.push(merged); + } + return results; + })() + `); + + return Array.isArray(data) ? 
data : []; +} + +/** Searches Sina Blog through the browser. Opens the URL from buildSinaBlogSearchUrl(keyword) (defined outside this view), polls up to 20 times at 500 ms for a .result-item element, then returns at most limit (after clampLimit) rows of rank, title, author, date, description (first 150 chars) and url. Items without a matching title link are skipped; the url is the raw href and is not resolved against a base. Resolves to an empty array when evaluate does not return an array. */ export async function loadSinaBlogSearch(page: IPage, keyword: string, limit: number): Promise { + const safeLimit = clampLimit(limit); + await page.goto(buildSinaBlogSearchUrl(keyword)); + await page.wait(5); + const data = await page.evaluate(` + (async () => { + const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + for (let i = 0; i < 20; i += 1) { + if (document.querySelector('.result-item')) break; + await sleep(500); + } + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const limit = ${safeLimit}; + const items = Array.from(document.querySelectorAll('.result-item')); + const results = []; + for (const item of items) { + const link = item.querySelector('.result-title a[href*="blog.sina.com.cn/s/blog_"]'); + const title = normalize(link?.textContent); + const url = link?.getAttribute('href') || ''; + if (!title || !url) continue; + results.push({ + rank: results.length + 1, + title, + author: normalize(item.querySelector('.result-meta .source')?.textContent), + date: normalize(item.querySelector('.result-meta .time')?.textContent), + description: normalize(item.querySelector('.result-intro')?.textContent).slice(0, 150), + url, + }); + if (results.length >= limit) break; + } + return results; + })() + `); + + return Array.isArray(data) ? 
data : []; +} + +/** Lists articles from one Sina Blog user page. Opens the URL from buildSinaBlogUserUrl(uid) (defined outside this view) and reads the .articleList .articleCell rows, returning at most limit (after clampLimit) entries. author is the second non-empty underscore-separated part of document.title; readCount and description are always empty strings. Resolves to an empty array when evaluate does not return an array. */ export async function loadSinaBlogUser(page: IPage, uid: string, limit: number): Promise { + const safeLimit = clampLimit(limit); + await page.goto(buildSinaBlogUserUrl(uid)); + await page.wait(3); + const data = await page.evaluate(` + (async () => { + await new Promise((resolve) => setTimeout(resolve, 1000)); + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const limit = ${safeLimit}; + const author = normalize(document.title).split('_').map((part) => normalize(part)).filter(Boolean)[1] || ''; + const abs = (href) => { + if (!href) return ''; + if (href.startsWith('//')) return 'https:' + href; + if (href.startsWith('http')) return href; + return 'https://blog.sina.com.cn' + (href.startsWith('/') ? '' : '/') + href; + }; + const results = []; + for (const item of Array.from(document.querySelectorAll('.articleList .articleCell'))) { + const link = item.querySelector('.atc_title a[href*="/s/blog_"]'); + const title = normalize(link?.textContent); + const url = abs(link?.getAttribute('href') || ''); + if (!title || !url) continue; + results.push({ + rank: results.length + 1, + articleId: url.match(/blog_([a-zA-Z0-9]+)\\.html/)?.[1] || '', + title, + author, + date: normalize(item.querySelector('.atc_tm')?.textContent), + readCount: '', + description: '', + url, + }); + if (results.length >= limit) break; + } + return results; + })() + `); + + return Array.isArray(data) ? 
data : []; +} diff --git a/src/clis/sinablog/user.ts b/src/clis/sinablog/user.ts new file mode 100644 index 00000000..6096217b --- /dev/null +++ b/src/clis/sinablog/user.ts @@ -0,0 +1,16 @@ +import { cli, Strategy } from '../../registry.js'; +import { loadSinaBlogUser } from './shared.js'; + +/* Registers the sinablog user command. Number(args.limit) || 20 turns a missing, zero or non-numeric limit into 20. NOTE(review): readCount is listed as a column but loadSinaBlogUser always leaves it empty. */ cli({ + site: 'sinablog', + name: 'user', + description: '获取新浪博客用户的文章列表', + domain: 'blog.sina.com.cn', + strategy: Strategy.COOKIE, + args: [ + { name: 'uid', required: true, help: '新浪博客用户ID(如 1234567890)' }, + { name: 'limit', type: 'int', default: 20, help: '返回的文章数量' }, + ], + columns: ['rank', 'title', 'author', 'date', 'readCount', 'url'], + func: async (page, args) => loadSinaBlogUser(page, args.uid, Number(args.limit) || 20), +}); diff --git a/src/clis/substack/feed.ts b/src/clis/substack/feed.ts new file mode 100644 index 00000000..5f861e49 --- /dev/null +++ b/src/clis/substack/feed.ts @@ -0,0 +1,16 @@ +import { cli, Strategy } from '../../registry.js'; +import { buildSubstackBrowseUrl, loadSubstackFeed } from './shared.js'; + +/* Registers the substack feed command. The category is turned into a browse URL by buildSubstackBrowseUrl, and a missing, zero or non-numeric limit becomes 20. */ cli({ + site: 'substack', + name: 'feed', + description: 'Substack 热门文章 Feed', + domain: 'substack.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'category', default: 'all', help: '文章分类: all, tech, business, culture, politics, science, health' }, + { name: 'limit', type: 'int', default: 20, help: '返回的文章数量' }, + ], + columns: ['rank', 'title', 'author', 'date', 'readTime', 'url'], + func: async (page, args) => loadSubstackFeed(page, buildSubstackBrowseUrl(args.category), Number(args.limit) || 20), +}); diff --git a/src/clis/substack/publication.ts b/src/clis/substack/publication.ts new file mode 100644 index 00000000..5caa4b39 --- /dev/null +++ b/src/clis/substack/publication.ts @@ -0,0 +1,16 @@ +import { cli, Strategy } from '../../registry.js'; +import { loadSubstackArchive } from './shared.js'; + +/* Registers the substack publication command, which lists posts from the archive page of the given newsletter URL. */ cli({ + site: 'substack', + name: 'publication', + description: '获取特定 Substack Newsletter 的最新文章', + domain: 'substack.com', + strategy: 
Strategy.COOKIE, + args: [ + { name: 'url', required: true, help: 'Newsletter URL(如 https://example.substack.com)' }, + { name: 'limit', type: 'int', default: 20, help: '返回的文章数量' }, + ], + columns: ['rank', 'title', 'date', 'description', 'url'], + /* Strips one trailing slash from the URL before handing it to loadSubstackArchive; a missing, zero or non-numeric limit becomes 20. */ func: async (page, args) => loadSubstackArchive(page, args.url.replace(/\/$/, ''), Number(args.limit) || 20), +}); diff --git a/src/clis/substack/search.ts b/src/clis/substack/search.ts new file mode 100644 index 00000000..c447d08e --- /dev/null +++ b/src/clis/substack/search.ts @@ -0,0 +1,91 @@ +import { cli, Strategy } from '../../registry.js'; + +/** Row shape for search results. NOTE(review): the rows built by searchPosts and searchPublications also carry a rank field that is not declared here. */ type SubstackPostResult = { + title: string; + author: string; + date: string; + description: string; + url: string; +}; + +/** Headers sent with both search requests: a generic User-Agent and a JSON Accept. */ function headers(): HeadersInit { + return { + 'User-Agent': 'Mozilla/5.0', + Accept: 'application/json', + }; +} + +/** Collapses every whitespace run to a single space and trims the ends; anything that is not a string becomes an empty string. */ function trim(value: unknown): string { + return typeof value === 'string' ? value.replace(/\s+/g, ' ').trim() : ''; +} + +/** Builds the public base URL of a publication: the custom domain when present, otherwise the substack.com subdomain, otherwise an empty string. */ function publicationBaseUrl(publication: any): string { + if (publication?.custom_domain) return `https://${publication.custom_domain}`; + if (publication?.subdomain) return `https://${publication.subdomain}.substack.com`; + return ''; +} + +/** Searches posts through the Substack post search endpoint (first page only, platform results included). Throws an Error carrying the HTTP status when the response is not OK; otherwise maps the first limit results to rows. A payload without a results array yields an empty list. */ async function searchPosts(keyword: string, limit: number): Promise { + const url = new URL('https://substack.com/api/v1/post/search'); + url.searchParams.set('query', keyword); + url.searchParams.set('page', '0'); + url.searchParams.set('includePlatformResults', 'true'); + + const resp = await fetch(url, { headers: headers() }); + if (!resp.ok) throw new Error(`Substack post search failed: HTTP ${resp.status}`); + + const data = await resp.json() as { results?: any[] }; + const results = Array.isArray(data?.results) ? 
data.results : []; + return results.slice(0, limit).map((item, index) => ({ + rank: index + 1, + title: trim(item?.title), + author: trim(item?.publishedBylines?.[0]?.name), + /* Keeps only the part of post_date in front of the first T; the fallback only matters when that part is empty. */ date: trim(item?.post_date).split('T')[0] || trim(item?.post_date), + description: trim(item?.description || item?.subtitle || item?.truncated_body_text).slice(0, 150), + url: trim(item?.canonical_url), + })); +} + +/** Searches profiles through the Substack profile search endpoint (first page only) and reports the publication attached to each hit: primaryPublication when set, otherwise the publication of the first publicationUsers entry. Throws an Error carrying the HTTP status when the response is not OK. date is always an empty string for these rows, and url is empty when the publication has neither a custom domain nor a subdomain. */ async function searchPublications(keyword: string, limit: number): Promise { + const url = new URL('https://substack.com/api/v1/profile/search'); + url.searchParams.set('query', keyword); + url.searchParams.set('page', '0'); + + const resp = await fetch(url, { headers: headers() }); + if (!resp.ok) throw new Error(`Substack publication search failed: HTTP ${resp.status}`); + + const data = await resp.json() as { results?: any[] }; + const results = Array.isArray(data?.results) ? data.results : []; + return results.slice(0, limit).map((item, index) => { + const publication = item?.primaryPublication || item?.publicationUsers?.[0]?.publication || {}; + return { + rank: index + 1, + title: trim(publication?.name || item?.name), + author: trim(item?.name), + date: '', + description: trim(publication?.hero_text || item?.bio).slice(0, 150), + url: publicationBaseUrl(publication), + }; + }); +} + +/* Registers the substack search command. It uses the public strategy with browser set to false, and the page argument is unused. */ cli({ + site: 'substack', + name: 'search', + description: '搜索 Substack 文章和 Newsletter', + domain: 'substack.com', + strategy: Strategy.PUBLIC, + browser: false, + args: [ + { name: 'keyword', required: true, help: '搜索关键词' }, + { name: 'type', default: 'posts', choices: ['posts', 'publications'], help: '搜索类型(posts=文章, publications=Newsletter)' }, + { name: 'limit', type: 'int', default: 20, help: '返回结果数量' }, + ], + columns: ['rank', 'title', 'author', 'date', 'description', 'url'], + func: async (_page, args) => { + /* A missing, zero or non-numeric limit becomes 20 first; the result is then clamped to the range 1 to 50. */ const limit = Math.max(1, Math.min(Number(args.limit) || 20, 50)); + return args.type === 'publications' + ? 
searchPublications(args.keyword, limit) + : searchPosts(args.keyword, limit); + }, +}); diff --git a/src/clis/substack/shared.ts b/src/clis/substack/shared.ts new file mode 100644 index 00000000..08673c9e --- /dev/null +++ b/src/clis/substack/shared.ts @@ -0,0 +1,132 @@ +import type { IPage } from '../../types.js'; + +/** Maps a feed category to a Substack URL. A missing category or the value all gives the site root; tech is rewritten to technology; any other value is used as the browse slug unchanged (it is not URL-encoded). */ export function buildSubstackBrowseUrl(category?: string): string { + if (!category || category === 'all') return 'https://substack.com/'; + const slug = category === 'tech' ? 'technology' : category; + return `https://substack.com/browse/${slug}`; +} + +/** Scrapes post cards from a Substack listing page. Throws when page is falsy. After loading url, every anchor whose href contains /home/post/ or /p/ is considered, and an anchor is kept only if one of its text lines contains read, watch or listen as a whole word. That line is split on the bullet separator into author and readTime; the title is the second remaining content line when there are at least two (otherwise the first), and later lines become the description (first 150 chars). The limit is clamped to the range 1 to 50 before it is written into the page script. NOTE(review): a NaN limit stays NaN through Math.min and Math.max and would disable the cap; confirm callers always pass a number. Resolves to an empty array when evaluate does not return an array. */ export async function loadSubstackFeed(page: IPage, url: string, limit: number): Promise { + if (!page) throw new Error('Requires browser session'); + await page.goto(url); + await page.wait(5); + const data = await page.evaluate(` + (async () => { + await new Promise((resolve) => setTimeout(resolve, 3000)); + const limit = ${Math.max(1, Math.min(limit, 50))}; + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const posts = []; + const seen = new Set(); + + const allLinks = Array.from(document.querySelectorAll('a')).filter((link) => { + const href = link.getAttribute('href') || ''; + return href.includes('/home/post/') || href.includes('/p/'); + }); + + for (const linkEl of allLinks) { + let postUrl = linkEl.getAttribute('href') || ''; + if (!postUrl) continue; + if (!postUrl.startsWith('http')) postUrl = 'https://substack.com' + postUrl; + if (seen.has(postUrl)) continue; + + const lines = (linkEl.innerText || '') + .split('\\n') + .map((line) => normalize(line)) + .filter(Boolean); + + const readMeta = lines.find((line) => /\\b(read|watch|listen)\\b/i.test(line)) || ''; + if (!readMeta) continue; + + const date = lines.find((line) => /^[A-Z]{3}\\s+\\d{1,2}$/i.test(line)) || ''; + const contentLines = lines.filter((line) => + line && + line !== date && + line !== readMeta && + line.toLowerCase() !== 'save' && + line.toLowerCase() !== 'more' && + !/^(sign in|create 
account|get app)$/i.test(line), + ); + + const metaParts = readMeta.split('∙').map((part) => normalize(part)); + const author = metaParts[0] || ''; + const readTime = metaParts.slice(1).join(' ∙ ') || readMeta; + const title = contentLines.length >= 2 ? contentLines[1] : (contentLines[0] || ''); + const description = contentLines.length >= 3 ? contentLines.slice(2).join(' ') : ''; + if (!title) continue; + + seen.add(postUrl); + posts.push({ + rank: posts.length + 1, + title, + author, + date, + readTime, + description: description.slice(0, 150), + url: postUrl, + }); + + if (posts.length >= limit) break; + } + + return posts; + })() + `); + + return Array.isArray(data) ? data : []; +} + +/** Lists posts from the archive page of one publication. Throws when page is falsy. Opens baseUrl followed by /archive and gathers the texts of all links whose href contains /p/, grouped by post URL; the /p/upgrade link, a fixed list of labels such as subscribe or about, and purely numeric texts are skipped. For each URL the shortest text longer than 3 chars becomes the title and the next different text the description (first 150 chars). The date is the first match of three letters followed by a day number in the text of the closest article, section or div. Relative hrefs are prefixed with baseUrl, and the limit is clamped to the range 1 to 50. Resolves to an empty array when evaluate does not return an array. */ export async function loadSubstackArchive(page: IPage, baseUrl: string, limit: number): Promise { + if (!page) throw new Error('Requires browser session'); + await page.goto(`${baseUrl}/archive`); + await page.wait(5); + const data = await page.evaluate(` + (async () => { + await new Promise((resolve) => setTimeout(resolve, 3000)); + const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const limit = ${Math.max(1, Math.min(limit, 50))}; + const grouped = new Map(); + + for (const link of Array.from(document.querySelectorAll('a[href*="/p/"]'))) { + const rawHref = link.getAttribute('href') || ''; + if (!rawHref || rawHref === '/p/upgrade') continue; + + const url = rawHref.startsWith('http') ? 
rawHref : ${JSON.stringify(baseUrl)} + rawHref; + const text = normalize(link.textContent); + if (!text) continue; + if (/^(subscribe|paid|home|about|latest|top|discussions)$/i.test(text)) continue; + if (/^[\\d,]+$/.test(text)) continue; + + const entry = grouped.get(url) || { texts: new Set(), date: '' }; + entry.texts.add(text); + + const container = link.closest('article, section, div') || link.parentElement || link; + const containerText = normalize(container.textContent); + if (!entry.date) { + entry.date = containerText.match(/\\b(?:[A-Z]{3}\\s+\\d{1,2}|[A-Z][a-z]{2}\\s+\\d{1,2})\\b/)?.[0] || ''; + } + + grouped.set(url, entry); + } + + const posts = []; + for (const [url, entry] of Array.from(grouped.entries())) { + const texts = Array.from(entry.texts).map((text) => normalize(text)).filter((text) => text.length > 3).sort((a, b) => a.length - b.length); + const title = texts[0] || ''; + const description = texts.find((text) => text !== title) || ''; + if (!title) continue; + posts.push({ + rank: posts.length + 1, + title, + date: entry.date, + description: description.slice(0, 150), + url, + }); + if (posts.length >= limit) break; + } + + return posts; + })() + `); + + return Array.isArray(data) ? 
data : []; +} diff --git a/src/pipeline/executor.test.ts b/src/pipeline/executor.test.ts index 059def6e..19396111 100644 --- a/src/pipeline/executor.test.ts +++ b/src/pipeline/executor.test.ts @@ -16,8 +16,7 @@ function createMockPage(overrides: Partial = {}): IPage { click: vi.fn(), typeText: vi.fn(), pressKey: vi.fn(), - scrollTo: vi.fn().mockResolvedValue(undefined), - getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }), + getFormState: vi.fn().mockResolvedValue({}), wait: vi.fn(), tabs: vi.fn().mockResolvedValue([]), closeTab: vi.fn(), @@ -26,6 +25,7 @@ function createMockPage(overrides: Partial = {}): IPage { networkRequests: vi.fn().mockResolvedValue([]), consoleMessages: vi.fn().mockResolvedValue(''), scroll: vi.fn(), + scrollTo: vi.fn(), autoScroll: vi.fn(), installInterceptor: vi.fn(), getInterceptedRequests: vi.fn().mockResolvedValue([]), @@ -81,6 +81,32 @@ describe('executePipeline', () => { ]); }); + it('runs inline select inside map step', async () => { + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue({ + posts: [ + { title: 'First', rank: 1 }, + { title: 'Second', rank: 2 }, + ], + }), + }); + const result = await executePipeline(page, [ + { evaluate: 'test' }, + { + map: { + select: 'posts', + title: '${{ item.title }}', + rank: '${{ item.rank }}', + }, + }, + ]); + + expect(result).toEqual([ + { title: 'First', rank: 1 }, + { title: 'Second', rank: 2 }, + ]); + }); + it('executes limit step', async () => { const page = createMockPage({ evaluate: vi.fn().mockResolvedValue([1, 2, 3, 4, 5]), diff --git a/src/pipeline/steps/transform.ts b/src/pipeline/steps/transform.ts index cc909009..fa45dafe 100644 --- a/src/pipeline/steps/transform.ts +++ b/src/pipeline/steps/transform.ts @@ -20,13 +20,25 @@ export async function stepSelect(_page: any, params: any, data: any, args: Recor export async function stepMap(_page: any, params: any, data: any, args: Record): Promise { if (!data || typeof data !== 'object') return 
data; - let items: any[] = Array.isArray(data) ? data : [data]; - if (!Array.isArray(data) && typeof data === 'object' && 'data' in data) items = data.data; + /* Rows are built from source: the step input itself, or the result of the inline select when params carries one. */ let source = data; + + // Support inline select: { map: { select: 'path', key: '${{ item.x }}' } } + if (params && typeof params === 'object' && 'select' in params) { + source = await stepSelect(null, (params as any).select, data, args); + } + + /* A select that yields nothing or a non-object is returned as is, without mapping. */ if (!source || typeof source !== 'object') return source; + + let items: any[] = Array.isArray(source) ? source : [source]; + /* A non-array object that has a data property is unwrapped to that property. */ if (!Array.isArray(source) && typeof source === 'object' && 'data' in source) items = source.data; const result: any[] = []; for (let i = 0; i < items.length; i++) { const item = items[i]; const row: Record = {}; - for (const [key, template] of Object.entries(params)) row[key] = render(template, { args, data, item, index: i }); + for (const [key, template] of Object.entries(params)) { + /* select is the inline-select directive, not an output field. */ if (key === 'select') continue; + /* Templates see the selected source as data, not the original step input. */ row[key] = render(template, { args, data: source, item, index: i }); + } result.push(row); } return result; diff --git a/src/pipeline/template.test.ts b/src/pipeline/template.test.ts index ef02d8b2..266b4e99 100644 --- a/src/pipeline/template.test.ts +++ b/src/pipeline/template.test.ts @@ -57,6 +57,15 @@ describe('evalExpr', () => { it('resolves simple path', () => { expect(evalExpr('item.title', { item: { title: 'Test' } })).toBe('Test'); }); + it('evaluates JS helper expressions', () => { + expect(evalExpr('encodeURIComponent(args.keyword)', { args: { keyword: 'hello world' } })).toBe('hello%20world'); + }); + it('evaluates ternary expressions', () => { + expect(evalExpr("args.kind === 'tech' ? 'technology' : args.kind", { args: { kind: 'tech' } })).toBe('technology'); + }); + it('evaluates method calls on values', () => { + expect(evalExpr("args.username.startsWith('@') ? 
args.username : '@' + args.username", { args: { username: 'alice' } })).toBe('@alice'); + }); it('applies join filter', () => { expect(evalExpr('item.tags | join(,)', { item: { tags: ['a', 'b', 'c'] } })).toBe('a,b,c'); }); @@ -104,6 +113,15 @@ describe('render', () => { it('renders URL template', () => { expect(render('https://api.example.com/search?q=${{ args.keyword }}', { args: { keyword: 'test' } })).toBe('https://api.example.com/search?q=test'); }); + it('renders inline helper expressions', () => { + expect(render('https://example.com/search?q=${{ encodeURIComponent(args.keyword) }}', { args: { keyword: 'hello world' } })).toBe('https://example.com/search?q=hello%20world'); + }); + it('renders full multiline expressions', () => { + expect(render("${{\n args.topic ? `https://medium.com/tag/${args.topic}` : 'https://medium.com/tag/technology'\n}}", { args: { topic: 'ai' } })).toBe('https://medium.com/tag/ai'); + }); + it('renders block expressions with surrounding whitespace', () => { + expect(render("\n ${{ args.kind === 'tech' ? 'technology' : args.kind }}\n", { args: { kind: 'tech' } })).toBe('technology'); + }); }); describe('normalizeEvaluateSource', () => { diff --git a/src/pipeline/template.ts b/src/pipeline/template.ts index 5081e497..da1ef9a6 100644 --- a/src/pipeline/template.ts +++ b/src/pipeline/template.ts @@ -11,12 +11,13 @@ export interface RenderContext { export function render(template: any, ctx: RenderContext): any { if (typeof template !== 'string') return template; + const trimmed = template.trim(); // Full expression: entire string is a single ${{ ... }} // Use [^}] to prevent matching across }} boundaries (e.g. 
"${{ a }}-${{ b }}") - const fullMatch = template.match(/^\$\{\{\s*([^}]*(?:\}[^}][^}]*)*)\s*\}\}$/); - if (fullMatch && !template.includes('}}-') && !template.includes('}}${{')) return evalExpr(fullMatch[1].trim(), ctx); + const fullMatch = trimmed.match(/^\$\{\{\s*([^}]*(?:\}[^}][^}]*)*)\s*\}\}$/); + if (fullMatch && !trimmed.includes('}}-') && !trimmed.includes('}}${{')) return evalExpr(fullMatch[1].trim(), ctx); // Check if the entire string is a single expression (no other text around it) - const singleExpr = template.match(/^\$\{\{\s*([\s\S]*?)\s*\}\}$/); + const singleExpr = trimmed.match(/^\$\{\{\s*([\s\S]*?)\s*\}\}$/); if (singleExpr) { // Verify it's truly a single expression (no other ${{ inside) const inner = singleExpr[1]; @@ -68,7 +69,10 @@ export function evalExpr(expr: string, ctx: RenderContext): any { return right.replace(/^['"]|['"]$/g, ''); } - return resolvePath(expr, { args, item, data, index }); + /* Plain path lookup first; the JS evaluator below runs only when the lookup gives null or undefined. */ const resolved = resolvePath(expr, { args, item, data, index }); + if (resolved !== null && resolved !== undefined) return resolved; + + return evalJsExpr(expr, { args, item, data, index }); } /** @@ -145,6 +149,10 @@ function applyFilter(filterExpr: string, value: any): any { const parts = value.split(/[/\\]/); return parts[parts.length - 1] || value; } + /* Non-string values pass through unchanged. */ case 'urlencode': + return typeof value === 'string' ? encodeURIComponent(value) : value; + /* NOTE(review): decodeURIComponent throws URIError on malformed percent escapes and nothing in this hunk catches it; confirm a caller does. */ case 'urldecode': + return typeof value === 'string' ? decodeURIComponent(value) : value; default: return value; } @@ -171,6 +179,66 @@ export function resolvePath(pathStr: string, ctx: RenderContext): any { return obj; } +/** + * Evaluate arbitrary JS expressions as a last-resort fallback. + * + * ⚠️ SECURITY NOTE: Uses `new Function()` to execute the expression. + * This is acceptable here because: + * 1. YAML adapters are authored by trusted repo contributors only. + * 2. The expression runs in the same Node.js process (no sandbox). + * 3. 
Only a curated set of globals is exposed (no require/import/process/fs). + * If opencli ever loads untrusted third-party adapters, this MUST be replaced + * with a proper sandboxed evaluator. + */ +function evalJsExpr(expr: string, ctx: RenderContext): any { + // Guard against absurdly long expressions that could indicate injection. + if (expr.length > 2000) return undefined; + + const args = ctx.args ?? {}; + const item = ctx.item ?? {}; + const data = ctx.data; + const index = ctx.index ?? 0; + + try { + /* NOTE(review): passing these names as parameters adds bindings but hides nothing. A function built with new Function still resolves free identifiers against the global scope, so process and globalThis stay reachable and item 3 of the header comment overstates the isolation. Every name is a parameter and the body is one strict-mode return of the expression, so statements are rejected as syntax errors and end up in the catch below. */ const fn = new Function( + 'args', + 'item', + 'data', + 'index', + 'encodeURIComponent', + 'decodeURIComponent', + 'JSON', + 'Math', + 'Number', + 'String', + 'Boolean', + 'Array', + 'Object', + 'Date', + `"use strict"; return (${expr});`, + ); + + return fn( + args, + item, + data, + index, + encodeURIComponent, + decodeURIComponent, + JSON, + Math, + Number, + String, + Boolean, + Array, + Object, + Date, + ); + /* Any syntax or runtime error in the expression is swallowed and reported as undefined. */ } catch { + return undefined; + } +} + /** * Normalize JavaScript source for browser evaluate() calls. */ diff --git a/src/pipeline/transform.test.ts b/src/pipeline/transform.test.ts index b6780011..ff1943c1 100644 --- a/src/pipeline/transform.test.ts +++ b/src/pipeline/transform.test.ts @@ -58,6 +58,19 @@ describe('stepMap', () => { it('returns null/undefined as-is', async () => { expect(await stepMap(null, { x: '${{ item.x }}' }, null, {})).toBeNull(); }); + + it('supports inline select before mapping', async () => { + const result = await stepMap(null, { + select: 'posts', + title: '${{ item.title }}', + rank: '${{ index + 1 }}', + }, { posts: [{ title: 'One' }, { title: 'Two' }] }, {}); + + expect(result).toEqual([ + { title: 'One', rank: 1 }, + { title: 'Two', rank: 2 }, + ]); + }); }); describe('stepFilter', () => {