From bf30cdf1fdb5f027f497eaf0debc352e44066de0 Mon Sep 17 00:00:00 2001 From: KillerQueen-Z <1211904451@qq.com> Date: Thu, 11 Jun 2026 15:45:42 -0700 Subject: [PATCH 1/2] feat: add 'franklin predict' prediction mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A headless command + restricted capability profile for forecasting one real-world event. predictionCapabilities exposes only research tools (web search, webfetch, Exa, search X, prediction markets, market data) — no filesystem, shell, media, swaps, phone or sandbox. 'franklin predict --model M --question Q --json' runs the agent and prints a JSON envelope (finalText + tool trace + usage), so non-interactive callers (e.g. franklin.bet) can drive grounded, tool-using predictions over the CLI. --- src/commands/predict.ts | 155 ++++++++++++++++++++++++++++++++++++++++ src/index.ts | 12 ++++ src/tools/index.ts | 27 +++++++ 3 files changed, 194 insertions(+) create mode 100644 src/commands/predict.ts diff --git a/src/commands/predict.ts b/src/commands/predict.ts new file mode 100644 index 0000000..402c0ad --- /dev/null +++ b/src/commands/predict.ts @@ -0,0 +1,155 @@ +/** + * `franklin predict` — Franklin prediction mode (headless). + * + * Runs ONE model as a disciplined forecaster: it researches a single real-world + * event with a tight, read-only toolset (web search, source fetch, Exa, X, live + * prediction markets, a little market data) the way a bettor would before + * putting money down — then commits to a pick with a confidence. + * + * Designed for machine callers (e.g. BlockRun Oracle): with --json it emits a + * single JSON envelope on stdout containing the model's final answer, the full + * tool-call trace (what it searched and what it found), the terminal reason and + * token usage. Human-readable streaming otherwise. + * + * franklin predict --model anthropic/claude-opus-4.8 \ + * --question "Who wins the 2026 FIFA World Cup? Pick one country." 
--json + */ +import { interactiveSession } from '../agent/loop.js'; +import type { AgentConfig, StreamEvent, StreamTurnDone } from '../agent/types.js'; +import { predictionCapabilities, resetToolSessionState } from '../tools/index.js'; +import { loadChain, API_URLS } from '../config.js'; +import { resolveModel } from '../ui/model-picker.js'; + +export interface PredictOptions { + model?: string; + question?: string; + maxTurns?: string; + maxSpend?: string; + json?: boolean; + debug?: boolean; +} + +const PREDICTION_SYSTEM: string[] = [ + 'You are a sharp, disciplined forecasting analyst — think like a professional who is about to put real money on this question.', + 'Your job: predict the outcome of ONE real-world event. Before answering you MUST do research the way a bettor would:', + "1. Use web_search (and webfetch / exa tools) for the most CURRENT facts and news — today's real-world state matters far more than your training data.", + '2. Use search_prediction_markets to read the CURRENT market-implied odds (Polymarket, Kalshi, etc.) for this or a closely related question.', + '3. Weigh it: where is the consensus, where might the market be mispriced, what is your edge.', + 'Keep tool use focused — a handful of targeted calls, not dozens. When you have enough to decide, STOP researching and answer.', + 'Your FINAL message must end with EXACTLY ONE single-line minified JSON object and NOTHING after it:', + '{"pick": string, "confidence": number, "rationale": string, "analysis": string, "marketOdds": string}', + '- pick: one option from the question (a short label, e.g. a country, party, bucket, or Yes/No).', + '- confidence: your probability (0-1) that THIS pick is correct.', + '- rationale: one sharp sentence (max 22 words).', + '- analysis: 3-5 sentences citing what your research found, the strongest counter-argument, and why you still land here. No literal newlines inside the string.', + "- marketOdds: what the prediction market currently implies (e.g. 
'Polymarket: France 18%'), or 'n/a' if none found.", + 'Be decisive. Do not hedge with "it depends".', +]; + +interface TraceEntry { + tool: string; + input: string; + output: string; + isError?: boolean; +} + +export async function predictCommand(options: PredictOptions): Promise { + const question = options.question?.trim(); + if (!question) { + process.stderr.write('predict: --question is required\n'); + process.exitCode = 1; + return; + } + if (!options.model) { + process.stderr.write('predict: --model is required\n'); + process.exitCode = 1; + return; + } + + const chain = loadChain(); + const apiUrl = API_URLS[chain]; + const model = resolveModel(options.model); + const asJson = options.json !== false; + + resetToolSessionState(); + + const agentConfig: AgentConfig = { + model, + apiUrl, + chain, + systemInstructions: PREDICTION_SYSTEM, + capabilities: predictionCapabilities, + maxTurns: options.maxTurns != null ? Number(options.maxTurns) : 12, + permissionMode: 'trust', + debug: !!options.debug, + showPrefetchStatus: false, + ...(options.maxSpend != null ? 
{ maxSpendUsd: Number(options.maxSpend) } : {}), + }; + + let finalText = ''; + let turnReason: StreamTurnDone['reason'] = 'completed'; + let turnError: string | undefined; + let inputTokens = 0; + let outputTokens = 0; + const trace: TraceEntry[] = []; + const nameById = new Map(); + const inputById = new Map(); + const previewById = new Map(); + + let delivered = false; + const getInput = async (): Promise => { + if (delivered) return null; + delivered = true; + return question; + }; + + await interactiveSession(agentConfig, getInput, (event: StreamEvent) => { + switch (event.kind) { + case 'text_delta': + finalText += event.text; + if (!asJson) process.stdout.write(event.text); + break; + case 'capability_start': + nameById.set(event.id, event.name); + inputById.set(event.id, ''); + if (event.preview) previewById.set(event.id, event.preview); + if (!asJson) process.stderr.write(`\n · ${event.name}${event.preview ? ` ${event.preview}` : ''}\n`); + break; + case 'capability_input_delta': + inputById.set(event.id, (inputById.get(event.id) || '') + event.delta); + break; + case 'capability_done': { + const tool = nameById.get(event.id) || 'tool'; + const input = (inputById.get(event.id) || '').trim() || previewById.get(event.id) || ''; + const output = event.result?.fullOutput || event.result?.output || ''; + trace.push({ tool, input, output: output.slice(0, 1500), isError: event.result?.isError }); + break; + } + case 'usage': + inputTokens = event.inputTokens; + outputTokens = event.outputTokens; + break; + case 'turn_done': + turnReason = event.reason; + turnError = event.error; + break; + } + }); + + if (asJson) { + const envelope = { + model, + question, + finalText: finalText.trim(), + trace, + turnReason, + ...(turnError ? 
{ error: turnError } : {}), + usage: { inputTokens, outputTokens }, + }; + process.stdout.write(JSON.stringify(envelope) + '\n'); + } else if (turnReason !== 'completed' && turnError) { + process.stderr.write(`\n${turnError}\n`); + } + + process.exitCode = turnReason === 'completed' ? 0 : 1; +} diff --git a/src/index.ts b/src/index.ts index 9e7dc7d..46ea431 100644 --- a/src/index.ts +++ b/src/index.ts @@ -29,6 +29,7 @@ import { uninitCommand } from './commands/uninit.js'; import { proxyCommand } from './commands/proxy.js'; import { buildTaskCommand } from './commands/task.js'; import { buildContentCommand } from './commands/content.js'; +import { predictCommand } from './commands/predict.js'; import { VERSION as version } from './config.js'; @@ -90,6 +91,17 @@ program .option('--debug', 'Enable debug logging') .action((options) => proxyCommand({ ...options, version })); +program + .command('predict') + .description('Prediction mode — forecast one real-world event with a research-only toolset (web/markets), headless') + .requiredOption('-m, --model ', 'Model to use (e.g. anthropic/claude-opus-4.8, openai/gpt-5.5)') + .requiredOption('-q, --question ', 'The event question to forecast (include the allowed options)') + .option('--max-turns ', 'Max agent turns before forcing an answer', '12') + .option('--max-spend ', 'Hard USD cap on this prediction run') + .option('--no-json', 'Human-readable streaming instead of a JSON envelope') + .option('--debug', 'Enable debug logging') + .action((options) => predictCommand(options)); + program .command('init') .description('Configure franklin auto-start (writes ~/.claude/settings.json + installs LaunchAgent on macOS)') diff --git a/src/tools/index.ts b/src/tools/index.ts index eb48d4a..e2c34d1 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -234,4 +234,31 @@ export { detachCapability, }; +/** + * "Franklin prediction mode" toolset. 
+ * + * A deliberately tight, research-only capability set for forecasting a single + * real-world event the way a careful bettor would: gather current facts, read + * sources, check live prediction-market odds and a little market data — then + * decide. Everything else (filesystem, shell, media generation, swaps/trade + * execution, phone/voice, GPU sandbox, posting) is intentionally excluded: + * a forecaster looks things up, it does not act on the world or spend beyond + * the cheap read calls these tools make. + * + * Used by the `franklin predict` command and reusable by any headless caller + * (e.g. franklin.bet) that wants a grounded prediction. + */ +export const predictionCapabilities: CapabilityHandler[] = [ + webSearchCapability, // web_search — current news & facts + webFetchCapability, // webfetch — read a specific source URL + exaSearchCapability, // exa search — higher-quality web research + exaAnswerCapability, // exa answer — direct sourced answers + exaReadUrlsCapability, // exa read — pull full text of found URLs + searchXCapability, // search X — live sentiment / breaking signal + predictionMarketCapability, // search_prediction_markets — live implied odds + tradingSignalCapability, // market signal/indicators (for market-type events) + tradingMarketCapability, // market snapshot data + defiLlamaPriceCapability, // token price lookup (crypto-type events) +]; + export { createSubAgentCapability } from './subagent.js'; From a716b598e3cced50390014a7b23d97b637cc5b46 Mon Sep 17 00:00:00 2001 From: KillerQueen-Z <1211904451@qq.com> Date: Thu, 11 Jun 2026 17:22:25 -0700 Subject: [PATCH 2/2] feat(predict): governance for reliable one-shot answers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some models never stop calling tools and hit maxTurns with an EMPTY answer (wasting the spend). 
Adds opt-in agent governance, used by prediction mode: - forceAnswerOnFinalTurn / maxToolCalls: withhold tools once the tool budget (or final turn) is reached, so the model must commit to a text answer. - disableModelFallback: don't silently switch to a different model on empty/ stalled output — a one-shot caller wants a clean abstain from the asked model. - disableGroundingRetry: skip the 'ungrounded claims → force tool use' retry, which fought the forced answer and polluted structured output. predict defaults: maxTurns 8, maxToolCalls 6 (tool budget is the real limiter; turns are slack for a thinking turn + the forced-answer turn). Verified: GPT-5.5 and DeepSeek V4 Pro, which previously returned empty at maxTurns, now answer cleanly. (MiniMax M3 / Kimi K2.6 still fail — upstream OpenRouter function-call defects, not addressed here.) --- src/agent/loop.ts | 20 +++++++++++++++++--- src/agent/types.ts | 24 ++++++++++++++++++++++++ src/commands/predict.ts | 13 +++++++++++-- src/index.ts | 3 ++- 4 files changed, 54 insertions(+), 6 deletions(-) diff --git a/src/agent/loop.ts b/src/agent/loop.ts index beba48d..575d83f 100644 --- a/src/agent/loop.ts +++ b/src/agent/loop.ts @@ -1382,6 +1382,20 @@ export async function interactiveSession( callMaxTokens = 2048; // Short plan output callSystemPrompt = systemPrompt + '\n\n' + getPlanningPrompt(); } + // Force a final answer: withhold tools so the model must commit to text, + // either on the last turn or once the tool-call budget is spent. Without + // this, models that keep calling tools every turn hit maxTurns with no + // answer (and waste the spend). Opt-in per config. + const onFinalTurn = config.forceAnswerOnFinalTurn && loopCount === maxTurns; + const toolBudgetSpent = config.maxToolCalls != null && turnToolCalls >= config.maxToolCalls; + if ((onFinalTurn || toolBudgetSpent) && callToolDefs.length > 0) { + callToolDefs = []; + callSystemPrompt = systemPrompt + '\n\n' + + (toolBudgetSpent + ? 
`You have used your research budget (${config.maxToolCalls} tool calls) — no more tools are available.` + : 'This is your FINAL turn — no more tools are available.') + + ' Based on the research so far, output ONLY the final answer now, in the exact format requested.'; + } // ── Hallucination guard for weak models ── // Weak / free models (nemotron-ultra, GLM-4, qwen coder, free-profile @@ -1492,7 +1506,7 @@ export async function interactiveSession( if (!hasText && !hasTools && !hasThinking) { const EMPTY_FALLBACK_MODELS = ['nvidia/qwen3-coder-480b', 'nvidia/llama-4-maverick', 'zai/glm-5.1']; const nextModel = EMPTY_FALLBACK_MODELS.find(m => m !== config.model && !turnFailedModels.has(m)); - if (nextModel && recoveryAttempts < 2) { + if (nextModel && recoveryAttempts < 2 && !config.disableModelFallback) { recoveryAttempts++; turnFailedModels.add(config.model); const oldModel = config.model; @@ -1540,7 +1554,7 @@ export async function interactiveSession( const nextModel = TOOL_USE_FALLBACK_MODELS.find( m => m !== config.model && !turnFailedModels.has(m), ); - if (nextModel && recoveryAttempts < 2) { + if (nextModel && recoveryAttempts < 2 && !config.disableModelFallback) { recoveryAttempts++; turnFailedModels.add(config.model); const oldModel = config.model; @@ -2126,7 +2140,7 @@ export async function interactiveSession( .filter(p => p.type === 'text' && typeof (p as { text?: string }).text === 'string') .map(p => (p as { text: string }).text) .join(''); - if (shouldCheckGrounding(lastUserInput || '', assistantText)) { + if (!config.disableGroundingRetry && shouldCheckGrounding(lastUserInput || '', assistantText)) { const gResult = await checkGrounding(lastUserInput, history, assistantText, client, { abortSignal: abort.signal, }); diff --git a/src/agent/types.ts b/src/agent/types.ts index 1aaad00..dba7551 100644 --- a/src/agent/types.ts +++ b/src/agent/types.ts @@ -217,6 +217,30 @@ export interface AgentConfig { maxSpendUsd?: number; /** Show user-visible harness 
prefetch status lines (interactive UX only). */ showPrefetchStatus?: boolean; + /** + * On the final turn, withhold tools so the model must commit to a text answer + * instead of researching until cut off. For one-shot forecasting/extraction + * callers (e.g. `franklin predict`) where some models never stop calling tools + * and would otherwise hit maxTurns with no answer. + */ + forceAnswerOnFinalTurn?: boolean; + /** + * Hard cap on total tool calls for the turn. Once reached, tools are withheld + * and the model is forced to answer from what it has. Bounds research/cost + * deterministically (a turn budget alone doesn't — a turn may have no tool). + */ + maxToolCalls?: number; + /** + * Disable Franklin's automatic model-switching (empty-response / stalled-intent + * fallbacks). One-shot callers want a clean abstain from the requested model, + * not a silent switch to a different one. + */ + disableModelFallback?: boolean; + /** + * Disable the post-response "ungrounded claims → force a tool-use retry" guard. + * It fights the forced-answer path and pollutes one-shot structured output. + */ + disableGroundingRetry?: boolean; /** Mid-turn "research-bloat" compaction — summarizes history when a turn * racks up many tool calls + spend, to cut input-replay cost. Default on; * set false to disable (the desktop exposes this as a toggle). */ diff --git a/src/commands/predict.ts b/src/commands/predict.ts index 402c0ad..fe97079 100644 --- a/src/commands/predict.ts +++ b/src/commands/predict.ts @@ -24,6 +24,7 @@ export interface PredictOptions { model?: string; question?: string; maxTurns?: string; + maxToolCalls?: string; maxSpend?: string; json?: boolean; debug?: boolean; @@ -35,7 +36,7 @@ const PREDICTION_SYSTEM: string[] = [ "1. Use web_search (and webfetch / exa tools) for the most CURRENT facts and news — today's real-world state matters far more than your training data.", '2. 
Use search_prediction_markets to read the CURRENT market-implied odds (Polymarket, Kalshi, etc.) for this or a closely related question.', '3. Weigh it: where is the consensus, where might the market be mispriced, what is your edge.', - 'Keep tool use focused — a handful of targeted calls, not dozens. When you have enough to decide, STOP researching and answer.', + 'Budget your research: make AT MOST 4-5 focused tool calls in total. As soon as you have enough to decide, STOP calling tools and output the JSON. Do not keep researching — an answer with light research beats no answer.', 'Your FINAL message must end with EXACTLY ONE single-line minified JSON object and NOTHING after it:', '{"pick": string, "confidence": number, "rationale": string, "analysis": string, "marketOdds": string}', '- pick: one option from the question (a short label, e.g. a country, party, bucket, or Yes/No).', @@ -79,10 +80,18 @@ export async function predictCommand(options: PredictOptions): Promise { chain, systemInstructions: PREDICTION_SYSTEM, capabilities: predictionCapabilities, - maxTurns: options.maxTurns != null ? Number(options.maxTurns) : 12, + maxTurns: options.maxTurns != null ? Number(options.maxTurns) : 8, permissionMode: 'trust', debug: !!options.debug, showPrefetchStatus: false, + // Governance for one-shot forecasting: bound research by tool-call count and + // force an answer; don't silently switch models or fight a grounding retry. + // Tool budget (6) is the real research limiter; maxTurns (8) is just slack + // above it for a thinking turn + the forced-answer turn. + forceAnswerOnFinalTurn: true, + maxToolCalls: options.maxToolCalls != null ? Number(options.maxToolCalls) : 6, + disableModelFallback: true, + disableGroundingRetry: true, ...(options.maxSpend != null ? 
{ maxSpendUsd: Number(options.maxSpend) } : {}), }; diff --git a/src/index.ts b/src/index.ts index 46ea431..7009a4c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -96,7 +96,8 @@ program .description('Prediction mode — forecast one real-world event with a research-only toolset (web/markets), headless') .requiredOption('-m, --model ', 'Model to use (e.g. anthropic/claude-opus-4.8, openai/gpt-5.5)') .requiredOption('-q, --question ', 'The event question to forecast (include the allowed options)') - .option('--max-turns ', 'Max agent turns before forcing an answer', '12') + .option('--max-turns ', 'Max agent turns before forcing an answer', '8') + .option('--max-tool-calls ', 'Max tool calls before forcing an answer', '6') .option('--max-spend ', 'Hard USD cap on this prediction run') .option('--no-json', 'Human-readable streaming instead of a JSON envelope') .option('--debug', 'Enable debug logging')