From fe6e1639395c67fabc03a2b99e807e956f3674c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Wed, 3 Jun 2026 19:10:20 +0800 Subject: [PATCH 1/5] =?UTF-8?q?fix:=20update=20Bridge=20=C3=97=20Orchestra?= =?UTF-8?q?tor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../adapters/openclaw/bridge.ts | 229 ++++++++++--- .../agent-contract/memory-core.ts | 11 + .../core/config/defaults.ts | 2 +- apps/memos-local-plugin/core/config/schema.ts | 6 +- .../core/experience/corrective-signals.ts | 304 ++++++++++++++++++ .../core/experience/failure-builder.ts | 35 +- .../llm/prompts/failure-experience-sink.ts | 19 +- .../core/llm/prompts/reward.ts | 56 +++- .../core/memory/l2/subscriber.ts | 33 +- .../core/pipeline/memory-core.ts | 133 +++----- .../core/pipeline/orchestrator.ts | 43 ++- .../memos-local-plugin/core/pipeline/types.ts | 6 + .../core/reward/ALGORITHMS.md | 12 +- apps/memos-local-plugin/core/reward/README.md | 18 +- .../core/reward/human-scorer.ts | 4 +- apps/memos-local-plugin/core/reward/reward.ts | 14 +- .../core/reward/subscriber.ts | 116 +++---- .../core/reward/task-summary.ts | 72 +++++ apps/memos-local-plugin/core/reward/types.ts | 2 +- .../docs/CONFIG-ADVANCED.md | 2 +- .../tests/e2e/v7-full-chain.e2e.test.ts | 34 +- .../adapters/openclaw-full-chain.test.ts | 21 +- .../unit/adapters/openclaw-bridge.test.ts | 45 +++ .../experience/corrective-signals.test.ts | 115 +++++++ .../tests/unit/pipeline/memory-core.test.ts | 100 +++++- .../pipeline/resolve-open-episode.test.ts | 83 +++++ .../tests/unit/reward/human-scorer.test.ts | 40 ++- .../unit/reward/reward.integration.test.ts | 34 +- .../tests/unit/reward/subscriber.test.ts | 77 +++-- .../tests/unit/reward/task-summary.test.ts | 38 ++- 30 files changed, 1334 insertions(+), 370 deletions(-) create mode 100644 apps/memos-local-plugin/core/experience/corrective-signals.ts create mode 100644 
apps/memos-local-plugin/tests/unit/experience/corrective-signals.test.ts create mode 100644 apps/memos-local-plugin/tests/unit/pipeline/resolve-open-episode.test.ts diff --git a/apps/memos-local-plugin/adapters/openclaw/bridge.ts b/apps/memos-local-plugin/adapters/openclaw/bridge.ts index 58b44912c..62584643c 100644 --- a/apps/memos-local-plugin/adapters/openclaw/bridge.ts +++ b/apps/memos-local-plugin/adapters/openclaw/bridge.ts @@ -801,6 +801,10 @@ const BEFORE_PROMPT_SOFT_TIMEOUT_MS = Number.parseInt( process.env.MEMOS_BEFORE_PROMPT_SOFT_TIMEOUT_MS ?? "12000", 10, ); +const AGENT_END_BINDING_CAP_MS = Math.min( + 60_000, + BEFORE_PROMPT_SOFT_TIMEOUT_MS * 5, +); const TOOL_FAILURE_REPAIR_HINT = "This tool has failed multiple times in a row. You may want to call `memos_search` for relevant past experience before deciding what to do next."; const TOOL_FAILURE_HINT_THRESHOLD = 3; @@ -1022,12 +1026,17 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { // clear its own turn without deleting a newer turn's mapping. type EpisodeBinding = { sessionId: SessionId; - episodeId: EpisodeId; + episodeId?: EpisodeId; seq: number; keys: string[]; + turnStart?: Promise; + turnGeneration: number; + runId?: string; + userText?: string; }; const latestEpisodeBySession = new Map(); const episodeBindingByTurnKey = new Map(); + const sessionTurnGeneration = new Map(); let episodeBindingSeq = 0; // Per-toolCallId start timestamps so `after_tool_call` can compute duration // when the host doesn't populate `durationMs`. 
@@ -1114,20 +1123,69 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { return keys; } - function rememberEpisodeBinding( - sessionId: SessionId, - episodeId: EpisodeId, - ctx: { runId?: string }, - userText: string | undefined, - seq: number = ++episodeBindingSeq, - ): EpisodeBinding { + function bindingStillLive(binding: EpisodeBinding): boolean { + for (const key of binding.keys) { + if (episodeBindingByTurnKey.get(key)?.seq === binding.seq) return true; + } + return false; + } + + function commitEpisodeId( + binding: EpisodeBinding, + pkt: RetrievalResultDTO, + ): void { + if (!bindingStillLive(binding)) return; + const candidate = pkt.query?.episodeId as EpisodeId | undefined; + if (!candidate) return; + const routedSessionId = (pkt.query?.sessionId ?? binding.sessionId) as SessionId; + if (opts.core.episodeExists(candidate)) { + binding.episodeId = candidate; + } else { + const fallback = opts.core.resolveOpenEpisodeId(routedSessionId); + if (fallback) binding.episodeId = fallback; + } + if (routedSessionId !== binding.sessionId) { + for (const key of turnBindingKeys( + routedSessionId, + { runId: binding.runId }, + binding.userText, + )) { + episodeBindingByTurnKey.set(key, binding); + } + } + } + + function rememberEpisodeBinding(input: { + sessionId: SessionId; + ctx: { runId?: string }; + userText: string | undefined; + turnStart?: Promise; + episodeId?: EpisodeId; + turnGeneration: number; + seq?: number; + }): EpisodeBinding { + if (!input.turnStart && input.episodeId === undefined) { + throw new Error( + "rememberEpisodeBinding: turnStart required when episodeId is pending", + ); + } + if (input.ctx.runId === undefined && input.userText?.trim()) { + opts.log.warn("memos.binding.missing_run_id", { + sessionId: input.sessionId, + }); + } + const seq = input.seq ?? 
++episodeBindingSeq; const binding: EpisodeBinding = { - sessionId, - episodeId, + sessionId: input.sessionId, + episodeId: input.episodeId, seq, - keys: turnBindingKeys(sessionId, ctx, userText), + keys: turnBindingKeys(input.sessionId, input.ctx, input.userText), + turnStart: input.turnStart, + turnGeneration: input.turnGeneration, + runId: input.ctx.runId, + userText: input.userText, }; - latestEpisodeBySession.set(sessionId, binding); + latestEpisodeBySession.set(input.sessionId, binding); for (const key of binding.keys) { episodeBindingByTurnKey.set(key, binding); } @@ -1137,17 +1195,20 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { function findEpisodeBinding( sessionId: SessionId, ctx: { runId?: string }, - userText?: string, + userText: string | undefined, + mode: "strict_run" | "legacy" = "legacy", ): EpisodeBinding | undefined { for (const key of turnBindingKeys(sessionId, ctx, userText)) { const binding = episodeBindingByTurnKey.get(key); if (binding) return binding; } + if (mode === "strict_run") return undefined; return latestEpisodeBySession.get(sessionId); } function forgetEpisodeBinding(binding: EpisodeBinding | undefined): void { if (!binding) return; + binding.turnStart = undefined; for (const key of binding.keys) { if (episodeBindingByTurnKey.get(key)?.seq === binding.seq) { episodeBindingByTurnKey.delete(key); @@ -1158,6 +1219,49 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { } } + async function resolveEpisodeIdForTurnEnd( + sessionId: SessionId, + ctx: { runId?: string }, + userText: string, + ): Promise<{ episodeId: EpisodeId; binding: EpisodeBinding | undefined } | null> { + if (!ctx.runId) { + opts.log.warn("memos.agent_end.missing_run_id", { sessionId }); + } + const binding = findEpisodeBinding(sessionId, ctx, userText, "strict_run"); + + if (binding?.episodeId && bindingStillLive(binding)) { + if (opts.core.episodeExists(binding.episodeId)) { + return { episodeId: 
binding.episodeId, binding }; + } + } + + if (binding?.turnStart) { + const awaited = await withSoftTimeout(binding.turnStart, AGENT_END_BINDING_CAP_MS); + if (awaited.ok) { + commitEpisodeId(binding, awaited.value); + const id = binding.episodeId; + if (id && opts.core.episodeExists(id)) { + return { episodeId: id, binding }; + } + } else { + opts.log.warn("memos.agent_end.turn_start_await_timeout", { + sessionId, + timeoutMs: AGENT_END_BINDING_CAP_MS, + }); + } + } + + const opened = await opts.core.openEpisode({ + sessionId, + userMessage: userText, + }); + const canonical = opts.core.reconcileEpisodeId(sessionId, opened); + if (canonical && opts.core.isEpisodeWritable(canonical)) { + return { episodeId: canonical, binding }; + } + return null; + } + function forgetSessionBindings(sessionId: SessionId): void { latestEpisodeBySession.delete(sessionId); for (const [key, binding] of episodeBindingByTurnKey.entries()) { @@ -1166,7 +1270,9 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { } function currentEpisodeId(sessionId: SessionId): EpisodeId | undefined { - return latestEpisodeBySession.get(sessionId)?.episodeId; + const id = latestEpisodeBySession.get(sessionId)?.episodeId; + if (!id) return undefined; + return opts.core.isEpisodeWritable(id) ? id : undefined; } async function handleBeforePrompt( @@ -1239,12 +1345,28 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { }, }; + const turnGeneration = (sessionTurnGeneration.get(sessionId) ?? 0) + 1; + sessionTurnGeneration.set(sessionId, turnGeneration); + const turnStartPromise = opts.core.onTurnStart(turn); - turnStartPromise.catch((err) => { - opts.log.warn("memos.onTurnStart.late_failure", { - err: err instanceof Error ? 
err.message : String(err), - }); + const binding = rememberEpisodeBinding({ + sessionId, + ctx, + userText: prompt, + turnStart: turnStartPromise, + turnGeneration, }); + + turnStartPromise + .then((packet) => { + commitEpisodeId(binding, packet); + }) + .catch((err) => { + opts.log.warn("memos.onTurnStart.late_failure", { + err: err instanceof Error ? err.message : String(err), + }); + }); + const turnStartResult = await withSoftTimeout( turnStartPromise, BEFORE_PROMPT_SOFT_TIMEOUT_MS, @@ -1256,20 +1378,14 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { agentId: ctx.agentId, timeoutMs: BEFORE_PROMPT_SOFT_TIMEOUT_MS, }); + } else if (packet) { + commitEpisodeId(binding, packet); } - // The pipeline orchestrator (V7 §0.1) may have migrated the - // session id (new-task → new session) or reopened a closed - // episode (revision). We trust the ids returned in the packet, - // not our own derivation, so `onTurnEnd` lands on the same row. + const routedSessionId = (packet?.query.sessionId ?? sessionId) as SessionId; - const routedEpisodeId = packet?.query.episodeId as EpisodeId | undefined; - if (routedEpisodeId) { - const seq = ++episodeBindingSeq; - rememberEpisodeBinding(routedSessionId, routedEpisodeId, ctx, prompt, seq); - if (routedSessionId !== sessionId) { - rememberEpisodeBinding(sessionId, routedEpisodeId, ctx, prompt, seq); - } - } + const routedEpisodeId = + binding.episodeId ?? + (packet?.query.episodeId as EpisodeId | undefined); const renderedBlock = renderContextBlock(packet, { // Avoid making OpenClaw do a second tool-driven search when @@ -1397,30 +1513,51 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { const isSubagentAnnouncement = isOpenClawSubagentAnnouncementPrompt(turn.userText); const hasSubagentSpawn = toolCalls.some((tc) => tc.name === "sessions_spawn"); - // Resolve (or lazily open) the target episode. Three cases: - // 1. 
`before_prompt_build` already ran this turn → we have the - // routed episode binding for this run/user turn. - // 2. The host skipped `before_prompt_build` (e.g. /new with no - // prompt build) → create an episode on the fly so the write - // path has a real row to hang traces on. - // 3. Any failure here falls back to opening a new episode — - // better to capture under a fresh id than to drop the turn. - let binding = findEpisodeBinding(sessionId, ctx, turn.userText); - let episodeId = binding?.episodeId; - if (!episodeId) { - if (isSubagentAnnouncement) { + if (isSubagentAnnouncement) { + const probe = findEpisodeBinding( + sessionId, + ctx, + turn.userText, + "strict_run", + ); + if (!probe?.episodeId && !probe?.turnStart) { opts.log.info("memos.agent_end.skipped", { reason: "subagent_announcement_without_parent_episode", sessionKey: ctx.sessionKey, }); return; } - await opts.core.openSession({ agent: opts.agent, sessionId, namespace, meta: { namespace } }); - episodeId = await opts.core.openEpisode({ + } + + await opts.core.openSession({ + agent: opts.agent, + sessionId, + namespace, + meta: { namespace }, + }); + + const resolved = await resolveEpisodeIdForTurnEnd( + sessionId, + ctx, + turn.userText, + ); + if (!resolved) { + opts.log.warn("memos.agent_end.skipped", { + reason: "no_writable_episode", + sessionKey: ctx.sessionKey, + }); + return; + } + const { episodeId } = resolved; + let binding = resolved.binding; + if (!binding) { + binding = rememberEpisodeBinding({ sessionId, - userMessage: turn.userText, + ctx, + userText: turn.userText, + episodeId, + turnGeneration: sessionTurnGeneration.get(sessionId) ?? 
0, }); - binding = rememberEpisodeBinding(sessionId, episodeId, ctx, turn.userText); } const turnResult: TurnResultDTO = { diff --git a/apps/memos-local-plugin/agent-contract/memory-core.ts b/apps/memos-local-plugin/agent-contract/memory-core.ts index 9c3ad826c..77dd9c76e 100644 --- a/apps/memos-local-plugin/agent-contract/memory-core.ts +++ b/apps/memos-local-plugin/agent-contract/memory-core.ts @@ -180,6 +180,17 @@ export interface MemoryCore { /** Optional initial user text (for adapters that know it). */ userMessage?: string; }): Promise; + /** Current open episode for the session, if any (no new row). */ + resolveOpenEpisodeId(sessionId: SessionId): EpisodeId | undefined; + /** Whether the episode exists and is still `open`. */ + isEpisodeWritable(episodeId: EpisodeId): boolean; + /** Whether the episode row exists (any status). */ + episodeExists(episodeId: EpisodeId): boolean; + /** Prefer orchestrator open episode; else a writable candidate. */ + reconcileEpisodeId( + sessionId: SessionId, + candidate?: EpisodeId, + ): EpisodeId | undefined; closeEpisode(episodeId: EpisodeId): Promise; // ── pipeline (per turn) ── diff --git a/apps/memos-local-plugin/core/config/defaults.ts b/apps/memos-local-plugin/core/config/defaults.ts index cf8801dd3..0fad2d956 100644 --- a/apps/memos-local-plugin/core/config/defaults.ts +++ b/apps/memos-local-plugin/core/config/defaults.ts @@ -97,7 +97,7 @@ export const DEFAULT_CONFIG: ResolvedConfig = { // user a short window to reply ("thanks", "no, try again") that // the scorer picks up as explicit feedback; when nothing // arrives, the implicit fallback fires promptly so downstream - // L2/L3/Skill stages aren't starved of signal. + // L2/L3/Skill stages aren't starved of signal. Must not be 0 (min 1s). 
feedbackWindowSec: 30, summaryMaxChars: 2_000, llmConcurrency: 2, diff --git a/apps/memos-local-plugin/core/config/schema.ts b/apps/memos-local-plugin/core/config/schema.ts index 71c1ef362..5bad29414 100644 --- a/apps/memos-local-plugin/core/config/schema.ts +++ b/apps/memos-local-plugin/core/config/schema.ts @@ -160,10 +160,10 @@ const AlgorithmSchema = Type.Object({ /** Auto-trigger backprop when R_human ≥ this from implicit signals. */ implicitThreshold: NumberInRange(0.2, 0, 1), /** - * Seconds to wait for explicit user feedback after `capture.done` before - * falling back to implicit-signals scoring. 0 disables the timer. + * Seconds to wait after `capture.done` before episode-level scoring. + * Minimum 1 second; values below 1 are rejected or clamped to 1. */ - feedbackWindowSec: NumberInRange(600, 0, 86_400), + feedbackWindowSec: NumberInRange(600, 1, 86_400), /** Max characters for the task summary fed into the human-scorer LLM. */ summaryMaxChars: NumberInRange(2_000, 200, 16_000), /** Concurrency for human-scoring LLM calls. */ diff --git a/apps/memos-local-plugin/core/experience/corrective-signals.ts b/apps/memos-local-plugin/core/experience/corrective-signals.ts new file mode 100644 index 000000000..a7d8cdaa8 --- /dev/null +++ b/apps/memos-local-plugin/core/experience/corrective-signals.ts @@ -0,0 +1,304 @@ +import type { EpisodeId, FeedbackId, FeedbackRow, TraceId, TraceRow } from "../types.js"; +import { feedbackText } from "./feedback-builder.js"; + +/** How far (ms) feedback.ts may sit from an anchored trace.ts and still count as "at turn end". 
*/ +const AT_TURN_END_MS = 5_000; + +export type CorrectiveSignalKind = "human_feedback" | "verifier_directives"; + +export type CorrectiveTiming = + | "at_turn_end" + | "after_turn" + | "before_first_turn" + | "after_last_turn" + | "between_turns" + | "unanchored"; + +export interface EpisodeTurnTimeline { + turn_index: number; + turn_id: number; + trace_ids: TraceId[]; + started_at: number; + ended_at: number; + user_preview: string; +} + +export interface CorrectiveSignalEntry { + feedback_id: FeedbackId; + submitted_at: number; + channel: FeedbackRow["channel"]; + polarity: FeedbackRow["polarity"]; + kind: CorrectiveSignalKind; + text: string; + trace_id: TraceId | null; + turn_index: number | null; + timing: CorrectiveTiming; + /** Human-readable anchor for the LLM (includes turn index and deltas). */ + timing_label: string; + delta_ms_after_turn_end: number | null; + delta_ms_after_episode_start: number; + nearest_trace_id: TraceId | null; + nearest_trace_ts: number | null; +} + +export interface CorrectiveSignalsPayload { + episode_timeline: { + episode_id: EpisodeId; + trace_span: { first_ts: number; last_ts: number }; + turns: EpisodeTurnTimeline[]; + }; + corrective_signals: CorrectiveSignalEntry[]; +} + +interface TurnBucket { + turnIndex: number; + turnId: number; + traceIds: TraceId[]; + startedAt: number; + endedAt: number; + userPreview: string; +} + +export function buildCorrectiveSignalsForSink( + episodeId: EpisodeId, + traces: readonly TraceRow[], + feedbacks: readonly FeedbackRow[], +): CorrectiveSignalsPayload { + const ordered = [...traces].sort((a, b) => a.ts - b.ts); + const turns = buildTurnBuckets(ordered); + const turnById = new Map(turns.map((t) => [t.turnId, t])); + const traceById = new Map(ordered.map((t) => [t.id, t])); + const episodeStart = ordered[0]?.ts ?? 0; + const episodeEnd = ordered[ordered.length - 1]?.ts ?? 
episodeStart; + + const substantive = feedbacks + .filter(isSubstantiveFeedback) + .sort((a, b) => a.ts - b.ts); + + const corrective_signals = substantive.map((fb) => + anchorFeedback(fb, turns, turnById, traceById, episodeStart, episodeEnd), + ); + + return { + episode_timeline: { + episode_id: episodeId, + trace_span: { first_ts: episodeStart, last_ts: episodeEnd }, + turns: turns.map((t) => ({ + turn_index: t.turnIndex, + turn_id: t.turnId, + trace_ids: t.traceIds, + started_at: t.startedAt, + ended_at: t.endedAt, + user_preview: trimPreview(t.userPreview, 120), + })), + }, + corrective_signals, + }; +} + +function buildTurnBuckets(ordered: readonly TraceRow[]): TurnBucket[] { + const byTurn = new Map(); + for (const t of ordered) { + const key = t.turnId; + const list = byTurn.get(key) ?? []; + list.push(t); + byTurn.set(key, list); + } + const turnIds = [...byTurn.keys()].sort((a, b) => { + const minA = Math.min(...(byTurn.get(a) ?? []).map((t) => t.ts)); + const minB = Math.min(...(byTurn.get(b) ?? []).map((t) => t.ts)); + return minA - minB; + }); + return turnIds.map((turnId, idx) => { + const group = (byTurn.get(turnId) ?? []).sort((a, b) => a.ts - b.ts); + const userPreview = + group.find((t) => t.userText.trim().length > 0)?.userText.trim() ?? ""; + return { + turnIndex: idx + 1, + turnId, + traceIds: group.map((t) => t.id), + startedAt: group[0]?.ts ?? 0, + endedAt: group[group.length - 1]?.ts ?? 0, + userPreview, + }; + }); +} + +function anchorFeedback( + fb: FeedbackRow, + turns: TurnBucket[], + turnById: Map, + traceById: Map, + episodeStart: number, + episodeEnd: number, +): CorrectiveSignalEntry { + const text = trimPreview(feedbackText(fb), 800); + const kind = detectKind(fb); + const deltaEpisode = fb.ts - episodeStart; + + let bucket: TurnBucket | null = null; + let nearestTrace: TraceRow | null = null; + + if (fb.traceId) { + nearestTrace = traceById.get(fb.traceId) ?? null; + if (nearestTrace) bucket = turnById.get(nearestTrace.turnId) ?? 
null; + } + if (!bucket && turns.length > 0) { + bucket = inferTurnByTimestamp(fb.ts, turns); + if (bucket && !nearestTrace) { + nearestTrace = + traceById.get(bucket.traceIds[bucket.traceIds.length - 1]!) ?? null; + } + } + + const turn_index = bucket?.turnIndex ?? null; + const { timing, delta_ms_after_turn_end } = resolveTiming( + fb.ts, + bucket, + nearestTrace, + turns, + episodeEnd, + ); + + return { + feedback_id: fb.id, + submitted_at: fb.ts, + channel: fb.channel, + polarity: fb.polarity, + kind, + text, + trace_id: fb.traceId, + turn_index, + timing, + timing_label: formatTimingLabel({ + timing, + turn_index, + delta_ms_after_turn_end, + delta_ms_after_episode_start: deltaEpisode, + submitted_at: fb.ts, + trace_id: fb.traceId, + }), + delta_ms_after_turn_end, + delta_ms_after_episode_start: deltaEpisode, + nearest_trace_id: nearestTrace?.id ?? null, + nearest_trace_ts: nearestTrace?.ts ?? null, + }; +} + +function inferTurnByTimestamp(ts: number, turns: TurnBucket[]): TurnBucket | null { + for (const t of turns) { + if (ts >= t.startedAt && ts <= t.endedAt) return t; + } + let lastBefore: TurnBucket | null = null; + for (const t of turns) { + if (t.endedAt <= ts) lastBefore = t; + else break; + } + return lastBefore ?? turns[0] ?? 
null; +} + +function resolveTiming( + feedbackTs: number, + bucket: TurnBucket | null, + nearestTrace: TraceRow | null, + turns: TurnBucket[], + episodeEnd: number, +): { timing: CorrectiveTiming; delta_ms_after_turn_end: number | null } { + if (!bucket || turns.length === 0) { + return { timing: "unanchored", delta_ms_after_turn_end: null }; + } + + const deltaAfterTurnEnd = feedbackTs - bucket.endedAt; + + if (feedbackTs < turns[0]!.startedAt) { + return { timing: "before_first_turn", delta_ms_after_turn_end: null }; + } + + const next = turns.find((t) => t.turnIndex === bucket.turnIndex + 1); + if ( + next && + feedbackTs > bucket.endedAt && + feedbackTs < next.startedAt + ) { + return { timing: "between_turns", delta_ms_after_turn_end: deltaAfterTurnEnd }; + } + + if ( + nearestTrace && + Math.abs(feedbackTs - nearestTrace.ts) <= AT_TURN_END_MS && + feedbackTs <= bucket.endedAt + ) { + return { timing: "at_turn_end", delta_ms_after_turn_end: feedbackTs - nearestTrace.ts }; + } + + if (feedbackTs >= bucket.startedAt && feedbackTs <= bucket.endedAt) { + return { + timing: "at_turn_end", + delta_ms_after_turn_end: feedbackTs - bucket.endedAt, + }; + } + + if (feedbackTs > bucket.endedAt) { + const last = turns[turns.length - 1]!; + if ( + bucket.turnIndex === last.turnIndex && + feedbackTs > episodeEnd + AT_TURN_END_MS + ) { + return { timing: "after_last_turn", delta_ms_after_turn_end: deltaAfterTurnEnd }; + } + return { timing: "after_turn", delta_ms_after_turn_end: deltaAfterTurnEnd }; + } + + return { timing: "before_first_turn", delta_ms_after_turn_end: null }; +} + +function formatTimingLabel(args: { + timing: CorrectiveTiming; + turn_index: number | null; + delta_ms_after_turn_end: number | null; + delta_ms_after_episode_start: number; + submitted_at: number; + trace_id: TraceId | null; +}): string { + const turn = args.turn_index ?? "?"; + const dTurn = + args.delta_ms_after_turn_end != null + ? 
`${args.delta_ms_after_turn_end}ms after turn ${turn} ended` + : "no turn-end delta"; + const dEp = `${args.delta_ms_after_episode_start}ms after episode start`; + const trace = args.trace_id ? ` trace=${args.trace_id}` : ""; + return `${args.timing} @ turn ${turn} (submitted_at=${args.submitted_at}; ${dTurn}; ${dEp})${trace}`; +} + +function detectKind(fb: FeedbackRow): CorrectiveSignalKind { + const raw = fb.raw; + if (!raw || typeof raw !== "object") return "human_feedback"; + const rec = raw as Record; + if (rec.verifier && typeof rec.verifier === "object") return "verifier_directives"; + const keys = ["must", "must_not", "mustNot", "MUST", "MUST_NOT", "passed", "total"]; + if (keys.some((k) => k in rec)) return "verifier_directives"; + const lower = feedbackText(fb).toLowerCase(); + if (lower.includes("verifier")) return "verifier_directives"; + return "human_feedback"; +} + +export function isSubstantiveFeedback(feedback: FeedbackRow): boolean { + if (feedbackText(feedback).length > 0) return true; + const raw = feedback.raw; + if (!raw || typeof raw !== "object") return false; + const record = raw as Record; + const direct = ["text", "message", "reason", "content", "directive"] + .map((k) => record[k]) + .find((v) => typeof v === "string" && v.trim().length > 0); + if (typeof direct === "string" && direct.trim().length > 0) return true; + const must = ["must", "must_not", "mustNot", "MUST", "MUST_NOT"] + .map((k) => record[k]) + .find((v) => typeof v === "string" && v.trim().length > 0); + return typeof must === "string" && must.trim().length > 0; +} + +function trimPreview(text: string, max: number): string { + const t = text.trim(); + if (t.length <= max) return t; + return `${t.slice(0, max - 1)}…`; +} diff --git a/apps/memos-local-plugin/core/experience/failure-builder.ts b/apps/memos-local-plugin/core/experience/failure-builder.ts index 4d0cacfa9..f3ab0a492 100644 --- a/apps/memos-local-plugin/core/experience/failure-builder.ts +++ 
b/apps/memos-local-plugin/core/experience/failure-builder.ts @@ -5,26 +5,29 @@ import { languageSteeringLine, } from "../llm/prompts/index.js"; import { FAILURE_EXPERIENCE_SINK_PROMPT } from "../llm/prompts/failure-experience-sink.js"; -import { reflectionAsText } from "../capture/types.js"; import { ids } from "../id.js"; import { deriveMergeFamily } from "./merge-family.js"; import type { EpisodeId, FeedbackId, + FeedbackRow, PolicyId, PolicyRow, TraceRow, } from "../types.js"; import type { Repos } from "../storage/repos/index.js"; +import { buildCorrectiveSignalsForSink } from "./corrective-signals.js"; export interface RunL2FailureInput { episodeId: EpisodeId; sessionId: TraceRow["sessionId"]; traces: readonly TraceRow[]; + /** When omitted, loaded from `feedback.getForEpisode` in `runL2Failure`. */ + feedbacks?: readonly FeedbackRow[]; } export interface RunL2FailureDeps { - repos: Pick; + repos: Pick; llm: LlmClient | null; log: Logger; now?: () => number; @@ -43,9 +46,11 @@ export async function runL2Failure( if (!deps.llm) return { created: false, skippedReason: "llm_disabled" }; if (input.traces.length === 0) return { created: false, skippedReason: "no_traces" }; const now = deps.now?.() ?? Date.now(); - const payload = buildSinkInput(input); + const feedbacks = + input.feedbacks ?? 
deps.repos.feedback.getForEpisode(input.episodeId); + const payload = buildSinkInput(input, feedbacks); const lang = detectDominantLanguage( - input.traces.flatMap((t) => [t.userText, t.agentText, reflectionAsText(t.reflection)]), + input.traces.flatMap((t) => [t.userText, t.agentText]), ); try { const rsp = await deps.llm.completeJson<{ @@ -75,6 +80,7 @@ export async function runL2Failure( if (!hasActionableGuidance(norm.decisionGuidance)) { return { created: false, skippedReason: "empty_guidance" }; } + const sourceFeedbackIds = feedbacks.map((f) => f.id); const policyId = ids.policy() as PolicyId; const evidencePolarity = deriveEvidencePolarity(norm.decisionGuidance); const row: PolicyRow = { @@ -94,7 +100,10 @@ export async function runL2Failure( evidencePolarity, sourceEpisodeIds: [input.episodeId], sourceTraceIds: norm.supportTraceIds, - sourceFeedbackIds: [`f:sink:${input.episodeId}` as FeedbackId], + sourceFeedbackIds: + sourceFeedbackIds.length > 0 + ? sourceFeedbackIds + : [`f:sink:${input.episodeId}` as FeedbackId], inducedBy: `${FAILURE_EXPERIENCE_SINK_PROMPT.id}.v${FAILURE_EXPERIENCE_SINK_PROMPT.version}`, mergeFamily: deriveMergeFamily({ experienceType: norm.experienceType, @@ -118,11 +127,16 @@ export async function runL2Failure( } } -function buildSinkInput(input: RunL2FailureInput): Record { +function buildSinkInput( + input: RunL2FailureInput, + feedbacks: readonly FeedbackRow[], +): Record { const ordered = [...input.traces].sort((a, b) => a.ts - b.ts); const userGoal = ordered.find((t) => t.userText.trim().length > 0)?.userText ?? ""; const chunks = ordered.slice(-5).map((t) => ({ trace_id: t.id, + turn_id: t.turnId, + trace_ts: t.ts, user: trim(t.userText, 300), agent: trim(t.agentText, 500), tools: (t.toolCalls ?? []).slice(0, 3).map((tool) => ({ @@ -130,8 +144,12 @@ function buildSinkInput(input: RunL2FailureInput): Record { output: trim(safeStringify(tool.output), 240), error_code: tool.errorCode ?? 
null, })), - reflection: trim(reflectionAsText(t.reflection) ?? "", 220), })); + const anchored = buildCorrectiveSignalsForSink( + input.episodeId, + input.traces, + feedbacks, + ); return { episode_id: input.episodeId, session_id: input.sessionId, @@ -139,7 +157,8 @@ function buildSinkInput(input: RunL2FailureInput): Record { user_goal: trim(userGoal, 500), }, phase_chunks: chunks, - corrective_signals: [], + episode_timeline: anchored.episode_timeline, + corrective_signals: anchored.corrective_signals, }; } diff --git a/apps/memos-local-plugin/core/llm/prompts/failure-experience-sink.ts b/apps/memos-local-plugin/core/llm/prompts/failure-experience-sink.ts index d89ea3db6..6e89de606 100644 --- a/apps/memos-local-plugin/core/llm/prompts/failure-experience-sink.ts +++ b/apps/memos-local-plugin/core/llm/prompts/failure-experience-sink.ts @@ -3,19 +3,26 @@ import type { PromptDef } from "./index.js"; export const FAILURE_EXPERIENCE_SINK_PROMPT: PromptDef = { id: "failure.experience.sink", version: 1, - description: "Induce failure-aware candidate policy from one failed episode without direct corrective feedback.", + description: "Induce failure-aware candidate policy from a failed episode, including time-anchored corrective feedback.", system: `You induce a candidate policy from a failed agent episode. Goal: - Extract one reusable policy that helps avoid or repair similar failures. - The policy must be operational (trigger + procedure + verification), not generic commentary. +Input: +- phase_chunks: recent traces with trace_ts / turn_id (conversation + tools). +- episode_timeline.turns: ordered user turns with started_at / ended_at (epoch ms). +- corrective_signals: human or verifier feedback with turn_index, timing, and deltas vs trace/turn timestamps. + Feedback that arrives AFTER a turn ended often corrects the agent's reply on that turn — weight timing heavily. + Rules: -1) Stay grounded in provided phase chunks and tool evidence. Do not invent tests/files/errors. 
-2) Keep trigger task-level and recognizable at decision time. -3) If you can propose what to do, use "repair_instruction"; if only what to avoid, use "failure_avoidance". -4) decision_guidance.prefer should contain positive corrective hints (may be empty). -5) decision_guidance.avoid should contain anti-pattern hints (may be empty). +1) Stay grounded in phase_chunks, episode_timeline, and corrective_signals. Do not invent tests/files/errors. +2) When corrective_signals exist, merge their intent into decision_guidance; prefer signals with clear turn_index + after_turn timing for anti-patterns on that turn. +3) Keep trigger task-level and recognizable at decision time. +4) If you can propose what to do, use "repair_instruction"; if only what to avoid, use "failure_avoidance". +5) decision_guidance.prefer should contain positive corrective hints (may be empty). +6) decision_guidance.avoid should contain anti-pattern hints (may be empty). Return JSON: { diff --git a/apps/memos-local-plugin/core/llm/prompts/reward.ts b/apps/memos-local-plugin/core/llm/prompts/reward.ts index b04c8861e..59760f3ee 100644 --- a/apps/memos-local-plugin/core/llm/prompts/reward.ts +++ b/apps/memos-local-plugin/core/llm/prompts/reward.ts @@ -18,7 +18,7 @@ import type { PromptDef } from "./index.js"; */ export const REWARD_R_HUMAN_PROMPT: PromptDef = { id: "reward.r_human", - version: 3, + version: 4, description: "Score an episode's R_human from a multi-turn task summary + user feedback.", system: `You are a strict grader of AI-agent task execution. @@ -34,18 +34,33 @@ You receive: usually the truest signal of whether the agent is actually tracking where the user is now. - FEEDBACK — the user's own messages AFTER the task attempt - finished. May be short ("ok thanks"), explicit - ("try again with X"), or structured ("resolved, but - too slow"). Frequently empty. + finished. 
Format: [SOURCE/polarity @ISO-timestamp] + SOURCE=USER means the user directly wrote this; + SOURCE=INFERRED means the system inferred sentiment + (treat with lower confidence than USER). + May be empty. +- EXECUTION_OUTCOME — machine-derived summary of tool call results + across this episode. + task_completed_by_tool values: + "yes" — the last tool call in the episode + completed without error. + "no" — the last tool call errored, or only + verbal output followed tool failures. + "unknown" — no tool calls in this episode + (text-only task); do not penalize. Grade the agent on THREE INDEPENDENT AXES, each in [-1, 1]: 1. "goal_achievement" — did the agent address what the user ACTUALLY asked? - +1.0 every user ask across the exchange was addressed correctly. + +1.0 every user ask was correctly addressed AND (if tools were used) + EXECUTION_OUTCOME shows task_completed_by_tool=yes. +0.3 the last ask was addressed well; earlier asks had minor gaps. - 0.0 unclear if the user's ask was met. - -0.3 missed a significant portion of what was asked. - -1.0 fundamentally wrong answer / caused damage. + 0.0 unclear if the user's ask was met. + -0.3 agent verbally acknowledged the correct approach but did NOT + re-execute; or missed a significant portion of what was asked. + Use this when EXECUTION_OUTCOME shows task_completed_by_tool=no + and the last agent reply is explanatory text only. + -1.0 fundamentally wrong answer / caused damage / refused without reason. CRITICAL RULE — do NOT anchor on the first user turn. A user who starts with "上海天气" and later pivots to "再查北京天气" is a user @@ -55,10 +70,21 @@ Grade the agent on THREE INDEPENDENT AXES, each in [-1, 1]: toward the most recent exchange (which is where the user actually is now). + EXECUTION RULE — distinguish verbal acknowledgment from actual execution. + If EXECUTION_OUTCOME.task_completed_by_tool is "no", the agent's last + meaningful action was a failed tool call; any subsequent agent reply is + verbal-only. 
In this case goal_achievement must NOT exceed 0.0 unless + TASK_SUMMARY shows the agent successfully re-executed the task afterward. + A correct verbal description of what "should have been done" is NOT + the same as doing it. + 2. "process_quality" - +1.0 clean, minimal, correct reasoning across all turns. - 0.0 reasonable but not great. - -1.0 lots of thrashing, wrong tools, noisy output. + +1.0 clean, minimal, correct reasoning; tool calls efficient and successful. + +0.3 goal achieved but with redundant steps or minor tool retry. + 0.0 reasonable overall; path not clean but not harmful. + -0.3 one significant wrong tool call or reasoning error, self-corrected. + -1.0 repeated thrashing, wrong tools, severe noisy output, or left + task in broken state without recovery. 3. "user_satisfaction" (from FEEDBACK text tone + trailing user asks) +1.0 thanks / happy / "做的很好" / accepts and closes out. @@ -79,6 +105,14 @@ Rules: questions correctly. If hostModel/hostProvider are provided, treat them as the authoritative runtime context unless the conversation itself contains a correction. +- CONSISTENCY: if user_satisfaction ≤ -0.3, do NOT assign goal_achievement + above +0.3 unless TASK_SUMMARY contains explicit evidence of successful + recovery AFTER the negative feedback (a new successful tool call, or the + user explicitly accepting the outcome). Negative feedback is a strong + prior that goals were not fully met. +- If FEEDBACK contains explicit correction language ("no", "wrong", + "try again", "重做") with no subsequent acceptance signal, + goal_achievement must be ≤ 0.0. - Produce one short justification. 
Return JSON, EXACTLY this shape (no extra keys, no commentary): diff --git a/apps/memos-local-plugin/core/memory/l2/subscriber.ts b/apps/memos-local-plugin/core/memory/l2/subscriber.ts index 881cb938d..23656bcec 100644 --- a/apps/memos-local-plugin/core/memory/l2/subscriber.ts +++ b/apps/memos-local-plugin/core/memory/l2/subscriber.ts @@ -113,20 +113,12 @@ export function attachL2Subscriber(deps: L2SubscriberDeps): L2SubscriberHandle { }); return; } - // Failure episodes with explicit corrective feedback are handled by - // feedback.experience.v1; L2 induction is reserved for no-feedback sink. - if (hasCorrectiveSignals(feedbacks)) { - subLog.info("failure.route.feedback_experience", { - episodeId: result.episodeId, - feedbackCount: feedbacks.length, - }); - return; - } const sink = await runL2Failure( { episodeId: result.episodeId, sessionId: result.sessionId, traces, + feedbacks, }, { repos: deps.repos, @@ -403,29 +395,6 @@ function refreshPolicyGainOnSuccess(input: { } } -function hasCorrectiveSignals(feedbacks: readonly FeedbackRow[]): boolean { - return feedbacks.some(hasSubstantiveCorrectiveSignal); -} - -function hasSubstantiveCorrectiveSignal(feedback: FeedbackRow): boolean { - const rationale = typeof feedback.rationale === "string" ? 
feedback.rationale.trim() : ""; - if (rationale.length > 0) return true; - - const raw = feedback.raw; - if (!raw || typeof raw !== "object") return false; - const record = raw as Record; - - const direct = ["text", "message", "reason", "content", "directive"] - .map((k) => record[k]) - .find((v) => typeof v === "string" && v.trim().length > 0); - if (typeof direct === "string" && direct.trim().length > 0) return true; - - const must = ["must", "must_not", "mustNot", "MUST", "MUST_NOT"] - .map((k) => record[k]) - .find((v) => typeof v === "string" && v.trim().length > 0); - return typeof must === "string" && must.trim().length > 0; -} - function decideFailureGate(input: { traces: readonly TraceRow[]; verifierPassed: boolean | null; diff --git a/apps/memos-local-plugin/core/pipeline/memory-core.ts b/apps/memos-local-plugin/core/pipeline/memory-core.ts index 362529280..2ac8240d7 100644 --- a/apps/memos-local-plugin/core/pipeline/memory-core.ts +++ b/apps/memos-local-plugin/core/pipeline/memory-core.ts @@ -75,8 +75,7 @@ import type { import type { ResolvedConfig, ResolvedHome } from "../config/index.js"; import { loadConfig, resolveHome, SECRET_FIELD_PATHS } from "../config/index.js"; import { reflectionAsText } from "../capture/types.js"; -import { feedbackText, runFeedbackExperience } from "../experience/feedback-builder.js"; -import { isRepairCandidatePolicy, mintRepairCandidate } from "../skill/repair-candidate.js"; +import { feedbackText } from "../experience/feedback-builder.js"; import { rootLogger } from "../logger/index.js"; import type { Logger } from "../logger/types.js"; import { openDb } from "../storage/connection.js"; @@ -113,7 +112,6 @@ import type { RetrievalConfig, TraceCandidate, } from "../retrieval/types.js"; -import type { UserFeedback } from "../reward/types.js"; // ─── Public bootstrap helpers ─────────────────────────────────────────────── @@ -1563,6 +1561,8 @@ export function createMemoryCore( userMessage?: string; }): Promise { ensureLive(); 
+ const existing = handle.resolveOpenEpisodeId(input.sessionId); + if (existing) return existing; const snap = await handle.sessionManager.startEpisode({ sessionId: input.sessionId, userMessage: input.userMessage?.trim() || "(adapter-initiated)", @@ -1571,6 +1571,42 @@ export function createMemoryCore( return snap.id as EpisodeId; } + function resolveOpenEpisodeId(sessionId: SessionId): EpisodeId | undefined { + ensureLive(); + return handle.resolveOpenEpisodeId(sessionId); + } + + function isEpisodeWritable(episodeId: EpisodeId): boolean { + ensureLive(); + const snap = handle.sessionManager.getEpisode(episodeId); + return snap?.status === "open"; + } + + function episodeExists(episodeId: EpisodeId): boolean { + ensureLive(); + return handle.sessionManager.getEpisode(episodeId) != null; + } + + function reconcileEpisodeId( + sessionId: SessionId, + candidate?: EpisodeId, + ): EpisodeId | undefined { + ensureLive(); + const canonical = handle.resolveOpenEpisodeId(sessionId); + if (canonical) { + if (candidate && candidate !== canonical) { + log.warn("reconcileEpisodeId.canonical_override", { + sessionId, + candidate, + canonical, + }); + } + return canonical; + } + if (candidate && isEpisodeWritable(candidate)) return candidate; + return undefined; + } + async function closeEpisode(episodeId: EpisodeId): Promise { ensureLive(); const snap = handle.sessionManager.getEpisode(episodeId); @@ -1898,33 +1934,6 @@ export function createMemoryCore( return toFeedbackDTO(row); } - if (episode && sessionId) { - const rewardFeedback: UserFeedback = { - id: row.id as UserFeedback["id"], - episodeId: episode.id, - sessionId, - traceId: row.traceId as TraceId | null, - ts: row.ts, - channel: row.channel, - polarity: row.polarity, - magnitude: row.magnitude, - text: text || null, - rationale: row.rationale, - }; - try { - await handle.rewardRunner.run({ - episodeId: episode.id, - feedback: [rewardFeedback], - trigger: "explicit_feedback", - }); - } catch (err) { - 
log.warn("feedback.reward_failed", { - episodeId: episode.id, - err: err instanceof Error ? err.message : String(err), - }); - } - } - if (targetTrace) { // Keep explicit trace feedback as the final source of truth for that trace. // Reward backprop updates episode-wide values and may dilute single-trace @@ -1959,66 +1968,6 @@ export function createMemoryCore( } } - let policyId: PolicyId | undefined; - try { - const experience = await runFeedbackExperience( - { feedback: row, episode, trace }, - { - repos: handle.repos, - embedder: handle.embedder, - llm: handle.llm ?? undefined, - namespace: handle.namespace, - now: Date.now, - }, - ); - policyId = experience.policyId; - } catch (err) { - log.warn("feedback.experience_failed", { - episodeId: episode?.id, - err: err instanceof Error ? err.message : String(err), - }); - } - - try { - await handle.l2.drain(); - if (policyId) { - // A constructive negative (failure + named fix) mints an unproven - // repair *candidate* skill that earns trust via trials. The normal - // crystallization below skips negatives, so there is no conflict; the - // candidate dedups against it via sourcePolicyIds. - const pol = handle.repos.policies.getById(policyId); - if (pol && isRepairCandidatePolicy(pol)) { - // Best-effort: a mint failure must never block crystallization / L3. - try { - mintRepairCandidate(pol, { - repos: handle.repos, - embedder: handle.embedder, - now: Date.now, - log, - }); - } catch (err) { - log.warn("feedback.repair_candidate_failed", { - policyId, - err: err instanceof Error ? err.message : String(err), - }); - } - } - await handle.skills.runOnce({ trigger: "manual", policyId }); - } - if (episode) { - await handle.l3.runOnce({ trigger: "manual", episodeId: episode.id }); - } - await handle.skills.flush(); - await handle.feedback.flush(); - await handle.l3.drain(); - } catch (err) { - log.warn("feedback.downstream_flush_failed", { - episodeId: episode?.id, - policyId, - err: err instanceof Error ? 
err.message : String(err), - }); - } - if (telemetry) { telemetry.trackFeedback( handle.namespace.agentKind, @@ -4537,6 +4486,10 @@ export function createMemoryCore( openSession, closeSession, openEpisode, + resolveOpenEpisodeId, + isEpisodeWritable, + episodeExists, + reconcileEpisodeId, closeEpisode, onTurnStart, onTurnEnd, diff --git a/apps/memos-local-plugin/core/pipeline/orchestrator.ts b/apps/memos-local-plugin/core/pipeline/orchestrator.ts index fc8d18345..ecc4e11e7 100644 --- a/apps/memos-local-plugin/core/pipeline/orchestrator.ts +++ b/apps/memos-local-plugin/core/pipeline/orchestrator.ts @@ -818,6 +818,32 @@ export function createPipeline(deps: PipelineDeps): PipelineHandle { openEpisodeBySession.delete(sessionId); } + /** Same authority as `onTurnEnd` fallback — adapters must not mint a second open row. */ + function resolveOpenEpisodeId(sessionId: SessionId): EpisodeId | undefined { + const fromMap = openEpisodeBySession.get(sessionId); + if (fromMap) { + const snap = session.sessionManager.getEpisode(fromMap); + if (snap?.status === "open") return fromMap; + } + const openRows = session.sessionManager + .listEpisodes(sessionId) + .filter((e) => e.status === "open"); + if (openRows.length === 1) { + return openRows[0]!.id as EpisodeId; + } + if (openRows.length > 1) { + if (fromMap && openRows.some((e) => e.id === fromMap)) { + return fromMap; + } + log.warn("resolveOpenEpisodeId.multiple_open", { + sessionId, + count: openRows.length, + }); + return undefined; + } + return undefined; + } + function staleTopicWindowMs(): number { return Math.max( algorithm.session.mergeMaxGapMs * 2, @@ -1227,9 +1253,23 @@ export function createPipeline(deps: PipelineDeps): PipelineHandle { const explicitEpisode = result.episodeId ? session.sessionManager.getEpisode(result.episodeId) : null; - const episodeId = explicitEpisode + let episodeId = explicitEpisode ? result.episodeId : openEpisodeBySession.get(sessionId) ?? 
result.episodeId; + const canonical = resolveOpenEpisodeId(sessionId); + const givenSnap = episodeId + ? session.sessionManager.getEpisode(episodeId) + : null; + if (!episodeId || !givenSnap) { + if (canonical) episodeId = canonical; + } else if (canonical && canonical !== episodeId) { + log.warn("onTurnEnd.episode_id_mismatch", { + sessionId, + given: episodeId, + canonical, + givenStatus: givenSnap.status, + }); + } if (!episodeId) { throw new Error( "pipeline.onTurnEnd: no open episode for session " + sessionId, @@ -1620,6 +1660,7 @@ export function createPipeline(deps: PipelineDeps): PipelineHandle { flush, shutdown, retrievalDeps: () => retrievalDeps, + resolveOpenEpisodeId, }; log.info("pipeline.ready", { diff --git a/apps/memos-local-plugin/core/pipeline/types.ts b/apps/memos-local-plugin/core/pipeline/types.ts index 7969b7d39..09307ffaf 100644 --- a/apps/memos-local-plugin/core/pipeline/types.ts +++ b/apps/memos-local-plugin/core/pipeline/types.ts @@ -216,6 +216,12 @@ export interface PipelineHandle { /** Compose a retrieval-deps instance scoped to this pipeline. Used by tests. */ retrievalDeps(): RetrievalDeps; + + /** + * Current open episode for a session (orchestrator map + single open row). + * Used by adapters when lazy-opening must not mint a phantom id. + */ + resolveOpenEpisodeId(sessionId: SessionId): EpisodeId | undefined; } export interface PipelineBuses { diff --git a/apps/memos-local-plugin/core/reward/ALGORITHMS.md b/apps/memos-local-plugin/core/reward/ALGORITHMS.md index 71fefd6aa..96fee6c6f 100644 --- a/apps/memos-local-plugin/core/reward/ALGORITHMS.md +++ b/apps/memos-local-plugin/core/reward/ALGORITHMS.md @@ -125,15 +125,15 @@ is that it must eventually happen, and preferably once per "task-level feedback process". 
We implement that with a small state machine: ``` -capture.done ──────────▶ pending{episodeId} +capture.done ──────────▶ pending{episodeId} + schedule (windowSec ≥ 1) │ - │ cfg.feedbackWindowSec > 0 ▼ - setTimeout(run implicit_fallback) - ▲ - │ clearTimeout if submitFeedback comes first + setTimeout → run({ feedback: [], trigger: implicit_fallback }) │ -explicit user feedback ▶ run explicit_feedback (merges prior pending row list) +drain() ───────────────▶ flush ALL pending (same run shape) + +memory-core.submitFeedback → SQLite only (no subscriber run) +reward.run merges feedbackRepo.getForEpisode → meta.trigger may become explicit_feedback ``` - **`trigger` field** on the run is metadata only; downstream consumers diff --git a/apps/memos-local-plugin/core/reward/README.md b/apps/memos-local-plugin/core/reward/README.md index 8142d068d..d50ec2194 100644 --- a/apps/memos-local-plugin/core/reward/README.md +++ b/apps/memos-local-plugin/core/reward/README.md @@ -10,14 +10,16 @@ Two triggers, both non-blocking: 1. **Auto fallback** — `capture.done` fires for an episode with ≥1 trace. - The subscriber schedules a `feedbackWindowSec` timer. When it expires, - we score with whatever feedback we have (often none → heuristic = 0). -2. **Explicit submit** — `adapter.onFeedback` → feedback repo → the - orchestrator calls `subscription.submitFeedback(row)`. The timer is - cancelled and the run starts immediately with `trigger="explicit_feedback"`. + The subscriber schedules a `feedbackWindowSec` timer (minimum **1** second). + When it expires—or when `drain()` runs—we score once with `feedback: []`; + `reward.run` merges persisted rows from `feedbackRepo.getForEpisode`. +2. **Manual** — `subscription.runManually(episodeId)` for tests and recovery. -Either way the run is async and logged to `core.reward.*` channels; failures -are reported via `onError` and surfaced as `reward.failed` events. 
+`subscription.submitFeedback` is a **no-op**; use `memory-core.submitFeedback` +to persist feedback to SQLite. Episode `r_task` is written only by `reward.run`. + +Either scheduled run is async and logged to `core.reward.*`; failures use +`onError` / `reward.failed`. ## 2. Inputs @@ -84,7 +86,7 @@ backprop. | `decayHalfLifeDays` | 30 | Half-life for priority decay | | `llmScoring` | true | Use LLM rubric (v2); off = heuristic only | | `implicitThreshold` | 0.2 | Fire-or-not threshold for implicit signals (reserved for classifier) | -| `feedbackWindowSec` | 600 | Time to wait after `capture.done` for explicit feedback; 0 disables | +| `feedbackWindowSec` | 600 | Seconds after `capture.done` before scoring (min **1**; values <1 clamp) | | `summaryMaxChars` | 2000 | Cap on the task-summary string fed to the LLM | | `llmConcurrency` | 2 | Max parallel R_human LLM calls (reserved for pool scheduler) | diff --git a/apps/memos-local-plugin/core/reward/human-scorer.ts b/apps/memos-local-plugin/core/reward/human-scorer.ts index c1a4d58a1..0ce7053dd 100644 --- a/apps/memos-local-plugin/core/reward/human-scorer.ts +++ b/apps/memos-local-plugin/core/reward/human-scorer.ts @@ -195,7 +195,9 @@ function formatFeedback(feedback: readonly UserFeedback[]): string { for (const f of feedback.slice(0, 8)) { const text = (f.text ?? f.rationale ?? "").trim(); if (!text) continue; - lines.push(`- [${f.channel}/${f.polarity}] ${text.slice(0, 800)}`); + const tsLabel = new Date(f.ts).toISOString(); + const sourceLabel = f.channel === "explicit" ? 
"USER" : "INFERRED"; + lines.push(`- [${sourceLabel}/${f.polarity} @${tsLabel}] ${text.slice(0, 800)}`); } return lines.join("\n"); } diff --git a/apps/memos-local-plugin/core/reward/reward.ts b/apps/memos-local-plugin/core/reward/reward.ts index 8a577ac7b..c5f7aef2e 100644 --- a/apps/memos-local-plugin/core/reward/reward.ts +++ b/apps/memos-local-plugin/core/reward/reward.ts @@ -212,6 +212,7 @@ export function createRewardRunner(deps: RewardDeps): RewardRunner { input.feedback, deps.feedbackRepo.getForEpisode(input.episodeId) as unknown as UserFeedback[], ); + const effectiveTrigger = resolveEffectiveTrigger(input.trigger, mergedFeedback); const humanScore = await scoreHuman( { episodeSummary: summary, feedback: mergedFeedback }, { llm: deps.llm, cfg: { llmScoring: deps.cfg.llmScoring } }, @@ -296,7 +297,7 @@ export function createRewardRunner(deps: RewardDeps): RewardRunner { axes: humanScore.axes, reason: humanScore.reason, scoredAt: startedAt, - trigger: input.trigger, + trigger: effectiveTrigger, traceCount: bp.updates.length, traceIds: bp.updates.map((u) => u.traceId), }, @@ -339,7 +340,7 @@ export function createRewardRunner(deps: RewardDeps): RewardRunner { source: humanScore.source, feedbackCount: mergedFeedback.length, traces: bp.updates.length, - trigger: input.trigger, + trigger: effectiveTrigger, totalMs: result.timings.total, warnings: warnings.length, }); @@ -555,6 +556,15 @@ function decideSkipReason( return null; } +function resolveEffectiveTrigger( + inputTrigger: RewardInput["trigger"], + mergedFeedback: readonly UserFeedback[], +): RewardInput["trigger"] { + if (mergedFeedback.length > 0) return "explicit_feedback"; + if (inputTrigger === "manual") return "manual"; + return "implicit_fallback"; +} + function mergeFeedback( a: readonly UserFeedback[], b: readonly UserFeedback[], diff --git a/apps/memos-local-plugin/core/reward/subscriber.ts b/apps/memos-local-plugin/core/reward/subscriber.ts index 3c72a193c..4c5a02883 100644 --- 
a/apps/memos-local-plugin/core/reward/subscriber.ts +++ b/apps/memos-local-plugin/core/reward/subscriber.ts @@ -2,19 +2,17 @@ * `subscriber` — glue between `core/capture` and `core/reward`. * * Model: - * 1. When `capture.done` fires on the capture bus, we start a - * "feedback window" for that episode. - * 2. If explicit feedback arrives inside the window, score immediately - * with `trigger="explicit_feedback"`. - * 3. If the window expires without explicit feedback, fall back to - * `trigger="implicit_fallback"` — the human-scorer uses whatever - * implicit signals were persisted by the session/feedback classifier. - * 4. `cfg.feedbackWindowSec = 0` disables the timer entirely; only - * `submitFeedback(...)` / `runManually(...)` can trigger a run. + * 1. When `capture.done` fires for an episode with traces, register it in + * `pending` and schedule one reward run after `feedbackWindowSec` (≥1s). + * 2. Explicit feedback is persisted via `memory-core.submitFeedback` (DB); + * the scheduled run passes `feedback: []` and `reward.run` merges from + * `feedbackRepo.getForEpisode`. + * 3. When the window expires (or `drain()`), run once with + * `trigger="implicit_fallback"` (overridden in `reward.run` when DB has + * feedback rows). + * 4. `submitFeedback` on this subscription is a no-op — do not score here. * - * This module is intentionally small. Phase 15's pipeline orchestrator - * can layer on smarter retry / batching; this subscriber is enough for - * the MVP loop used by integration tests. + * `pendingCount()` = scheduled-but-not-started episodes + in-flight runs. */ import type { @@ -32,23 +30,34 @@ export interface RewardSubscriberOptions { } export interface RewardSubscription { - /** Submit a feedback row and schedule a run if the episode has one in-flight. */ + /** + * Legacy hook — no-op. Episode scoring uses DB feedback at window end; + * use `memory-core.submitFeedback` instead. 
+ */ submitFeedback(feedback: UserFeedback): void; - /** Manual trigger — run NOW, regardless of window or feedback. */ + /** Manual trigger — run NOW, regardless of window. */ runManually(episodeId: EpisodeId, trigger?: "manual" | "explicit_feedback"): Promise; /** Detach from the capture bus. In-flight runs continue. */ stop(): void; - /** Wait for every in-flight run to finish. */ + /** Flush all pending episodes and wait for in-flight runs. */ drain(): Promise; + /** Scheduled episodes (timer not fired) plus in-flight reward runs. */ pendingCount(): number; } interface PendingEpisode { episodeId: EpisodeId; - feedback: UserFeedback[]; timer: ReturnType | null; } +function resolveWindowSec( + cfg: RewardConfig, + opts: RewardSubscriberOptions, +): number { + const raw = opts.feedbackWindowSec ?? cfg.feedbackWindowSec; + return Math.max(1, raw); +} + export function attachRewardSubscriber( captureBus: CaptureEventBus, runner: RewardRunner, @@ -56,7 +65,8 @@ export function attachRewardSubscriber( opts: RewardSubscriberOptions = {}, ): RewardSubscription { const log = rootLogger.child({ channel: "core.reward" }); - const windowMs = (opts.feedbackWindowSec ?? cfg.feedbackWindowSec) * 1_000; + const windowSec = resolveWindowSec(cfg, opts); + const windowMs = windowSec * 1_000; const pending = new Map(); const inflight = new Set>(); @@ -66,12 +76,11 @@ export function attachRewardSubscriber( if (entry.timer) clearTimeout(entry.timer); entry.timer = setTimeout(() => { pending.delete(episodeId); - const feedback = entry.feedback; runInBackground(() => runner.run({ episodeId, - feedback, - trigger: feedback.length > 0 ? "explicit_feedback" : "implicit_fallback", + feedback: [], + trigger: "implicit_fallback", }), ); }, delayMs); @@ -92,49 +101,20 @@ export function attachRewardSubscriber( const unsub = captureBus.on("capture.done", (evt) => { if (evt.kind !== "capture.done") return; const eid = evt.result.episodeId; - // No traces → nothing to backprop onto. 
Skip scheduling altogether. if (evt.result.traceIds.length === 0) { log.debug("skip.empty_capture", { episodeId: eid }); return; } - // If window is disabled (0s), the subscriber just listens for - // explicit submitFeedback calls; no auto fallback. - if (windowMs === 0) { - pending.set(eid, { episodeId: eid, feedback: [], timer: null }); - return; - } - pending.set(eid, { episodeId: eid, feedback: [], timer: null }); + pending.set(eid, { episodeId: eid, timer: null }); schedule(eid, windowMs); }); return { submitFeedback(feedback: UserFeedback): void { - const eid = feedback.episodeId; - const entry = pending.get(eid); - if (!entry) { - // No pending capture — run immediately (e.g., late feedback on a - // previously-closed episode). - runInBackground(() => - runner.run({ - episodeId: eid, - feedback: [feedback], - trigger: "explicit_feedback", - }), - ); - return; - } - entry.feedback.push(feedback); - // Fire immediately; no point waiting further once we've got explicit - // feedback. - if (entry.timer) clearTimeout(entry.timer); - pending.delete(eid); - runInBackground(() => - runner.run({ - episodeId: eid, - feedback: entry.feedback, - trigger: "explicit_feedback", - }), - ); + log.debug("submitFeedback.noop", { + episodeId: feedback.episodeId, + hint: "persist via memory-core.submitFeedback; score at window end", + }); }, async runManually(episodeId, trigger = "manual") { const entry = pending.get(episodeId); @@ -142,7 +122,7 @@ export function attachRewardSubscriber( pending.delete(episodeId); await runner.run({ episodeId, - feedback: entry?.feedback ?? [], + feedback: [], trigger, }); }, @@ -154,43 +134,27 @@ export function attachRewardSubscriber( unsub(); }, async drain() { - // Step 1: kick every still-pending episode immediately. The - // scheduled `setTimeout` would normally wait `feedbackWindowSec` - // (default 30 s) before firing the implicit fallback. 
On - // process shutdown we don't have 30 s to wait — without this - // the bridge exits, all timers get GC'd, and the episode - // permanently has `r_task = null` (which then starves L2 / L3 / - // Skill induction of any positive evidence). const flushed: PendingEpisode[] = []; for (const entry of pending.values()) { - if (entry.timer) { - clearTimeout(entry.timer); - flushed.push(entry); - } else if (entry.feedback.length > 0) { - flushed.push(entry); - } + if (entry.timer) clearTimeout(entry.timer); + flushed.push(entry); } pending.clear(); for (const entry of flushed) { runInBackground(() => runner.run({ episodeId: entry.episodeId, - feedback: entry.feedback, - trigger: - entry.feedback.length > 0 - ? "explicit_feedback" - : "implicit_fallback", + feedback: [], + trigger: "implicit_fallback", }), ); } - // Step 2: now wait for every in-flight reward computation - // (including those just kicked above) to settle. while (inflight.size > 0) { await Promise.all(Array.from(inflight)); } }, pendingCount() { - return inflight.size; + return pending.size + inflight.size; }, }; } diff --git a/apps/memos-local-plugin/core/reward/task-summary.ts b/apps/memos-local-plugin/core/reward/task-summary.ts index 88c16720c..db54ec7c0 100644 --- a/apps/memos-local-plugin/core/reward/task-summary.ts +++ b/apps/memos-local-plugin/core/reward/task-summary.ts @@ -89,6 +89,8 @@ export function buildTaskSummary(input: SummaryInput): TaskSummary { ``, `MOST_RECENT_AGENT_REPLY:`, clampAgentText(pairs.length > 0 ? 
pairs[pairs.length - 1]!.agentText : outcome), + ``, + formatExecutionOutcome(traces), ].join("\n"); const { text, truncated } = clampText(body, cfg.summaryMaxChars); @@ -279,3 +281,73 @@ function clampText(text: string, max: number): { text: string; truncated: boolea truncated: true, }; } + +// ─── execution outcome ─────────────────────────────────────────────────────── + +interface ExecutionOutcome { + totalToolCalls: number; + successCount: number; + errorCount: number; + lastToolResult: "SUCCESS" | "ERROR" | "NONE"; + lastToolName: string | null; + lastErrorCode: string | null; + taskCompletedByTool: "yes" | "no" | "unknown"; +} + +function buildExecutionOutcome(traces: readonly TraceRow[]): ExecutionOutcome { + let totalToolCalls = 0; + let successCount = 0; + let errorCount = 0; + + const sorted = [...traces].sort((a, b) => a.ts - b.ts); + + let lastTraceWithTools: TraceRow | null = null; + for (const trace of sorted) { + const calls = (trace.toolCalls ?? []) as Array<{ name?: string; errorCode?: string }>; + if (calls.length > 0) lastTraceWithTools = trace; + for (const c of calls) { + totalToolCalls++; + if (c.errorCode) errorCount++; + else successCount++; + } + } + + if (!lastTraceWithTools) { + return { + totalToolCalls: 0, successCount: 0, errorCount: 0, + lastToolResult: "NONE", lastToolName: null, lastErrorCode: null, + taskCompletedByTool: "unknown", + }; + } + + const calls = (lastTraceWithTools.toolCalls ?? []) as Array<{ name?: string; errorCode?: string }>; + const lastCall = calls[calls.length - 1]!; + const lastToolResult: "SUCCESS" | "ERROR" = lastCall.errorCode ? "ERROR" : "SUCCESS"; + + return { + totalToolCalls, + successCount, + errorCount, + lastToolResult, + lastToolName: lastCall.name ?? null, + lastErrorCode: lastCall.errorCode ?? null, + taskCompletedByTool: lastToolResult === "SUCCESS" ? 
"yes" : "no", + }; +} + +function formatExecutionOutcome(traces: readonly TraceRow[]): string { + const o = buildExecutionOutcome(traces); + const lines = ["EXECUTION_OUTCOME:"]; + if (o.totalToolCalls === 0) { + lines.push(" total_tool_calls: 0"); + lines.push(" last_tool_result: NONE"); + lines.push(" task_completed_by_tool: unknown"); + } else { + lines.push(` total_tool_calls: ${o.totalToolCalls} (success: ${o.successCount}, error: ${o.errorCount})`); + const toolLabel = o.lastToolName ? ` [tool: ${o.lastToolName}]` : ""; + const errLabel = o.lastErrorCode ? `, code: ${o.lastErrorCode}` : ""; + lines.push(` last_tool_result: ${o.lastToolResult}${toolLabel}${errLabel}`); + lines.push(` task_completed_by_tool: ${o.taskCompletedByTool}`); + } + return lines.join("\n"); +} diff --git a/apps/memos-local-plugin/core/reward/types.ts b/apps/memos-local-plugin/core/reward/types.ts index 2c8df6548..d8360ae8e 100644 --- a/apps/memos-local-plugin/core/reward/types.ts +++ b/apps/memos-local-plugin/core/reward/types.ts @@ -35,7 +35,7 @@ export interface RewardConfig { llmScoring: boolean; /** Magnitude threshold (|R_human|) that triggers backprop on implicit signals. */ implicitThreshold: number; - /** Seconds to wait for explicit feedback. 0 disables the timer. */ + /** Seconds to wait after capture.done before scoring (minimum 1). */ feedbackWindowSec: number; /** Max chars in the task summary handed to the LLM. 
*/ summaryMaxChars: number; diff --git a/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md b/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md index 4fef78fdd..bb17819c8 100644 --- a/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md +++ b/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md @@ -97,7 +97,7 @@ algorithm: decayHalfLifeDays: 30 # priority decay half-life (V7 §3.3) llmScoring: true # use rubric LLM for R_human; off = heuristic only implicitThreshold: 0.2 # |R_human| threshold for implicit-feedback runs - feedbackWindowSec: 600 # wait this long after capture.done for explicit feedback (0 disables) + feedbackWindowSec: 600 # seconds after capture.done before episode scoring (minimum 1) summaryMaxChars: 2000 # cap on the task summary handed to the scorer LLM llmConcurrency: 2 # parallel R_human LLM calls l2Induction: diff --git a/apps/memos-local-plugin/tests/e2e/v7-full-chain.e2e.test.ts b/apps/memos-local-plugin/tests/e2e/v7-full-chain.e2e.test.ts index a0ff9dd71..05b00f781 100644 --- a/apps/memos-local-plugin/tests/e2e/v7-full-chain.e2e.test.ts +++ b/apps/memos-local-plugin/tests/e2e/v7-full-chain.e2e.test.ts @@ -325,12 +325,9 @@ function newUserSegment(userContent: string): string { * set to `"merge_follow_ups"` (default) so same-topic follow-ups land * in the same episode. * - * `algorithm.reward.feedbackWindowSec` is forced to 0 so the reward - * subscriber never enqueues a 10-minute timer we'd then have to sleep - * through. Tests score each finalised episode explicitly via - * `pipeline.rewardRunner.run(...)` right after `flush()`, which mirrors - * the integration-level harness used in - * `tests/unit/reward/reward.integration.test.ts`. + * `algorithm.reward.feedbackWindowSec` is set to 1 so tests do not wait on + * the production window. `pipeline.flush()` runs `reward.drain()`, which + * flushes every pending episode immediately (see `core/reward/subscriber.ts`). 
*/ function buildPipeline( db: TmpDbHandle, @@ -350,7 +347,7 @@ function buildPipeline( }, reward: { ...baseCfg.algorithm.reward, - feedbackWindowSec: 0, + feedbackWindowSec: 1, // Each `runTurn` is one user→assistant exchange; the e2e flow // deliberately tests single-turn completions so disable the // production triviality gate. @@ -374,23 +371,6 @@ function buildPipeline( return createPipeline(deps); } -/** - * Run the reward pass for every known episode. We do this manually - * because `feedbackWindowSec = 0` disables the auto-scheduler — see - * `core/reward/subscriber.ts`. - */ -async function scoreAllEpisodes(pipeline: PipelineHandle): Promise { - const episodes = pipeline.repos.episodes.list({}); - for (const ep of episodes) { - if (ep.status !== "closed") continue; - await pipeline.rewardRunner.run({ - episodeId: ep.id, - feedback: [], - trigger: "manual", - }); - } -} - /** * Run one full turn: retrieval → agent response → finalize. The caller * supplies the user text + agent's scripted response + reflection. We @@ -616,12 +596,8 @@ describe("V7 full-chain E2E (Python programming task)", () => { pipeline.sessionManager.closeSession(s4Ep1.sessionId, "test.topic_end"); // ── Drain the async chain (capture → reward → L2 → L3 → skill) ── - // capture is fire-and-forget per episode, but we disabled the reward - // auto-scheduler (see buildPipeline). Score every closed episode - // explicitly so V gets back-propagated. + // reward.drain() scores every pending episode (feedbackWindowSec=1). 
await pipeline.flush(); - await scoreAllEpisodes(pipeline); - await pipeline.flush(); // drain downstream (L2 / L3 / skill) reactions // ── Assertions on each V7 layer ──────────────────────────────────── diff --git a/apps/memos-local-plugin/tests/integration/adapters/openclaw-full-chain.test.ts b/apps/memos-local-plugin/tests/integration/adapters/openclaw-full-chain.test.ts index 6ec654173..1aa0e484f 100644 --- a/apps/memos-local-plugin/tests/integration/adapters/openclaw-full-chain.test.ts +++ b/apps/memos-local-plugin/tests/integration/adapters/openclaw-full-chain.test.ts @@ -344,12 +344,10 @@ function buildPipeline(db: TmpDbHandle, llm: LlmClient): PipelineHandle { algorithm: { ...DEFAULT_CONFIG.algorithm, lightweightMemory: { enabled: false }, - // Disable the 30 s fallback timer — we'll call the reward - // runner synchronously at the end of the test so tests stay - // deterministic. + // Short window; flush() runs reward.drain() to score immediately. reward: { ...DEFAULT_CONFIG.algorithm.reward, - feedbackWindowSec: 0, + feedbackWindowSec: 1, minExchangesForCompletion: 0, minContentCharsForCompletion: 0, }, @@ -548,19 +546,7 @@ describe("OpenClaw adapter integration — multi-session full V7 chain", () => { // Drain the async capture pipeline first. await pipeline!.flush(); - // Score every closed episode manually — with `feedbackWindowSec=0` - // the auto-scheduler sits idle waiting for explicit feedback. - // This is what real openclaw achieves via the 30 s timer. - const eps = db!.repos.episodes.list({}).filter((e) => e.status === "closed"); - for (const ep of eps) { - await pipeline!.rewardRunner.run({ - episodeId: ep.id, - feedback: [], - trigger: "manual", - }); - } - - // Drain the downstream cascade (L2 / L3 / Skill). + // reward.drain() inside flush scores pending episodes (window=1s). 
await pipeline!.flush(); // ── Assertions ──────────────────────────────────────────────── @@ -598,6 +584,7 @@ describe("OpenClaw adapter integration — multi-session full V7 chain", () => { } // 5) DB snapshot dump for visual inspection + const eps = repos.episodes.list({}); const snapshot = { episodes: eps.map((e) => ({ id: e.id, diff --git a/apps/memos-local-plugin/tests/unit/adapters/openclaw-bridge.test.ts b/apps/memos-local-plugin/tests/unit/adapters/openclaw-bridge.test.ts index c87c01bee..45ee745ad 100644 --- a/apps/memos-local-plugin/tests/unit/adapters/openclaw-bridge.test.ts +++ b/apps/memos-local-plugin/tests/unit/adapters/openclaw-bridge.test.ts @@ -1183,6 +1183,51 @@ describe("createOpenClawBridge", () => { expect(new Set(ids).size).toBe(1); }); + it("keeps a single episode when before_prompt soft-times out but onTurnStart finishes later", async () => { + vi.stubEnv("MEMOS_BEFORE_PROMPT_SOFT_TIMEOUT_MS", "5"); + const mc = buildCore(); + await mc.init(); + const originalOnTurnStart = mc.onTurnStart.bind(mc); + vi.spyOn(mc, "onTurnStart").mockImplementation(async (turn) => { + await new Promise((resolve) => setTimeout(resolve, 30)); + return originalOnTurnStart(turn); + }); + + const bridge = createOpenClawBridge({ + agent: "openclaw", + core: mc, + log: silentLogger(), + }); + const sessionKey = "s-soft-timeout"; + const ctx = hookCtx({ sessionKey, runId: "run-slow-1" }); + + const prepend = await bridge.handleBeforePrompt( + { prompt: "hello slow binding", messages: [] }, + ctx, + ); + expect(prepend?.prependContext).toBeUndefined(); + + await bridge.handleAgentEnd( + { + success: true, + messages: [ + { role: "user", content: "hello slow binding" }, + { role: "assistant", content: "done" }, + ], + }, + ctx, + ); + await (pipeline as PipelineHandle).flush(); + + const sessionId = bridgeSessionId("main", sessionKey); + const rows = await mc.listEpisodeRows({ sessionId, limit: 10 }); + expect(rows.length).toBeGreaterThan(0); + expect( + 
rows.filter((row) => row.turnCount > 0 || row.hasAssistantReply).length, + ).toBe(1); + vi.unstubAllEnvs(); + }, 15_000); + it("does not let a delayed agent_end clear the next turn's episode binding", async () => { // OpenClaw hooks can overlap: the next before_prompt_build may route // a fresh episode before the previous agent_end finishes. The bridge diff --git a/apps/memos-local-plugin/tests/unit/experience/corrective-signals.test.ts b/apps/memos-local-plugin/tests/unit/experience/corrective-signals.test.ts new file mode 100644 index 000000000..5041eddd4 --- /dev/null +++ b/apps/memos-local-plugin/tests/unit/experience/corrective-signals.test.ts @@ -0,0 +1,115 @@ +import { describe, expect, it } from "vitest"; + +import { buildCorrectiveSignalsForSink } from "../../../core/experience/corrective-signals.js"; +import type { FeedbackRow, TraceRow } from "../../../core/types.js"; + +const EP = "ep_cs" as TraceRow["episodeId"]; +const TURN1 = 1_700_000_000_000; +const TURN2 = 1_700_000_100_000; + +function trace( + id: string, + ts: number, + turnId: number, + user = "", +): TraceRow { + return { + id: id as TraceRow["id"], + episodeId: EP, + sessionId: "s" as TraceRow["sessionId"], + ts: ts as TraceRow["ts"], + userText: user, + agentText: "agent reply", + reflection: null, + value: 0.5, + alpha: 0.5 as TraceRow["alpha"], + rHuman: null, + priority: 0, + tags: [], + toolCalls: [], + vecSummary: null, + vecAction: null, + turnId: turnId as TraceRow["turnId"], + schemaVersion: 1, + ownerAgentKind: "openclaw", + ownerProfileId: "default", + ownerWorkspaceId: null, + }; +} + +function feedback( + id: string, + ts: number, + traceId: string | null, + rationale: string, +): FeedbackRow { + return { + id: id as FeedbackRow["id"], + ts: ts as FeedbackRow["ts"], + episodeId: EP, + traceId: traceId as FeedbackRow["traceId"], + channel: "explicit", + polarity: "negative", + magnitude: 1, + rationale, + raw: null, + ownerAgentKind: "openclaw", + ownerProfileId: "default", + 
ownerWorkspaceId: null, + }; +} + +describe("buildCorrectiveSignalsForSink", () => { + it("maps traceId to turn_index and timing vs trace window", () => { + const traces = [ + trace("tr1", TURN1, TURN1, "goal"), + trace("tr2", TURN1 + 2_000, TURN1), + trace("tr3", TURN2, TURN2, "follow up"), + ]; + const fb = feedback( + "fb1", + TURN1 + 60_000, + "tr2", + "wrong package name", + ); + const out = buildCorrectiveSignalsForSink(EP, traces, [fb]); + expect(out.episode_timeline.turns).toHaveLength(2); + expect(out.corrective_signals).toHaveLength(1); + const sig = out.corrective_signals[0]!; + expect(sig.turn_index).toBe(1); + expect(sig.timing).toBe("between_turns"); + expect(sig.delta_ms_after_turn_end).toBe(58_000); + expect(sig.text).toContain("wrong package"); + expect(sig.trace_id).toBe("tr2"); + }); + + it("labels after_turn when feedback follows the last turn only", () => { + const traces = [ + trace("tr1", TURN1, TURN1, "solo"), + trace("tr2", TURN1 + 3_000, TURN1), + ]; + const fb = feedback("fb3", TURN1 + 6_000, "tr2", "too verbose"); + const out = buildCorrectiveSignalsForSink(EP, traces, [fb]); + expect(out.corrective_signals[0]?.timing).toBe("after_turn"); + expect(out.corrective_signals[0]?.turn_index).toBe(1); + }); + + it("infers turn from timestamp when traceId is missing", () => { + const traces = [ + trace("tr1", TURN1, TURN1, "a"), + trace("tr2", TURN2, TURN2, "b"), + ]; + const fb = feedback("fb2", TURN2, null, "use poetry style"); + const out = buildCorrectiveSignalsForSink(EP, traces, [fb]); + const sig = out.corrective_signals[0]!; + expect(sig.turn_index).toBe(2); + expect(sig.timing).toBe("at_turn_end"); + }); + + it("skips non-substantive feedback rows", () => { + const traces = [trace("tr1", TURN1, TURN1)]; + const empty = feedback("fb_empty", TURN1, "tr1", ""); + const out = buildCorrectiveSignalsForSink(EP, traces, [empty]); + expect(out.corrective_signals).toHaveLength(0); + }); +}); diff --git 
a/apps/memos-local-plugin/tests/unit/pipeline/memory-core.test.ts b/apps/memos-local-plugin/tests/unit/pipeline/memory-core.test.ts index 2569d44e1..a8d5ad5b6 100644 --- a/apps/memos-local-plugin/tests/unit/pipeline/memory-core.test.ts +++ b/apps/memos-local-plugin/tests/unit/pipeline/memory-core.test.ts @@ -895,7 +895,105 @@ describe("MemoryCore façade", () => { const scored = db!.repos.traces.getById(end.traceId as never)!; expect(scored.value).toBeCloseTo(1 / 3); expect(scored.rHuman).toBeCloseTo(1 / 3); - expect(scored.priority).toBeCloseTo(1 / 3); + // priority keeps the max of prior priority and |value| (not recomputed down) + expect(scored.priority).toBe(1); + }); + + it("submitFeedback does not set episode r_task or mint experience policies", async () => { + pipeline = createPipeline(buildDeps(db!)); + core = createMemoryCore( + pipeline, + resolveHome("openclaw", "/tmp/memos-mc-test"), + "test", + ); + await core.init(); + + const policiesBefore = db!.repos.policies.list({ limit: 100 }).length; + const start = await core.onTurnStart({ + agent: "openclaw", + sessionId: "s-no-r-task", + userText: "deploy with docker", + ts: 1_700_000_200_000, + }); + const end = await core.onTurnEnd({ + agent: "openclaw", + sessionId: start.query.sessionId!, + episodeId: start.query.episodeId!, + agentText: "done", + toolCalls: [], + ts: 1_700_000_200_500, + }); + + await core.submitFeedback({ + channel: "explicit", + polarity: "negative", + magnitude: 1, + rationale: "wrong image tag, use latest not stable", + traceId: end.traceId, + episodeId: end.episodeId, + }); + + const ep = db!.repos.episodes.getById(end.episodeId as never); + expect(ep?.rTask).toBeNull(); + expect(db!.repos.policies.list({ limit: 100 }).length).toBe(policiesBefore); + }); + + it("submitFeedback with text runs repair and persists decision_repairs", async () => { + pipeline = createPipeline(buildDeps(db!)); + core = createMemoryCore( + pipeline, + resolveHome("openclaw", "/tmp/memos-mc-test"), + 
"test", + ); + await core.init(); + + const start = await core.onTurnStart({ + agent: "openclaw", + sessionId: "s-repair-fb", + userText: "install openssl on alpine", + ts: 1_700_000_300_000, + }); + const end = await core.onTurnEnd({ + agent: "openclaw", + sessionId: start.query.sessionId!, + episodeId: start.query.episodeId!, + agentText: "pip install cryptography failed: MODULE_NOT_FOUND", + toolCalls: [], + ts: 1_700_000_300_500, + }); + db!.repos.traces.updateScore(end.traceId as never, { + value: -0.7, + alpha: db!.repos.traces.getById(end.traceId as never)!.alpha, + rHuman: -0.7, + priority: 0.7, + }); + const end2 = await core.onTurnEnd({ + agent: "openclaw", + sessionId: start.query.sessionId!, + episodeId: start.query.episodeId!, + agentText: "apk add openssl-dev succeeded", + toolCalls: [], + ts: 1_700_000_301_000, + }); + db!.repos.traces.updateScore(end2.traceId as never, { + value: 0.9, + alpha: db!.repos.traces.getById(end2.traceId as never)!.alpha, + rHuman: 0.9, + priority: 0.9, + }); + + const repairsBefore = db!.repos.decisionRepairs.list().length; + await core.submitFeedback({ + channel: "explicit", + polarity: "negative", + magnitude: 1, + rationale: "use apk add openssl-dev instead of pip on alpine", + episodeId: start.query.episodeId!, + traceId: end.traceId, + }); + + const repairsAfter = db!.repos.decisionRepairs.list(); + expect(repairsAfter.length).toBeGreaterThan(repairsBefore); }); it("submitFeedback rejects unknown trace ids before SQLite FK failure", async () => { diff --git a/apps/memos-local-plugin/tests/unit/pipeline/resolve-open-episode.test.ts b/apps/memos-local-plugin/tests/unit/pipeline/resolve-open-episode.test.ts new file mode 100644 index 000000000..d962a047f --- /dev/null +++ b/apps/memos-local-plugin/tests/unit/pipeline/resolve-open-episode.test.ts @@ -0,0 +1,83 @@ +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import { + createMemoryCore, + createPipeline, + type PipelineDeps, + type 
PipelineHandle, +} from "../../../core/pipeline/index.js"; +import type { MemoryCore } from "../../../agent-contract/memory-core.js"; +import { DEFAULT_CONFIG } from "../../../core/config/defaults.js"; +import { resolveHome } from "../../../core/config/paths.js"; +import { rootLogger } from "../../../core/logger/index.js"; +import { makeTmpDb, type TmpDbHandle } from "../../helpers/tmp-db.js"; +import { fakeEmbedder } from "../../helpers/fake-embedder.js"; +import { bridgeSessionId } from "../../../adapters/openclaw/bridge.js"; + +let db: TmpDbHandle | null = null; +let pipeline: PipelineHandle | null = null; +let core: MemoryCore | null = null; + +function buildDeps(h: TmpDbHandle): PipelineDeps { + return { + agent: "openclaw", + home: resolveHome("openclaw", "/tmp/memos-resolve-test"), + config: DEFAULT_CONFIG, + db: h.db, + repos: h.repos, + llm: null, + reflectLlm: null, + embedder: fakeEmbedder({ dimensions: 384 }), + log: rootLogger.child({ channel: "test.pipeline.resolve" }), + namespace: { agentKind: "openclaw", profileId: "main" }, + now: () => 1_700_000_000_000, + }; +} + +beforeEach(() => { + db = makeTmpDb(); + pipeline = createPipeline(buildDeps(db)); + core = createMemoryCore( + pipeline, + resolveHome("openclaw", "/tmp/memos-resolve-test"), + "test", + ); +}); + +afterEach(async () => { + if (core) { + try { + await core.shutdown(); + } catch { + /* ignore */ + } + } + core = null; + pipeline = null; + db?.cleanup(); + db = null; +}); + +describe("resolveOpenEpisodeId / openEpisode", () => { + it("openEpisode returns the existing open row instead of minting a second id", async () => { + await core!.init(); + const sessionKey = "agent:main:resolve-test"; + const sessionId = bridgeSessionId("main", sessionKey); + await core!.openSession({ agent: "openclaw", sessionId }); + + const first = await core!.openEpisode({ + sessionId, + userMessage: "task one", + }); + const second = await core!.openEpisode({ + sessionId, + userMessage: "task one continued", 
+ }); + + expect(second).toBe(first); + expect(pipeline!.resolveOpenEpisodeId(sessionId)).toBe(first); + + const rows = await core!.listEpisodeRows({ sessionId, limit: 10 }); + expect(rows.filter((r) => r.status === "open")).toHaveLength(1); + }); +}); diff --git a/apps/memos-local-plugin/tests/unit/reward/human-scorer.test.ts b/apps/memos-local-plugin/tests/unit/reward/human-scorer.test.ts index 1a4805eeb..aea8af200 100644 --- a/apps/memos-local-plugin/tests/unit/reward/human-scorer.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/human-scorer.test.ts @@ -64,7 +64,7 @@ describe("reward/human-scorer", () => { it("LLM mode: happy path, uses the LLM and reports llm source", async () => { const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v3": { + "reward.reward.r_human.v4": { goal_achievement: 0.9, process_quality: 0.5, user_satisfaction: 0.8, @@ -89,7 +89,7 @@ describe("reward/human-scorer", () => { it("LLM mode: clamps axes to [-1, 1]", async () => { const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v3": { + "reward.reward.r_human.v4": { goal_achievement: 5, process_quality: -3, user_satisfaction: 2, @@ -112,7 +112,7 @@ describe("reward/human-scorer", () => { it("LLM mode: rejects non-numeric axes (via validate) → falls back to heuristic", async () => { const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v3": { goal_achievement: "yes", process_quality: 0, user_satisfaction: 0 }, + "reward.reward.r_human.v4": { goal_achievement: "yes", process_quality: 0, user_satisfaction: 0 }, }, }); const out = await scoreHuman( @@ -148,4 +148,38 @@ describe("reward/human-scorer", () => { ); expect(out.source).toBe("explicit"); }); + + it("LLM mode: feedback formatted with USER/INFERRED labels and ISO timestamp", async () => { + let capturedUserContent = ""; + const llm = fakeLlm({ + completeJson: { + "reward.reward.r_human.v4": (input: unknown) => { + const msgs = input as Array<{ role: string; content: string }>; + capturedUserContent 
= msgs.find((m) => m.role === "user")?.content ?? ""; + return { + goal_achievement: 0.8, + process_quality: 0.5, + user_satisfaction: 0.7, + label: "success", + reason: "ok", + }; + }, + }, + }); + + await scoreHuman( + { + episodeSummary: makeSummary(), + feedback: [ + makeFeedback({ channel: "explicit", polarity: "positive", ts: 1_700_000_000_000 as UserFeedback["ts"] }), + makeFeedback({ id: "fb_2" as never, channel: "implicit", polarity: "negative", ts: 1_700_000_001_000 as UserFeedback["ts"], text: "bad response" }), + ], + }, + { llm, cfg: { llmScoring: true } }, + ); + + expect(capturedUserContent).toContain("[USER/positive @"); + expect(capturedUserContent).toContain("[INFERRED/negative @"); + expect(capturedUserContent).toMatch(/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}/); + }); }); diff --git a/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts b/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts index 1a3c6bdae..f0c35e857 100644 --- a/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts @@ -202,7 +202,7 @@ describe("reward/integration", () => { const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v3": { + "reward.reward.r_human.v4": { goal_achievement: 0.9, process_quality: 0.7, user_satisfaction: 0.8, @@ -281,11 +281,43 @@ describe("reward/integration", () => { expect(res.rHuman).toBe(0); expect(res.humanScore.source).toBe("heuristic"); + const epMeta = (handle.repos.episodes.getById(eid as unknown as EpisodeRow["id"])! + .meta ?? 
{}) as { reward?: { trigger?: string } }; + expect(epMeta.reward?.trigger).toBe("implicit_fallback"); + const t = handle.repos.traces.getById("tr_x" as unknown as TraceRow["id"])!; expect(t.value).toBe(0); expect(t.priority).toBe(0); }); + it("overrides meta.reward.trigger to explicit_feedback when DB has feedback rows", async () => { + const sid = "s_int_trig"; + const eid = "ep_int_trig"; + seedEpisode(handle, eid, sid, ["tr_trig"]); + seedTrace(handle, "tr_trig", eid, sid, { alpha: 0.5 }); + seedFeedback(handle, "fb_trig", eid, { polarity: "positive", rationale: "good" }); + + const runner = createRewardRunner({ + tracesRepo: handle.repos.traces, + episodesRepo: handle.repos.episodes, + feedbackRepo: handle.repos.feedback, + llm: null, + bus: createRewardEventBus(), + cfg: cfg(), + now: () => NOW, + }); + + await runner.run({ + episodeId: eid as unknown as Parameters[0]["episodeId"], + feedback: [], + trigger: "implicit_fallback", + }); + + const ep = handle.repos.episodes.getById(eid as unknown as EpisodeRow["id"])!; + const rewardMeta = (ep.meta ?? 
{}) as { reward?: { trigger?: string } }; + expect(rewardMeta.reward?.trigger).toBe("explicit_feedback"); + }); + it("episodes with no traces still score R_human but skip backprop", async () => { const sid = "s_int_3"; const eid = "ep_int_3"; diff --git a/apps/memos-local-plugin/tests/unit/reward/subscriber.test.ts b/apps/memos-local-plugin/tests/unit/reward/subscriber.test.ts index 7516cc82b..9e51a1303 100644 --- a/apps/memos-local-plugin/tests/unit/reward/subscriber.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/subscriber.test.ts @@ -52,7 +52,7 @@ function makeRunner(spy: RunSpy, behavior: "ok" | "pending" | "error" = "ok") { }; } -function cfg(windowSec = 0): RewardConfig { +function cfg(windowSec = 5): RewardConfig { return { gamma: 0.9, lambda: 0.5, @@ -128,63 +128,61 @@ describe("reward/subscriber", () => { sub.stop(); }); - it("submitFeedback before window expires fires immediately with explicit trigger", async () => { + it("submitFeedback during window does not score until timer or drain", async () => { const spy: RunSpy = { calls: [] }; const bus = createCaptureEventBus(); const sub = attachRewardSubscriber(bus, makeRunner(spy, "ok"), cfg(10), {}); bus.emit(makeCaptureDone("ep_B")); sub.submitFeedback(makeFeedback("ep_B")); - // advance past window to prove we don't double-fire - await vi.advanceTimersByTimeAsync(20_000); + expect(spy.calls).toHaveLength(0); + + await vi.advanceTimersByTimeAsync(10_000); await sub.drain(); expect(spy.calls).toHaveLength(1); - expect(spy.calls[0]!.trigger).toBe("explicit_feedback"); - expect(spy.calls[0]!.feedback).toHaveLength(1); + expect(spy.calls[0]!.trigger).toBe("implicit_fallback"); + expect(spy.calls[0]!.feedback).toHaveLength(0); sub.stop(); }); - it("submitFeedback for unknown episode still triggers a run", async () => { + it("submitFeedback without pending episode is a no-op", async () => { const spy: RunSpy = { calls: [] }; const bus = createCaptureEventBus(); - const sub = attachRewardSubscriber(bus, 
makeRunner(spy, "ok"), cfg(0), {}); + const sub = attachRewardSubscriber(bus, makeRunner(spy, "ok"), cfg(5), {}); sub.submitFeedback(makeFeedback("ep_X")); await sub.drain(); - expect(spy.calls).toHaveLength(1); - expect(spy.calls[0]!.episodeId).toBe("ep_X"); - expect(spy.calls[0]!.trigger).toBe("explicit_feedback"); + expect(spy.calls).toHaveLength(0); sub.stop(); }); - it("skips capture results with zero traces", async () => { + it("feedbackWindowSec=0 is clamped to 1s; capture.done + drain scores once", async () => { const spy: RunSpy = { calls: [] }; const bus = createCaptureEventBus(); - const sub = attachRewardSubscriber(bus, makeRunner(spy, "ok"), cfg(5), {}); + const sub = attachRewardSubscriber(bus, makeRunner(spy, "ok"), cfg(0), {}); - bus.emit(makeCaptureDone("ep_empty", [])); - await vi.advanceTimersByTimeAsync(10_000); + bus.emit(makeCaptureDone("ep_C")); + expect(sub.pendingCount()).toBe(1); await sub.drain(); - expect(spy.calls).toHaveLength(0); + expect(spy.calls).toHaveLength(1); + expect(spy.calls[0]!.trigger).toBe("implicit_fallback"); + expect(spy.calls[0]!.feedback).toHaveLength(0); sub.stop(); }); - it("feedbackWindowSec=0 disables auto-fallback; only manual/explicit fires", async () => { + it("skips capture results with zero traces", async () => { const spy: RunSpy = { calls: [] }; const bus = createCaptureEventBus(); - const sub = attachRewardSubscriber(bus, makeRunner(spy, "ok"), cfg(0), {}); + const sub = attachRewardSubscriber(bus, makeRunner(spy, "ok"), cfg(5), {}); - bus.emit(makeCaptureDone("ep_C")); - await vi.advanceTimersByTimeAsync(100_000); + bus.emit(makeCaptureDone("ep_empty", [])); + await vi.advanceTimersByTimeAsync(10_000); await sub.drain(); - expect(spy.calls).toHaveLength(0); - await sub.runManually("ep_C" as unknown as Parameters[0], "manual"); - expect(spy.calls).toHaveLength(1); - expect(spy.calls[0]!.trigger).toBe("manual"); + expect(spy.calls).toHaveLength(0); sub.stop(); }); @@ -215,7 +213,22 @@ 
describe("reward/subscriber", () => { expect(spy.calls).toHaveLength(0); }); - it("pendingCount reports in-flight runs", async () => { + it("pendingCount includes scheduled episodes and in-flight runs", async () => { + const spy: RunSpy = { calls: [] }; + const bus = createCaptureEventBus(); + const sub = attachRewardSubscriber(bus, makeRunner(spy, "ok"), cfg(10), {}); + + bus.emit(makeCaptureDone("ep_sched")); + expect(sub.pendingCount()).toBe(1); + + await vi.advanceTimersByTimeAsync(10_100); + expect(sub.pendingCount()).toBe(0); + await sub.drain(); + expect(sub.pendingCount()).toBe(0); + sub.stop(); + }); + + it("pendingCount tracks in-flight run until settled", async () => { const spy: RunSpy = { calls: [] }; const bus = createCaptureEventBus(); const sub = attachRewardSubscriber(bus, makeRunner(spy, "pending"), cfg(1), {}); @@ -252,4 +265,18 @@ describe("reward/subscriber", () => { expect(sub.pendingCount()).toBe(0); sub.stop(); }); + + it("runManually still triggers a run", async () => { + const spy: RunSpy = { calls: [] }; + const bus = createCaptureEventBus(); + const sub = attachRewardSubscriber(bus, makeRunner(spy, "ok"), cfg(5), {}); + + bus.emit(makeCaptureDone("ep_manual")); + await sub.runManually("ep_manual" as unknown as Parameters[0], "manual"); + await sub.drain(); + + expect(spy.calls.length).toBeGreaterThanOrEqual(1); + expect(spy.calls.some((c) => c.trigger === "manual")).toBe(true); + sub.stop(); + }); }); diff --git a/apps/memos-local-plugin/tests/unit/reward/task-summary.test.ts b/apps/memos-local-plugin/tests/unit/reward/task-summary.test.ts index 986f3f30b..80a44fc51 100644 --- a/apps/memos-local-plugin/tests/unit/reward/task-summary.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/task-summary.test.ts @@ -107,10 +107,13 @@ describe("reward/task-summary", () => { ], }); const traces: TraceRow[] = []; - const sum = buildTaskSummary({ episode: ep, traces, cfg: { summaryMaxChars: 200 } }); + // Use 400 chars so the tail (180 chars 
from end) includes both + // EXECUTION_OUTCOME (~90 chars) and OUTCOME_MARKER_END before it. + const sum = buildTaskSummary({ episode: ep, traces, cfg: { summaryMaxChars: 400 } }); expect(sum.truncated).toBe(true); expect(sum.text).toMatch(/truncated/); expect(sum.text).toContain("OUTCOME_MARKER_END"); + expect(sum.text).toContain("EXECUTION_OUTCOME:"); }); it("falls back to descriptive placeholders when episode has no user/agent text", () => { @@ -130,6 +133,39 @@ describe("reward/task-summary", () => { expect(sum.agentActions).not.toMatch(/this text should NOT appear/); }); + it("EXECUTION_OUTCOME: successful tool call → task_completed_by_tool=yes", () => { + const ep = makeEpisode(); + const trace = makeTrace(1, { tool: "file_write" }); + const sum = buildTaskSummary({ episode: ep, traces: [trace], cfg: { summaryMaxChars: 2000 } }); + expect(sum.text).toContain("EXECUTION_OUTCOME:"); + expect(sum.text).toContain("task_completed_by_tool: yes"); + expect(sum.text).toContain("last_tool_result: SUCCESS"); + expect(sum.text).toContain("tool: file_write"); + }); + + it("EXECUTION_OUTCOME: last tool call with errorCode → task_completed_by_tool=no", () => { + const ep = makeEpisode(); + const traceWithError: TraceRow = { + ...makeTrace(1), + toolCalls: [ + { name: "docker_deploy", input: {}, errorCode: "TIMEOUT" }, + ] as TraceRow["toolCalls"], + }; + const sum = buildTaskSummary({ episode: ep, traces: [traceWithError], cfg: { summaryMaxChars: 2000 } }); + expect(sum.text).toContain("task_completed_by_tool: no"); + expect(sum.text).toContain("last_tool_result: ERROR"); + expect(sum.text).toContain("TIMEOUT"); + }); + + it("EXECUTION_OUTCOME: no tool calls → task_completed_by_tool=unknown", () => { + const ep = makeEpisode(); + const textOnlyTrace = makeTrace(1, { text: "Here is the answer" }); + const sum = buildTaskSummary({ episode: ep, traces: [textOnlyTrace], cfg: { summaryMaxChars: 2000 } }); + expect(sum.text).toContain("task_completed_by_tool: unknown"); + 
expect(sum.text).toContain("last_tool_result: NONE"); + expect(sum.text).toContain("total_tool_calls: 0"); + }); + it("includes host and evaluator model context for identity-sensitive scoring", () => { const ep = makeEpisode({ meta: { From a466d42d5bc874283c2e78fc72ce71ea3661ba3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Wed, 3 Jun 2026 20:52:50 +0800 Subject: [PATCH 2/5] fix: update prompt --- .../llm/prompts/failure-experience-sink.ts | 52 ++++++++++++------- .../core/pipeline/memory-core.ts | 2 +- .../tests/unit/llm/prompts.test.ts | 15 ++++++ .../tests/unit/memory/l2/subscriber.test.ts | 8 +-- .../viewer/src/stores/i18n.ts | 4 +- 5 files changed, 55 insertions(+), 26 deletions(-) diff --git a/apps/memos-local-plugin/core/llm/prompts/failure-experience-sink.ts b/apps/memos-local-plugin/core/llm/prompts/failure-experience-sink.ts index 6e89de606..33f21cbb3 100644 --- a/apps/memos-local-plugin/core/llm/prompts/failure-experience-sink.ts +++ b/apps/memos-local-plugin/core/llm/prompts/failure-experience-sink.ts @@ -2,34 +2,48 @@ import type { PromptDef } from "./index.js"; export const FAILURE_EXPERIENCE_SINK_PROMPT: PromptDef = { id: "failure.experience.sink", - version: 1, - description: "Induce failure-aware candidate policy from a failed episode, including time-anchored corrective feedback.", - system: `You induce a candidate policy from a failed agent episode. + version: 4, + description: + "Induce reusable task-completion guidance from a failed attempt and corrective feedback.", + system: `You induce a candidate policy from an episode where the task was not finished satisfactorily. Goal: -- Extract one reusable policy that helps avoid or repair similar failures. -- The policy must be operational (trigger + procedure + verification), not generic commentary. +- Extract one reusable policy that helps a similar task reach a satisfactory finish. 
+- Make it operational: trigger + procedure + verification. Prefer practical guidance (priorities, sequencing, closure checks) over abstract commentary. +- Use corrective_signals to see what the goal still needed; use phase_chunks and episode_timeline for context. Input: -- phase_chunks: recent traces with trace_ts / turn_id (conversation + tools). -- episode_timeline.turns: ordered user turns with started_at / ended_at (epoch ms). -- corrective_signals: human or verifier feedback with turn_index, timing, and deltas vs trace/turn timestamps. - Feedback that arrives AFTER a turn ended often corrects the agent's reply on that turn — weight timing heavily. - -Rules: -1) Stay grounded in phase_chunks, episode_timeline, and corrective_signals. Do not invent tests/files/errors. -2) When corrective_signals exist, merge their intent into decision_guidance; prefer signals with clear turn_index + after_turn timing for anti-patterns on that turn. -3) Keep trigger task-level and recognizable at decision time. -4) If you can propose what to do, use "repair_instruction"; if only what to avoid, use "failure_avoidance". -5) decision_guidance.prefer should contain positive corrective hints (may be empty). -6) decision_guidance.avoid should contain anti-pattern hints (may be empty). +- task_context.user_goal: task framing and requirements (may be truncated). +- phase_chunks: recent traces (conversation + limited tool output snippets). +- episode_timeline.turns: ordered user turns with timing. +- corrective_signals: feedback with turn_index and timing relative to turns. + +Evidence: +1) Ground only in the fields above. Do not invent tests, files, errors, or violations. +2) task_context states requirements; it does not by itself show what went wrong in the attempt. +3) Tie each claim to a quotable phenomenon (e.g. external judgment still open, requested substance missing, timeout without deliverable, feedback naming an unmet acceptance criterion). 
+4) If evidence is thin, keep the policy narrow and note limits in boundary. + +Guidance: +5) prefer: habits that advance completion (may be empty). +6) avoid: habits that leave the goal unmet—outcome/behavior gaps only. Do not name tools or channels; do not use "do not use / never call" style lines. +7) procedure and verification must be checkable from visible outcomes or judgments in the input. +8) verification: how to tell the task is done or accepted. + +Types: +9) "failure_avoidance" when feedback shows the goal stayed open and you mainly generalize what to stop doing before ending. +10) "repair_instruction" when you can give a repeatable completion pattern (what to finish or confirm before done). + +Other: +11) trigger: task-level, recognizable when a similar task starts or nears closure. +12) support_trace_ids: only traces you actually used. Return JSON: { "title": "short title", "trigger": "state condition", - "procedure": "step-by-step action template", - "verification": "how to verify fix", + "procedure": "step-by-step guidance", + "verification": "how to verify completion", "boundary": "scope/limits", "experience_type": "repair_instruction | failure_avoidance", "decision_guidance": { diff --git a/apps/memos-local-plugin/core/pipeline/memory-core.ts b/apps/memos-local-plugin/core/pipeline/memory-core.ts index 2ac8240d7..bcb676b1f 100644 --- a/apps/memos-local-plugin/core/pipeline/memory-core.ts +++ b/apps/memos-local-plugin/core/pipeline/memory-core.ts @@ -5565,7 +5565,7 @@ export function deriveSkillStatus( if (ep.rTask < R_BELOW_THRESHOLD) { return { status: "not_generated", - reason: `任务评分 R=${ep.rTask.toFixed(2)} 未达到沉淀阈值`, + reason: `任务评分 R=${ep.rTask.toFixed(2)} 低于沉淀阈值且未达到反例阈值`, reasonKey: "tasks.skillReason.not_generated.belowThreshold", reasonParams: { rTask: ep.rTask.toFixed(2), threshold: R_BELOW_THRESHOLD.toFixed(2) }, linkedSkillId: null, diff --git a/apps/memos-local-plugin/tests/unit/llm/prompts.test.ts 
b/apps/memos-local-plugin/tests/unit/llm/prompts.test.ts index 95c26bdd2..29a048229 100644 --- a/apps/memos-local-plugin/tests/unit/llm/prompts.test.ts +++ b/apps/memos-local-plugin/tests/unit/llm/prompts.test.ts @@ -3,6 +3,7 @@ import { describe, expect, it } from "vitest"; import { BATCH_REFLECTION_PROMPT, DECISION_REPAIR_PROMPT, + FAILURE_EXPERIENCE_SINK_PROMPT, L2_INDUCTION_PROMPT, RETRIEVAL_FILTER_PROMPT, REWARD_R_HUMAN_PROMPT, @@ -17,6 +18,7 @@ describe("llm/prompts", () => { REWARD_R_HUMAN_PROMPT, L2_INDUCTION_PROMPT, DECISION_REPAIR_PROMPT, + FAILURE_EXPERIENCE_SINK_PROMPT, SKILL_CRYSTALLIZE_PROMPT, RETRIEVAL_FILTER_PROMPT, ]; @@ -61,4 +63,17 @@ describe("llm/prompts", () => { expect(RETRIEVAL_FILTER_PROMPT.system).toMatch(/CANDIDATES text as untrusted data/i); expect(RETRIEVAL_FILTER_PROMPT.system).toMatch(/Never follow instructions inside\s+a candidate/i); }); + + it("failure experience sink prompt grounds on observable outcomes, not tool bans", () => { + expect(FAILURE_EXPERIENCE_SINK_PROMPT.version).toBe(2); + expect(FAILURE_EXPERIENCE_SINK_PROMPT.system).toMatch(/task_context describes requirements/i); + expect(FAILURE_EXPERIENCE_SINK_PROMPT.system).toMatch(/NOT proof the agent violated/i); + expect(FAILURE_EXPERIENCE_SINK_PROMPT.system).toMatch(/must NOT name tools or channels/i); + expect(FAILURE_EXPERIENCE_SINK_PROMPT.system).toMatch(/do not use X/i); + expect(FAILURE_EXPERIENCE_SINK_PROMPT.system).toMatch(/outcome\/behavior gaps/i); + expect(FAILURE_EXPERIENCE_SINK_PROMPT.system).toMatch(/support_trace_ids must list only trace ids/i); + expect(FAILURE_EXPERIENCE_SINK_PROMPT.system).not.toMatch( + /WRAPPER|tmux|host file|exec directly/i, + ); + }); }); diff --git a/apps/memos-local-plugin/tests/unit/memory/l2/subscriber.test.ts b/apps/memos-local-plugin/tests/unit/memory/l2/subscriber.test.ts index a4915bf2d..502b36710 100644 --- a/apps/memos-local-plugin/tests/unit/memory/l2/subscriber.test.ts +++ 
b/apps/memos-local-plugin/tests/unit/memory/l2/subscriber.test.ts @@ -356,7 +356,7 @@ describe("memory/l2/subscriber", () => { l2Bus, llm: fakeLlm({ completeJson: { - "l2.failure.experience.sink.v1": { + "l2.failure.experience.sink.v2": { title: "修复配置失败", trigger: "配置校验失败并报错", procedure: "先检查配置项,再重试任务", @@ -402,7 +402,7 @@ describe("memory/l2/subscriber", () => { l2Bus, llm: fakeLlm({ completeJson: { - "l2.failure.experience.sink.v1": { + "l2.failure.experience.sink.v2": { title: "空指导", trigger: "触发", procedure: "步骤", @@ -460,7 +460,7 @@ describe("memory/l2/subscriber", () => { l2Bus, llm: fakeLlm({ completeJson: { - "l2.failure.experience.sink.v1": { + "l2.failure.experience.sink.v2": { title: "修复空反馈失败", trigger: "失败触发", procedure: "先检查输入", @@ -724,7 +724,7 @@ describe("memory/l2/subscriber", () => { l2Bus, llm: fakeLlm({ completeJson: { - "l2.failure.experience.sink.v1": { + "l2.failure.experience.sink.v2": { title: "runOnce sink", trigger: "失败触发", procedure: "检查配置", diff --git a/apps/memos-local-plugin/viewer/src/stores/i18n.ts b/apps/memos-local-plugin/viewer/src/stores/i18n.ts index 1946b27e0..6766a4bb1 100644 --- a/apps/memos-local-plugin/viewer/src/stores/i18n.ts +++ b/apps/memos-local-plugin/viewer/src/stores/i18n.ts @@ -568,7 +568,7 @@ const en = { "tasks.skillReason.skipped": "Low task score (R={rTask}), recorded as a counterexample; the system will try to derive avoidance guidance from this episode's traces and will not generate an invokable skill.", "tasks.skillReason.not_generated.belowThreshold": - "Task score R={rTask} is below the induction threshold (≥ {threshold}) — the conversation was normal, but not strong enough to generalize into an L2 experience; similar tasks will accumulate over time.", + "Task score R={rTask} is below the induction threshold (≥ {threshold}) but above the negative-example floor (≤ -0.50); this task has weak or negative signal, so no L2 experience will be generated yet. 
Similar future tasks can accumulate into reusable experience if they score higher.", "tasks.skillReason.not_generated.noPolicy": "No L2 experience is linked to this task yet — induction may still be processing asynchronously. Refresh in a moment to see the latest status.", "tasks.skillReason.generated": @@ -1425,7 +1425,7 @@ const zh: Record = { "tasks.skillReason.skipped": "任务评分较低 (R={rTask}),记为反例;系统会尝试从本次任务轨迹归纳规避建议,不会生成可调用技能。", "tasks.skillReason.not_generated.belowThreshold": - "任务评分 R={rTask} 未达到沉淀阈值 (≥ {threshold})——对话本身正常,只是还不够强到能泛化成 L2 经验;多做几个相似任务后会自动积累。", + "任务评分 R={rTask} 低于沉淀阈值 (≥ {threshold}),但未达到反例阈值 (≤ -0.50);本次任务信号偏弱或带有负向反馈,暂不会生成 L2 经验。后续相似任务若获得更高评分,会继续累积为可复用经验。", "tasks.skillReason.not_generated.noPolicy": "该任务暂未关联到 L2 经验——可能仍在异步归纳处理中。稍后刷新可查看最新状态。", "tasks.skillReason.generated": From 8ba7e764501600a3499b80aea1253c4032afac74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Thu, 4 Jun 2026 14:18:33 +0800 Subject: [PATCH 3/5] fix: capture error --- .../core/capture/capture.ts | 34 ++++++++++++++----- apps/memos-local-plugin/core/pipeline/deps.ts | 13 +++++++ .../core/session/episode-manager.ts | 9 +++-- 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/apps/memos-local-plugin/core/capture/capture.ts b/apps/memos-local-plugin/core/capture/capture.ts index dbcf3bc45..d2e9d95b7 100644 --- a/apps/memos-local-plugin/core/capture/capture.ts +++ b/apps/memos-local-plugin/core/capture/capture.ts @@ -71,6 +71,13 @@ export interface CaptureDeps { bus: CaptureEventBus; cfg: CaptureConfig; now?: () => number; + /** + * Called after the lite cursor is advanced so the session layer can + * propagate the new value into its in-memory episode snapshot. Without + * this hook, getEpisode() returns cursor=0 on every subsequent turn and + * runLite falls back to full extractSteps instead of the incremental path. 
+ */ + onLiteCursorAdvanced?: (episodeId: string, turnCount: number) => void; } export interface CaptureRunner { @@ -164,6 +171,12 @@ export function createCaptureRunner(deps: CaptureDeps): CaptureRunner { // combo silently re-inserted everything past the 50-row cap and any // multi-step turn that collided on the same ms, producing triplet rows // in the traces table (one per re-entry from lite / reflect / recovery). + // + // IMPORTANT: normalizeSteps runs BEFORE the signature comparison so that + // both sides use the same truncated text. Raw steps carry the full + // episode-turn text while DB rows store normalizeSteps-truncated text; + // comparing across that boundary causes false negatives (step appears + // novel, passes dedup, and a duplicate trace row is inserted). const anchorTurnId = resolveAnchorTurnId(input.episode); const extractStart = now(); const rawAll = extractIncrementalSteps(input.episode); @@ -171,23 +184,22 @@ export function createCaptureRunner(deps: CaptureDeps): CaptureRunner { const seenSignatures = new Set( existingTraces.map((row) => traceIdentitySignature(row, anchorTurnId)), ); - const raw = rawAll.filter( + const extractMs = now() - extractStart; + const normStart = now(); + const normalizedAll = normalizeSteps(rawAll, deps.cfg); + const normalized = normalizedAll.filter( (s) => !seenSignatures.has(stepIdentitySignature(s, anchorTurnId)), ); - const extractMs = now() - extractStart; + const normalizeMs = now() - normStart; log.debug("stage.extract.done", { phase: "lite", episodeId: input.episode.id, - steps: raw.length, - novel: raw.length, - skipped: rawAll.length - raw.length, + steps: normalized.length, + novel: normalized.length, + skipped: normalizedAll.length - normalized.length, durationMs: extractMs, }); - const normStart = now(); - const normalized = normalizeSteps(raw, deps.cfg); - const normalizeMs = now() - normStart; - if (normalized.length === 0) { advanceLiteCaptureCursor(input); const result = emptyResult(input, 
startedAt, { @@ -715,6 +727,10 @@ export function createCaptureRunner(deps: CaptureDeps): CaptureRunner { ...input.episode.meta, [CAPTURE_LITE_TURN_CURSOR_META]: turnCount, }; + // Propagate into the episode manager's authoritative in-memory snapshot + // so the next getEpisode() call returns the correct cursor and runLite + // uses the incremental path instead of full extractSteps. + deps.onLiteCursorAdvanced?.(input.episode.id, turnCount); } catch (err) { rootLogger.child({ channel: "core.capture" }).warn("capture.cursor_update_failed", { episodeId: input.episode.id, diff --git a/apps/memos-local-plugin/core/pipeline/deps.ts b/apps/memos-local-plugin/core/pipeline/deps.ts index 2cfbf0317..38308c0ca 100644 --- a/apps/memos-local-plugin/core/pipeline/deps.ts +++ b/apps/memos-local-plugin/core/pipeline/deps.ts @@ -91,6 +91,8 @@ import type { RetrievalDeps, RetrievalEventBus, } from "../retrieval/index.js"; +import type { EpisodeId } from "../../agent-contract/dto.js"; +import { CAPTURE_LITE_TURN_CURSOR_META } from "../episode/turn-anchor.js"; import type { PipelineAlgorithmConfig, @@ -214,6 +216,17 @@ export function buildPipelineSubscribers( bus: buses.capture, cfg: algorithm.capture, now: deps.now, + onLiteCursorAdvanced: session + ? 
(episodeId, turnCount) => { + try { + session.episodeManager.patchMeta(episodeId as EpisodeId, { + [CAPTURE_LITE_TURN_CURSOR_META]: turnCount, + }); + } catch { + // best-effort; next runLite will still dedup via DB signatures + } + } + : undefined, }); const rewardRunner = createRewardRunner({ diff --git a/apps/memos-local-plugin/core/session/episode-manager.ts b/apps/memos-local-plugin/core/session/episode-manager.ts index aeb8089a1..03da2bf98 100644 --- a/apps/memos-local-plugin/core/session/episode-manager.ts +++ b/apps/memos-local-plugin/core/session/episode-manager.ts @@ -334,8 +334,7 @@ export function createEpisodeManager(deps: EpisodeManagerDeps): EpisodeManager { previousScoredAt: rewardScoredAt(snap.meta), } : {}; - snap.meta = { - ...snap.meta, + const metaDelta: Record = { closeReason: undefined, topicState: "active", reopenedAt: now(), @@ -351,7 +350,11 @@ export function createEpisodeManager(deps: EpisodeManagerDeps): EpisodeManager { } : {}), }; - deps.episodesRepo.reopen(id, snap.meta); + snap.meta = { ...snap.meta, ...metaDelta }; + // Pass only the delta so updateMeta's merge doesn't overwrite fields + // (e.g. CAPTURE_LITE_TURN_CURSOR_META) that were correctly advanced in + // DB by advanceLiteCaptureCursor but are stale (=0) in the in-memory snap. 
+ deps.episodesRepo.reopen(id, metaDelta); log.info("episode.reopened", { episodeId: id, sessionId: snap.sessionId, From 2993e4aa900d2a306e624303f93f16e21eb37e4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Thu, 4 Jun 2026 19:00:44 +0800 Subject: [PATCH 4/5] fix: reward --- .../agent-contract/episode-status.ts | 2 +- .../core/config/defaults.ts | 2 +- apps/memos-local-plugin/core/config/schema.ts | 2 +- .../core/episode/outcome.ts | 2 +- .../core/experience/feedback-builder.ts | 338 +++++++++++------- .../core/experience/trace-selection.ts | 194 ++++++++++ .../core/llm/prompts/reward.ts | 54 +-- .../core/pipeline/memory-core.ts | 4 +- .../core/reward/task-summary.ts | 40 ++- .../core/skill/repair-candidate.ts | 68 +++- .../core/skill/subscriber.ts | 6 +- .../tests/unit/episode/outcome.test.ts | 6 +- .../unit/experience/feedback-builder.test.ts | 167 ++++++++- .../tests/unit/reward/human-scorer.test.ts | 8 +- .../unit/reward/reward.integration.test.ts | 4 +- .../tests/unit/server/http.test.ts | 2 +- .../tests/unit/skill/_helpers.ts | 2 +- .../tests/unit/skill/repair-candidate.test.ts | 58 ++- .../viewer/src/stores/i18n.ts | 8 +- 19 files changed, 744 insertions(+), 223 deletions(-) create mode 100644 apps/memos-local-plugin/core/experience/trace-selection.ts diff --git a/apps/memos-local-plugin/agent-contract/episode-status.ts b/apps/memos-local-plugin/agent-contract/episode-status.ts index 7c0adad46..f14e52718 100644 --- a/apps/memos-local-plugin/agent-contract/episode-status.ts +++ b/apps/memos-local-plugin/agent-contract/episode-status.ts @@ -41,7 +41,7 @@ export type DerivedTaskStatus = Exclude; * the task list — the soft-fail framing (未达沉淀阈值) lives on the * skill pipeline pill, not the main task status. 
*/ -export const R_NEGATIVE_FLOOR = -0.5; +export const R_NEGATIVE_FLOOR = -0.15; /** * Recently-finalized grace window: a closed-but-just-ended episode diff --git a/apps/memos-local-plugin/core/config/defaults.ts b/apps/memos-local-plugin/core/config/defaults.ts index 0fad2d956..01ab88d8a 100644 --- a/apps/memos-local-plugin/core/config/defaults.ts +++ b/apps/memos-local-plugin/core/config/defaults.ts @@ -193,7 +193,7 @@ export const DEFAULT_CONFIG: ResolvedConfig = { repairCandidateMinEta: 0.5, outputLanguageMode: "follow_policy", outcomeRTaskSuccessThreshold: 0.5, - outcomeRTaskFailureThreshold: -0.5, + outcomeRTaskFailureThreshold: -0.15, failureEpisodeScorePenalty: 0, failureEpisodeMaxRatio: 0.4, }, diff --git a/apps/memos-local-plugin/core/config/schema.ts b/apps/memos-local-plugin/core/config/schema.ts index 5bad29414..c2bb8cb97 100644 --- a/apps/memos-local-plugin/core/config/schema.ts +++ b/apps/memos-local-plugin/core/config/schema.ts @@ -291,7 +291,7 @@ const AlgorithmSchema = Type.Object({ { default: "follow_policy" }, ), outcomeRTaskSuccessThreshold: NumberInRange(0.5, -1, 1), - outcomeRTaskFailureThreshold: NumberInRange(-0.5, -1, 1), + outcomeRTaskFailureThreshold: NumberInRange(-0.15, -1, 1), failureEpisodeScorePenalty: NumberInRange(0, 0, 2), failureEpisodeMaxRatio: NumberInRange(0.4, 0, 1), }, { default: {} }), diff --git a/apps/memos-local-plugin/core/episode/outcome.ts b/apps/memos-local-plugin/core/episode/outcome.ts index e597e481a..0f2690c08 100644 --- a/apps/memos-local-plugin/core/episode/outcome.ts +++ b/apps/memos-local-plugin/core/episode/outcome.ts @@ -16,7 +16,7 @@ export interface OutcomeThresholds { export const DEFAULT_OUTCOME_THRESHOLDS: OutcomeThresholds = { successThreshold: 0.5, - failureThreshold: -0.5, + failureThreshold: -0.15, }; /** diff --git a/apps/memos-local-plugin/core/experience/feedback-builder.ts b/apps/memos-local-plugin/core/experience/feedback-builder.ts index be1ef3f31..e8b2826ed 100644 --- 
a/apps/memos-local-plugin/core/experience/feedback-builder.ts +++ b/apps/memos-local-plugin/core/experience/feedback-builder.ts @@ -21,6 +21,7 @@ import type { Logger } from "../logger/types.js"; import type { Repos } from "../storage/repos/index.js"; import type { EmbeddingVector } from "../types.js"; import { MemosError, ERROR_CODES } from "../../agent-contract/errors.js"; +import { compressFeedbackEpisodeTraces, formatFeedbackTraceTurn } from "./trace-selection.js"; export interface FeedbackExperienceResult { created: boolean; @@ -75,6 +76,7 @@ interface DraftExperience { confidence: number; skillEligible: boolean; verifierMeta: Record | null; + similarityKey: FeedbackSimilarityKey | null; vectorText: string; refineFallback?: RefineFallbackEvent; } @@ -91,6 +93,7 @@ interface EpisodeContext { const MIN_SIGNIFICANCE = 0.5; const MERGE_SIMILARITY = 0.72; +const STRICT_SIMILARITY = 0.82; const MAX_TITLE_CHARS = 120; const MAX_LINE_CHARS = 360; const REFINE_TIMEOUT_MS = 30_000; @@ -99,6 +102,16 @@ const REFINE_MAX_CONTEXT_CHARS = 16_000; // reward scales — anything short of 1 means the task was not fully solved). const FULL_PASS_REWARD = 1; +type FeedbackSourceKind = "verifier" | "evaluator" | "user_feedback" | "manual" | "unknown"; +type FeedbackOutcomeKind = "pass" | "fail" | "partial" | "unknown"; + +interface FeedbackSimilarityKey { + sourceKind: FeedbackSourceKind; + outcomeKind: FeedbackOutcomeKind; + taskKind?: string; + issueKind?: string; +} + export async function runFeedbackExperience( input: FeedbackExperienceInput, deps: FeedbackExperienceDeps, @@ -272,6 +285,8 @@ async function buildDraft(args: { // Objective outcome dominates; lexical signals only decide when it is unknown. 
const pass = outcome === "pass" || (outcome === "unknown" && lexicalPass && !lexicalFail); const fail = outcome === "fail" || (outcome === "unknown" && lexicalFail); + const sourceKind = feedbackSourceKind(args.feedback.raw, verifier, lower); + const metaOnlyFeedback = isMetaOnlyFeedback(args.text, verifier); const hasAvoid = /\b(avoid|do not|don't|never|stop|wrong|incorrect|failed|fail)\b/i.test(args.text) || /不要|别|不能|错误|失败|反例/.test(args.text); @@ -281,7 +296,6 @@ async function buildDraft(args: { if (pass) { type = "success_pattern"; polarity = "positive"; - skillEligible = true; } else if (fail) { // Objective failure: never a positive exemplar, never skill-eligible. type = hasAvoid ? "failure_avoidance" : verifier ? "verifier_feedback" : "repair_instruction"; @@ -302,6 +316,13 @@ async function buildDraft(args: { type = "repair_instruction"; polarity = "neutral"; } + const similarityKey = deriveFeedbackSimilarityKey({ + sourceKind, + outcome, + text: args.text, + type, + polarity, + }); // Try LLM refinement for better guidance extraction let title: string; @@ -412,6 +433,16 @@ async function buildDraft(args: { antiPattern: guidance.antiPattern, }; } + if (pass) { + skillEligible = !metaOnlyFeedback && !isMetaOnlyDerivedGuidance([ + title, + trigger, + procedure, + verification, + ...guidance.preference, + ...guidance.antiPattern, + ]); + } const boundary = [ "Use only for similar task shape, evaluator expectation, or user preference.", @@ -433,6 +464,7 @@ async function buildDraft(args: { confidence, skillEligible, verifierMeta: verifier, + similarityKey, vectorText: [title, trigger, procedure, verification, boundary].join("\n"), refineFallback, }; @@ -481,6 +513,148 @@ function buildDraftFallback( return { title, trigger, procedure, verification, guidance }; } +function deriveFeedbackSimilarityKey(args: { + sourceKind: FeedbackSourceKind; + outcome: ObjectiveOutcome; + text: string; + type: ExperienceType; + polarity: EvidencePolarity; +}): 
FeedbackSimilarityKey | null { + const outcomeKind = outcomeKindOf(args.outcome, args.polarity); + return { + sourceKind: args.sourceKind, + outcomeKind, + taskKind: extractTaskKind(args.text), + issueKind: issueKindOf(args.type, args.polarity), + }; +} + +function deriveFeedbackSimilarityKeyFromPolicy(row: PolicyRow): FeedbackSimilarityKey | null { + const text = [ + row.title, + row.trigger, + row.procedure, + row.verification, + row.boundary, + ...(row.decisionGuidance?.preference ?? []), + ...(row.decisionGuidance?.antiPattern ?? []), + ].join("\n"); + const meta = row.verifierMeta as Record | null | undefined; + const sourceKind = meta ? "verifier" : "unknown"; + return { + sourceKind, + outcomeKind: outcomeKindOf(null, row.evidencePolarity ?? "neutral"), + taskKind: extractTaskKind(text), + issueKind: issueKindOf(row.experienceType ?? "repair_instruction", row.evidencePolarity ?? "neutral"), + }; +} + +function feedbackSourceKind( + raw: unknown, + verifierMeta: Record | null, + lower: string, +): FeedbackSourceKind { + const src = rawSource(raw); + if (src && /evaluator|evoagentbench|benchmark|gateway/i.test(src)) return "evaluator"; + if (src && /verifier|verification/i.test(src)) return "verifier"; + if (verifierMeta || /\bverifier\b|\bverification\b/.test(lower)) return "verifier"; + return "unknown"; +} + +function rawSource(raw: unknown): string | null { + let obj: unknown = raw; + if (typeof obj === "string") { + try { + obj = JSON.parse(obj); + } catch { + return null; + } + } + if (typeof obj !== "object" || obj == null) return null; + const source = (obj as Record).source; + return typeof source === "string" && source.trim() ? 
source.trim() : null; +} + +function outcomeKindOf(outcome: ObjectiveOutcome | null, polarity: EvidencePolarity): FeedbackOutcomeKind { + if (outcome === "pass") return "pass"; + if (outcome === "fail") return "fail"; + if (polarity === "positive") return "pass"; + if (polarity === "negative") return "fail"; + return "unknown"; +} + +function issueKindOf(type: ExperienceType, polarity: EvidencePolarity): string { + return `${type}:${polarity}`; +} + +const TASK_STOPWORDS = new Set([ + "verifier", + "feedback", + "previous", + "attempt", + "reward", + "passed", + "total", + "failed", + "failure", + "success", + "avoid", + "prefer", + "repair", + "instead", + "using", + "wrong", + "correct", + "field", + "next", + "time", + "please", + "briefly", + "reflect", + "what", + "would", + "keep", + "improve", +]); + +function extractTaskKind(text: string): string | undefined { + const lower = text.toLowerCase(); + if (/\bsec\s*13f\b/.test(lower)) return "sec_13f"; + if (/\bfft\b|autocorrelation|triplets?/.test(lower)) return "array_triplets"; + const tokens = lower + .split(/[^a-z0-9\u4e00-\u9fff]+/u) + .map((t) => t.trim()) + .filter((t) => t.length >= 3 && !TASK_STOPWORDS.has(t) && !/^\d+$/.test(t)); + return tokens[0]?.slice(0, 48); +} + +function isMetaOnlyFeedback(text: string, verifierMeta: Record | null): boolean { + const lower = text.toLowerCase(); + if (/\bheartbeat_ok\b/.test(lower)) return true; + const hasVerifier = Boolean(verifierMeta) + || /\bverifier\b|\bverification\b|verifier reward|passed\s*[:=]|resolved\s*[:=]/i.test(text); + const asksReflection = /please briefly reflect|what you would keep|what you would improve|reflect on what|反思/.test(lower); + if (hasVerifier && asksReflection) return true; + if (!hasVerifier) return false; + const stripped = lower + .replace(/\bverifier feedback(?: for the previous attempt)?\b/g, " ") + .replace(/\bverifier reward\b|\breward\b|\bresolved\b|\bpassed\b|\btotal\b|\bscore\b|\bstatus\b/g, " ") + 
.replace(/\bpass(?:ed)?\b|\bsuccess(?:ful)?\b|\bcorrect\b|\bfailed?\b|\bfailure\b/g, " ") + .replace(/\btrue\b|\bfalse\b|\byes\b|\bno\b/g, " ") + .replace(/[0-9._:-]+/g, " "); + const meaningful = stripped + .split(/[^a-z0-9\u4e00-\u9fff]+/u) + .map((t) => t.trim()) + .filter((t) => t.length >= 3 && !TASK_STOPWORDS.has(t)); + return meaningful.length === 0; +} + +function isMetaOnlyDerivedGuidance(parts: readonly string[]): boolean { + const text = parts.join("\n"); + return isMetaOnlyFeedback(text, null) + || /please briefly reflect|what you would keep|what you would improve|\bheartbeat_ok\b/i.test(text); +} + function guidanceOf( type: ExperienceType, classified: ReturnType, @@ -585,6 +759,7 @@ function findSimilarPolicy( deps: FeedbackExperienceDeps, ): PolicyRow | null { if (!vec) return null; + if (isSimilarityBannedSource(draft.similarityKey?.sourceKind)) return null; const hits = deps.repos.policies.searchByVector(vec, 5, { statusIn: ["active", "candidate"], hardCap: 50, @@ -594,14 +769,33 @@ function findSimilarPolicy( const row = deps.repos.policies.getById(hit.id as PolicyId); if (!row) continue; if (row.mergeFamily && row.mergeFamily !== mergeFamily) continue; - if (row.experienceType && row.experienceType !== draft.type && hit.score < 0.82) { - continue; - } + const minScore = feedbackSimilarityThreshold(draft, row); + if (minScore == null || hit.score < minScore) continue; return row; } return null; } +function feedbackSimilarityThreshold(draft: DraftExperience, row: PolicyRow): number | null { + const incoming = draft.similarityKey; + const existing = deriveFeedbackSimilarityKeyFromPolicy(row); + if (!incoming || !existing) return null; + if (isSimilarityBannedSource(incoming.sourceKind) || isSimilarityBannedSource(existing.sourceKind)) { + return null; + } + if (incoming.sourceKind !== existing.sourceKind) return null; + if (incoming.outcomeKind !== existing.outcomeKind) return null; + if (incoming.taskKind && existing.taskKind && incoming.taskKind 
!== existing.taskKind) return null; + if (incoming.issueKind && existing.issueKind && incoming.issueKind !== existing.issueKind) return null; + const missingKeyPart = !incoming.taskKind || !existing.taskKind || !incoming.issueKind || !existing.issueKind; + const typeMismatch = row.experienceType && row.experienceType !== draft.type; + return missingKeyPart || typeMismatch ? STRICT_SIMILARITY : MERGE_SIMILARITY; +} + +function isSimilarityBannedSource(sourceKind: FeedbackSourceKind | undefined): boolean { + return sourceKind === "verifier" || sourceKind === "evaluator"; +} + function mergePolicy( existing: PolicyRow, draft: DraftExperience, @@ -860,7 +1054,7 @@ function buildEpisodeContext( // Fallback: use current trace only if (!episode?.traceIds || episode.traceIds.length === 0) { if (currentTrace) { - const block = formatTurn(1, currentTrace); + const block = formatFeedbackTraceTurn(1, currentTrace); return { userRequest: currentTrace.userText, agentResponse: currentTrace.agentText, @@ -890,7 +1084,7 @@ function buildEpisodeContext( } if (traces.length === 0) { - const block = currentTrace ? formatTurn(1, currentTrace) : ""; + const block = currentTrace ? formatFeedbackTraceTurn(1, currentTrace) : ""; return { userRequest: currentTrace?.userText ?? "", agentResponse: currentTrace?.agentText ?? "", @@ -902,9 +1096,9 @@ function buildEpisodeContext( }; } - const selected = compressEpisodeTraces(traces, feedbackText, REFINE_MAX_CONTEXT_CHARS); + const selected = compressFeedbackEpisodeTraces(traces, feedbackText, REFINE_MAX_CONTEXT_CHARS); const contextParts = selected.kept.map((item) => - formatTurn(item.idx + 1, item.trace), + formatFeedbackTraceTurn(item.idx + 1, item.trace), ); const fullContext = contextParts.join("\n\n"); const firstTurn = selected.kept[0]?.trace ?? 
traces[0]; @@ -921,128 +1115,6 @@ function buildEpisodeContext( }; } -function compressEpisodeTraces( - traces: readonly TraceRow[], - feedbackText: string, - maxChars: number, -): { - kept: Array<{ trace: TraceRow; idx: number; text: string; value: number }>; - droppedCount: number; -} { - const feedbackKeywords = extractFeedbackKeywords(feedbackText); - const firstId = traces[0]?.id; - const lastId = traces[traces.length - 1]?.id; - const entries = traces.map((trace, idx) => { - const text = formatTurn(idx + 1, trace); - const value = traceInformationValue(trace, feedbackKeywords, firstId, lastId); - return { trace, idx, text, value }; - }); - const kept = [...entries]; - let total = kept.reduce((sum, item) => sum + item.text.length, 0) + Math.max(0, kept.length - 1) * 2; - while (total > maxChars && kept.length > 1) { - let dropIdx = -1; - let dropValue = Infinity; - let dropLen = Infinity; - for (let i = 0; i < kept.length; i++) { - const entry = kept[i]!; - if (isProtectedTrace(entry.trace, firstId, lastId)) continue; - const better = - entry.value < dropValue - || (entry.value === dropValue && entry.text.length < dropLen); - if (better) { - dropIdx = i; - dropValue = entry.value; - dropLen = entry.text.length; - } - } - if (dropIdx < 0) break; - kept.splice(dropIdx, 1); - total = kept.reduce((sum, item) => sum + item.text.length, 0) + Math.max(0, kept.length - 1) * 2; - } - kept.sort((a, b) => a.idx - b.idx); - return { kept, droppedCount: entries.length - kept.length }; -} - -/** Capture-time reflection label: pivotal steps must survive context compression. 
*/ -function isPivotalTrace(trace: TraceRow): boolean { - return trace.reflection?.trim() === "PIVOTAL"; -} - -function isProtectedTrace( - trace: TraceRow, - firstId: string | undefined, - lastId: string | undefined, -): boolean { - return trace.id === firstId || trace.id === lastId || isPivotalTrace(trace); -} - -function traceInformationValue( - trace: TraceRow, - feedbackKeywords: readonly string[], - firstId: string | undefined, - lastId: string | undefined, -): number { - let score = 0; - if (trace.id === firstId) score += 80; - if (trace.id === lastId) score += 80; - if (isPivotalTrace(trace)) score += 70; - if ((trace.toolCalls?.length ?? 0) > 0) score += 18; - if ((trace.errorSignatures?.length ?? 0) > 0) score += 42; - if (trace.toolCalls?.some((tool) => typeof tool.errorCode === "string" && tool.errorCode.trim().length > 0)) { - score += 45; - } - const corpus = `${trace.userText}\n${trace.agentText}`.toLowerCase(); - if (feedbackKeywords.some((kw) => corpus.includes(kw))) score += 30; - if (/error|failed|failure|timeout|exception|错误|失败|超时/i.test(corpus)) score += 28; - if (trace.userText.trim().length > 0) score += 5; - if (trace.agentText.trim().length > 0) score += 5; - return score; -} - -function extractFeedbackKeywords(text: string): string[] { - const normalized = text.toLowerCase(); - const tokens = normalized.split(/[^a-z0-9\u4e00-\u9fff]+/u) - .map((t) => t.trim()) - .filter(Boolean) - .filter((t) => (/[a-z0-9]/.test(t) ? t.length >= 4 : t.length >= 2)); - return dedupeLines(tokens).slice(0, 32); -} - -function formatTurn(turnNumber: number, trace: TraceRow): string { - const userText = truncate(trace.userText, 280); - const agentText = truncate(trace.agentText, 360); - const toolSummary = summarizeTools(trace.toolCalls ?? []); - const errorSummary = summarizeErrors(trace); - const lines = [ - `Turn ${turnNumber}:`, - `User: ${userText}`, - `Agent: ${agentText}`, - toolSummary ? `Tools: ${toolSummary}` : null, - errorSummary ? 
`Errors: ${errorSummary}` : null, - ].filter((line): line is string => typeof line === "string"); - return lines.join("\n"); -} - -function summarizeTools(toolCalls: TraceRow["toolCalls"]): string { - if (!Array.isArray(toolCalls) || toolCalls.length === 0) return ""; - const pieces = toolCalls.slice(0, 4).map((tool) => { - const name = tool.name || "unknown"; - const code = typeof tool.errorCode === "string" && tool.errorCode.trim() ? `#${tool.errorCode}` : ""; - const output = typeof tool.output === "string" ? truncate(tool.output.replace(/\s+/g, " "), 80) : ""; - return [name, code, output].filter(Boolean).join(" "); - }); - return truncate(pieces.join(" | "), 280); -} - -function summarizeErrors(trace: TraceRow): string { - const sig = (trace.errorSignatures ?? []).slice(0, 3).map((s) => truncate(s, 80)); - const codes = (trace.toolCalls ?? []) - .map((tool) => tool.errorCode) - .filter((code): code is string => typeof code === "string" && code.trim().length > 0); - const merged = dedupeLines([...codes, ...sig]); - return truncate(merged.join(" | "), 260); -} - function classifyLlmFallbackReason(err: unknown): Exclude { if (MemosError.is(err)) { if (err.code === ERROR_CODES.LLM_TIMEOUT) return "llm_timeout"; @@ -1053,12 +1125,6 @@ function classifyLlmFallbackReason(err: unknown): Exclude maxChars && kept.length > 1) { + let dropIdx = -1; + let dropValue = Infinity; + let dropLen = Infinity; + for (let i = 0; i < kept.length; i++) { + const entry = kept[i]!; + if (isProtectedTrace(entry.trace, firstId, lastId)) continue; + const better = + entry.value < dropValue + || (entry.value === dropValue && entry.text.length < dropLen); + if (better) { + dropIdx = i; + dropValue = entry.value; + dropLen = entry.text.length; + } + } + if (dropIdx < 0) break; + kept.splice(dropIdx, 1); + total = textTotal(kept); + } + kept.sort((a, b) => a.idx - b.idx); + return { kept, droppedCount: entries.length - kept.length }; +} + +export function selectRepresentativeFeedbackTraces( + 
traces: readonly TraceRow[], + feedbackText: string, + limit: number, +): TraceRow[] { + if (limit <= 0 || traces.length === 0) return []; + const firstId = traces[0]?.id; + const lastId = traces[traces.length - 1]?.id; + return buildFeedbackTraceEntries(traces, feedbackText) + .sort((a, b) => { + const bp = protectedRank(b.trace, firstId, lastId); + const ap = protectedRank(a.trace, firstId, lastId); + if (bp !== ap) return bp - ap; + if (b.value !== a.value) return b.value - a.value; + return a.idx - b.idx; + }) + .slice(0, limit) + .sort((a, b) => a.idx - b.idx) + .map((entry) => entry.trace); +} + +export function formatFeedbackTraceTurn(turnNumber: number, trace: TraceRow): string { + const userText = truncate(trace.userText, 280); + const agentText = truncate(trace.agentText, 360); + const toolSummary = summarizeTools(trace.toolCalls ?? []); + const errorSummary = summarizeErrors(trace); + const lines = [ + `Turn ${turnNumber}:`, + `User: ${userText}`, + `Agent: ${agentText}`, + toolSummary ? `Tools: ${toolSummary}` : null, + errorSummary ? `Errors: ${errorSummary}` : null, + ].filter((line): line is string => typeof line === "string"); + return lines.join("\n"); +} + +function buildFeedbackTraceEntries( + traces: readonly TraceRow[], + feedbackText: string, +): FeedbackTraceSelectionEntry[] { + const feedbackKeywords = extractFeedbackKeywords(feedbackText); + const firstId = traces[0]?.id; + const lastId = traces[traces.length - 1]?.id; + return traces.map((trace, idx) => { + const text = formatFeedbackTraceTurn(idx + 1, trace); + const value = traceInformationValue(trace, feedbackKeywords, firstId, lastId); + return { trace, idx, text, value }; + }); +} + +function textTotal(entries: readonly FeedbackTraceSelectionEntry[]): number { + return entries.reduce((sum, item) => sum + item.text.length, 0) + Math.max(0, entries.length - 1) * 2; +} + +/** Capture-time reflection label: pivotal steps must survive context compression. 
*/ +function isPivotalTrace(trace: TraceRow): boolean { + return trace.reflection?.trim() === "PIVOTAL"; +} + +function isProtectedTrace( + trace: TraceRow, + firstId: string | undefined, + lastId: string | undefined, +): boolean { + return trace.id === firstId || trace.id === lastId || isPivotalTrace(trace); +} + +function protectedRank( + trace: TraceRow, + firstId: string | undefined, + lastId: string | undefined, +): number { + if (isPivotalTrace(trace)) return 3; + if (trace.id === firstId || trace.id === lastId) return 2; + return 0; +} + +function traceInformationValue( + trace: TraceRow, + feedbackKeywords: readonly string[], + firstId: string | undefined, + lastId: string | undefined, +): number { + let score = 0; + if (trace.id === firstId) score += 80; + if (trace.id === lastId) score += 80; + if (isPivotalTrace(trace)) score += 70; + if ((trace.toolCalls?.length ?? 0) > 0) score += 18; + if ((trace.errorSignatures?.length ?? 0) > 0) score += 42; + if (trace.toolCalls?.some((tool) => typeof tool.errorCode === "string" && tool.errorCode.trim().length > 0)) { + score += 45; + } + const corpus = `${trace.userText}\n${trace.agentText}`.toLowerCase(); + if (feedbackKeywords.some((kw) => corpus.includes(kw))) score += 30; + if (/error|failed|failure|timeout|exception|错误|失败|超时/i.test(corpus)) score += 28; + if (trace.userText.trim().length > 0) score += 5; + if (trace.agentText.trim().length > 0) score += 5; + return score; +} + +function extractFeedbackKeywords(text: string): string[] { + const normalized = text.toLowerCase(); + const tokens = normalized.split(/[^a-z0-9\u4e00-\u9fff]+/u) + .map((t) => t.trim()) + .filter(Boolean) + .filter((t) => (/[a-z0-9]/.test(t) ? 
t.length >= 4 : t.length >= 2)); + return dedupeLines(tokens).slice(0, 32); +} + +function summarizeTools(toolCalls: TraceRow["toolCalls"]): string { + if (!Array.isArray(toolCalls) || toolCalls.length === 0) return ""; + const pieces = toolCalls.slice(0, 4).map((tool) => { + const name = tool.name || "unknown"; + const code = typeof tool.errorCode === "string" && tool.errorCode.trim() ? `#${tool.errorCode}` : ""; + const output = typeof tool.output === "string" ? truncate(tool.output.replace(/\s+/g, " "), 80) : ""; + return [name, code, output].filter(Boolean).join(" "); + }); + return truncate(pieces.join(" | "), 280); +} + +function summarizeErrors(trace: TraceRow): string { + const sig = (trace.errorSignatures ?? []).slice(0, 3).map((s) => truncate(s, 80)); + const codes = (trace.toolCalls ?? []) + .map((tool) => tool.errorCode) + .filter((code): code is string => typeof code === "string" && code.trim().length > 0); + const merged = dedupeLines([...codes, ...sig]); + return truncate(merged.join(" | "), 260); +} + +function truncate(s: string, maxLen: number): string { + if (!s) return ""; + if (s.length <= maxLen) return s; + return s.slice(0, maxLen - 3) + "..."; +} + +function dedupeLines(lines: readonly string[]): string[] { + const out: string[] = []; + const seen = new Set(); + for (const line of lines) { + const s = line.trim(); + if (!s || seen.has(s)) continue; + seen.add(s); + out.push(s); + } + return out; +} diff --git a/apps/memos-local-plugin/core/llm/prompts/reward.ts b/apps/memos-local-plugin/core/llm/prompts/reward.ts index 59760f3ee..b280c4155 100644 --- a/apps/memos-local-plugin/core/llm/prompts/reward.ts +++ b/apps/memos-local-plugin/core/llm/prompts/reward.ts @@ -8,7 +8,7 @@ import type { PromptDef } from "./index.js"; * reflection-weighted backprop uses this value as the terminal V_T. * * Axes come straight from the V7 rubric table in §0.6: - * 1. goal_achievement — did the agent actually solve the stated task? + * 1. 
goal_achievement — did the agent complete EPISODE_MISSION? * 2. process_quality — was the path reasonable and efficient? * 3. user_satisfaction — does the user's own text read as pleased, neutral, or angry? * @@ -18,21 +18,27 @@ import type { PromptDef } from "./index.js"; */ export const REWARD_R_HUMAN_PROMPT: PromptDef = { id: "reward.r_human", - version: 4, + version: 6, description: "Score an episode's R_human from a multi-turn task summary + user feedback.", system: `You are a strict grader of AI-agent task execution. You receive: - TASK_SUMMARY — the FULL conversation arc for this task: - * USER_ASKS_AND_AGENT_REPLIES lists every user turn - paired with the agent's corresponding reply, in - chronological order. One "task" frequently spans - multiple user turns as the user refines / follows - up / pivots topics within the same session. - * MOST_RECENT_USER_ASK and MOST_RECENT_AGENT_REPLY - call out the final exchange explicitly — that is - usually the truest signal of whether the agent is - actually tracking where the user is now. + * EPISODE_MISSION — the canonical goal of this + episode, anchored at the time the task started + (or explicitly updated when the user redefined the + task). This is the authoritative definition of what + the agent was supposed to accomplish. + * USER_ASKS_AND_AGENT_REPLIES — every user turn + paired with the agent's reply, in order. Turns + after the initial task may be follow-ups, + corrections, verifier results, or reflections — + they do NOT redefine EPISODE_MISSION unless the + user explicitly introduces a completely new, + unrelated task. + * MOST_RECENT_USER_ASK / MOST_RECENT_AGENT_REPLY + — the final exchange. Useful for user_satisfaction + and process_quality context. - FEEDBACK — the user's own messages AFTER the task attempt finished. Format: [SOURCE/polarity @ISO-timestamp] SOURCE=USER means the user directly wrote this; @@ -51,24 +57,26 @@ You receive: Grade the agent on THREE INDEPENDENT AXES, each in [-1, 1]: -1. 
"goal_achievement" — did the agent address what the user ACTUALLY asked? - +1.0 every user ask was correctly addressed AND (if tools were used) +1. "goal_achievement" — did the agent complete EPISODE_MISSION? + Always evaluate against EPISODE_MISSION, not MOST_RECENT_USER_ASK. + +1.0 EPISODE_MISSION was fully addressed AND (if tools were used) EXECUTION_OUTCOME shows task_completed_by_tool=yes. - +0.3 the last ask was addressed well; earlier asks had minor gaps. - 0.0 unclear if the user's ask was met. + +0.3 EPISODE_MISSION substantially addressed; minor gaps only. + 0.0 unclear if EPISODE_MISSION was met. -0.3 agent verbally acknowledged the correct approach but did NOT - re-execute; or missed a significant portion of what was asked. + execute it; or missed a significant portion of EPISODE_MISSION. Use this when EXECUTION_OUTCOME shows task_completed_by_tool=no and the last agent reply is explanatory text only. -1.0 fundamentally wrong answer / caused damage / refused without reason. - CRITICAL RULE — do NOT anchor on the first user turn. A user who - starts with "上海天气" and later pivots to "再查北京天气" is a user - whose goal has EVOLVED; if the agent answered Beijing on the final - turn when asked about Beijing, that is goal-achievement = POSITIVE, - not negative. Judge each user ask on its own merits, weighted - toward the most recent exchange (which is where the user actually - is now). + MISSION ANCHOR RULE — goal_achievement measures completion of + EPISODE_MISSION only. Later turns that are reflections, verifier + results, error messages, or follow-up corrections are NOT new + missions; answering them well does NOT raise goal_achievement. + The only exception: if the user explicitly replaces the task with + an entirely new, unrelated objective (visible in + USER_ASKS_AND_AGENT_REPLIES), treat the new objective as the + effective mission from that point on. EXECUTION RULE — distinguish verbal acknowledgment from actual execution. 
If EXECUTION_OUTCOME.task_completed_by_tool is "no", the agent's last diff --git a/apps/memos-local-plugin/core/pipeline/memory-core.ts b/apps/memos-local-plugin/core/pipeline/memory-core.ts index bcb676b1f..092752b20 100644 --- a/apps/memos-local-plugin/core/pipeline/memory-core.ts +++ b/apps/memos-local-plugin/core/pipeline/memory-core.ts @@ -5506,11 +5506,11 @@ export function deriveTurnCount( // The old code tripped every rTask < 0 (even -0.05) into the "反例" // bucket — a single LLM misread on a multi-topic episode was enough to // flag a normal task as a negative example. Tightening the floor to -// −0.5 means only genuinely bad outcomes (clear user correction, wrong +// −0.15 means clearly negative outcomes (user correction, wrong // action, damage) surface as 反例; mild negative judgments fall into // the softer "below threshold" bucket and the user doesn't get // shouted at. -export const R_NEGATIVE_FLOOR = -0.5; +export const R_NEGATIVE_FLOOR = -0.15; export const R_BELOW_THRESHOLD = 0.15; // aligned with `algorithm.skill.minGain` export function deriveSkillStatus( diff --git a/apps/memos-local-plugin/core/reward/task-summary.ts b/apps/memos-local-plugin/core/reward/task-summary.ts index db54ec7c0..b9f6937c4 100644 --- a/apps/memos-local-plugin/core/reward/task-summary.ts +++ b/apps/memos-local-plugin/core/reward/task-summary.ts @@ -3,22 +3,16 @@ * that the R_human scorer feeds to the LLM. * * V7 §0.6 scoring anchor: when a single episode spans multiple user - * turns (the `merge_follow_ups` mode, default), the goal is NOT just - * the first user message. Each follow-up is its own sub-goal the - * agent has to address; the scorer needs the full chain to judge - * whether the agent tracked the user's evolving intent. The previous - * build pinned `USER_QUERY` to only the first user turn, which caused - * multi-topic episodes (e.g. 
上海天气 → 穿衣 → 带伞 → 北京天气) to be - * marked as R<0 just because the final assistant reply did not match - * the *opening* query — a false negative that kept real tasks out of - * the L2/Skill pipeline. + * turns (the `merge_follow_ups` mode, default), the scorer needs both + * a stable mission anchor and the chronological turn chain. The mission + * tells `goal_achievement` what task is being graded; the turn chain + * tells process/user-satisfaction scoring whether later turns were + * corrections, verifier output, reflections, or a genuine task reset. * - * So we now emit a chronological USER_ASKS / AGENT_REPLIES block - * covering every user turn paired with the agent's corresponding reply - * (plus a per-step action summary for tool-call context). The scorer's - * rubric is updated in parallel to judge "did the agent address every - * user ask, especially the most recent one?" — see - * `core/llm/prompts/reward.ts`. + * So we emit EPISODE_MISSION plus a chronological USER_ASKS / + * AGENT_REPLIES block covering every user turn paired with the agent's + * corresponding reply (plus a per-step action summary for tool-call + * context). See `core/llm/prompts/reward.ts` for the matching rubric. * * The result is clipped to `cfg.summaryMaxChars` with a head+tail * strategy — identical to `capture/normalizer.ts` — so the most recent @@ -74,10 +68,26 @@ export function buildTaskSummary(input: SummaryInput): TaskSummary { const agentActions = traces.map(traceOneLiner).filter(Boolean).join("\n"); const hostContext = formatHostAgentContext(episode, input.evaluator); + // EPISODE_MISSION: the canonical goal of this episode. + // Prefer an explicitly updated canonicalGoal (set when the user + // genuinely re-defines the task), then initialUserText recorded at + // episode start, then the first user turn as last-resort fallback. + // This is the stable anchor used by the reward scorer to evaluate + // goal_achievement — independent of what the most recent user turn says. 
+ const missionText = + (typeof episode.meta?.canonicalGoal === "string" && episode.meta.canonicalGoal.trim().length > 0) + ? episode.meta.canonicalGoal.trim() + : (typeof episode.meta?.initialUserText === "string" && episode.meta.initialUserText.trim().length > 0) + ? episode.meta.initialUserText.trim() + : userQuery; + const body = [ hostContext ? `HOST_AGENT_CONTEXT:` : "", hostContext, hostContext ? `` : "", + `EPISODE_MISSION:`, + oneLine(missionText, 800), + ``, `USER_ASKS_AND_AGENT_REPLIES (${pairs.length}, in order):`, pairsText, ``, diff --git a/apps/memos-local-plugin/core/skill/repair-candidate.ts b/apps/memos-local-plugin/core/skill/repair-candidate.ts index f888c590d..014862124 100644 --- a/apps/memos-local-plugin/core/skill/repair-candidate.ts +++ b/apps/memos-local-plugin/core/skill/repair-candidate.ts @@ -27,8 +27,10 @@ import { ids } from "../id.js"; import type { Embedder } from "../embedding/types.js"; import type { Logger } from "../logger/types.js"; import type { Repos } from "../storage/repos/index.js"; -import type { PolicyRow, SkillId, SkillRow, TraceId } from "../types.js"; +import type { EpisodeId, PolicyRow, SkillId, SkillRow, TraceId, TraceRow } from "../types.js"; +import { selectRepresentativeFeedbackTraces } from "../experience/trace-selection.js"; import { deriveNameFromText, uniquifySkillName } from "./name.js"; +import type { SkillConfig } from "./types.js"; /** * Q3: born at the retrieval floor — visible enough to be tried, no head start. 
@@ -40,7 +42,8 @@ import { deriveNameFromText, uniquifySkillName } from "./name.js"; export const REPAIR_CANDIDATE_INITIAL_ETA = 0.1; export interface MintRepairCandidateDeps { - repos: Pick; + repos: Pick; + config: Pick; embedder: Embedder | null; now?: () => number; log?: Logger; @@ -94,6 +97,7 @@ export function mintRepairCandidate( const name = uniquifySkillName(baseName, existingNames); const id = ids.skill() as SkillId; const invocationGuide = renderRepairGuide(policy, fix); + const evidenceAnchors = selectRepairEvidenceAnchors(policy, deps); const row: SkillRow = { id, @@ -111,7 +115,7 @@ export function mintRepairCandidate( trialsPassed: 0, sourcePolicyIds: [policy.id], sourceWorldModelIds: [], - evidenceAnchors: (policy.sourceTraceIds ?? []) as TraceId[], + evidenceAnchors, vec: null, createdAt: now, updatedAt: now, @@ -146,6 +150,64 @@ export function mintRepairCandidate( return id; } +export function selectRepairEvidenceAnchors( + policy: PolicyRow, + deps: Pick, +): TraceId[] { + const limit = Math.max(0, deps.config.evidenceLimit); + if (limit === 0) return []; + const traces = loadPolicyEpisodeTraces(policy, deps.repos); + const feedbackText = [ + policy.title, + policy.trigger, + policy.procedure, + policy.verification, + ...(policy.decisionGuidance?.preference ?? []), + ...(policy.decisionGuidance?.antiPattern ?? []), + ].join("\n"); + const seen = new Set(); + const out: TraceId[] = []; + for (const trace of selectRepresentativeFeedbackTraces(traces, feedbackText, limit)) { + if (seen.has(trace.id)) continue; + seen.add(trace.id); + out.push(trace.id as TraceId); + } + return out; +} + +function loadPolicyEpisodeTraces( + policy: PolicyRow, + repos: Pick, +): TraceRow[] { + const out: TraceRow[] = []; + const seen = new Set(); + for (const episodeId of policy.sourceEpisodeIds ?? 
[]) { + for (const trace of loadEpisodeTraces(episodeId as EpisodeId, repos)) { + if (seen.has(trace.id)) continue; + seen.add(trace.id); + out.push(trace); + } + } + return out; +} + +function loadEpisodeTraces( + episodeId: EpisodeId, + repos: Pick, +): TraceRow[] { + const episode = repos.episodes.getById(episodeId); + const canonicalIds = episode?.traceIds ?? []; + if (canonicalIds.length > 0) { + const rows: TraceRow[] = []; + for (const id of canonicalIds) { + const row = repos.traces.getById(id as TraceId); + if (row) rows.push(row); + } + if (rows.length > 0) return rows; + } + return repos.traces.list({ episodeId, limit: 500 }); +} + function stripPrefix(title: string): string { return title.replace(/^(avoid|repair|prefer|success)\s*:\s*/i, "").trim(); } diff --git a/apps/memos-local-plugin/core/skill/subscriber.ts b/apps/memos-local-plugin/core/skill/subscriber.ts index 967ed6dc9..9853f9631 100644 --- a/apps/memos-local-plugin/core/skill/subscriber.ts +++ b/apps/memos-local-plugin/core/skill/subscriber.ts @@ -197,10 +197,10 @@ export function attachSkillSubscriber( ? "verifier not a full pass" : "no verifier signal on this episode" : outcome === "pass" - ? "rTask >= 0.5" + ? `rTask >= ${passTh}` : outcome === "fail" - ? "rTask <= -0.5" - : "-0.5 < rTask < 0.5", + ? 
`rTask <= ${failTh}` + : `${failTh} < rTask < ${passTh}`, }; const changed = deps.repos.skillTrials.resolve( trial.id, diff --git a/apps/memos-local-plugin/tests/unit/episode/outcome.test.ts b/apps/memos-local-plugin/tests/unit/episode/outcome.test.ts index 4096a0997..e95e68e69 100644 --- a/apps/memos-local-plugin/tests/unit/episode/outcome.test.ts +++ b/apps/memos-local-plugin/tests/unit/episode/outcome.test.ts @@ -37,10 +37,10 @@ describe("computeEpisodeOutcome", () => { expect(computeEpisodeOutcome(0.25, null, cfg)).toBe("unknown"); }); - it("rTask <= -0.5 = failure (when verifier doesn't veto)", () => { - expect(computeEpisodeOutcome(-0.5, null, cfg)).toBe("failure"); + it("rTask <= -0.15 = failure (when verifier doesn't veto)", () => { + expect(computeEpisodeOutcome(-0.15, null, cfg)).toBe("failure"); expect(computeEpisodeOutcome(-1, null, cfg)).toBe("failure"); - expect(computeEpisodeOutcome(-0.25, null, cfg)).toBe("unknown"); + expect(computeEpisodeOutcome(-0.14, null, cfg)).toBe("unknown"); }); it("neutral rTask + verifier=true => verifier fallback yields success", () => { diff --git a/apps/memos-local-plugin/tests/unit/experience/feedback-builder.test.ts b/apps/memos-local-plugin/tests/unit/experience/feedback-builder.test.ts index 90b9f884e..27e62150c 100644 --- a/apps/memos-local-plugin/tests/unit/experience/feedback-builder.test.ts +++ b/apps/memos-local-plugin/tests/unit/experience/feedback-builder.test.ts @@ -157,6 +157,51 @@ describe("feedback experience builder", () => { expect(row?.skillEligible).toBe(false); }); + it("does not mark verifier reflection prompts as skill-eligible even on full pass", async () => { + const result = await runFeedbackExperience( + { + feedback: feedback({ + id: "fb_full_reflect" as FeedbackRow["id"], + polarity: "positive", + rationale: + "Verifier feedback for the previous attempt. Verifier reward: 1.0. passed: 4, total: 4. 
Please briefly reflect on what you would keep and what you would improve next time.", + raw: { + source: "evoagentbench_gateway_manual_feedback", + verifier: { reward: 1, passed: 4, total: 4 }, + }, + }), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: 1 }, + trace, + }, + { repos: handle.repos, embedder: fakeEmbedder(), namespace, now: () => NOW }, + ); + + const row = handle.repos.policies.getById(result.policyId!)!; + expect(row.experienceType).toBe("success_pattern"); + expect(row.evidencePolarity).toBe("positive"); + expect(row.skillEligible).toBe(false); + }); + + it("does not mark HEARTBEAT_OK verifier responses as skill-eligible", async () => { + const result = await runFeedbackExperience( + { + feedback: feedback({ + id: "fb_heartbeat" as FeedbackRow["id"], + polarity: "positive", + rationale: "Verifier feedback: success. HEARTBEAT_OK", + raw: { source: "verifier", verifier: { reward: 1, passed: 1, total: 1 } }, + }), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: 1 }, + trace, + }, + { repos: handle.repos, embedder: fakeEmbedder(), namespace, now: () => NOW }, + ); + + const row = handle.repos.policies.getById(result.policyId!)!; + expect(row.experienceType).toBe("success_pattern"); + expect(row.skillEligible).toBe(false); + }); + it("records the suggested fix as a preference on a constructive negative (avoid + do-Y in one record)", async () => { const result = await runFeedbackExperience( { @@ -244,7 +289,7 @@ describe("feedback experience builder", () => { expect(sibling?.sourceFeedbackIds).toContain("fb_avoid"); }); - it("does not merge into active policy in same merge family", async () => { + it("does not treat verifier similarity as an active-hit candidate fork", async () => { const base = await runFeedbackExperience( { feedback: feedback({ @@ -281,14 +326,15 @@ describe("feedback experience builder", () => { ); expect(follow.created).toBe(true); + expect(follow.policyId).not.toBe(base.policyId); 
const all = handle.repos.policies.list({ limit: 20 }); const activeRows = all.filter((p) => p.status === "active"); const candidateRows = all.filter((p) => p.status === "candidate"); - expect(activeRows).toHaveLength(1); - expect(candidateRows.length).toBeGreaterThanOrEqual(1); + expect(activeRows).toHaveLength(2); + expect(candidateRows).toHaveLength(0); }); - it("re-derives mergeFamily after polarity changes during merge", async () => { + it("merges compatible non-verifier feedback with the existing similarity threshold", async () => { handle.repos.policies.insert({ id: "po_merge_family" as never, ownerAgentKind: "hermes", @@ -296,21 +342,21 @@ describe("feedback experience builder", () => { ownerWorkspaceId: "workspace", title: "SEC 13F extraction rule", trigger: "when parsing 13F", - procedure: "prefer validated issuer field", + procedure: "avoid wrong issuer field", verification: "issuer matches filing", boundary: "", support: 1, gain: 0.2, status: "candidate", - experienceType: "success_pattern", - evidencePolarity: "positive", + experienceType: "failure_avoidance", + evidencePolarity: "negative", mergeFamily: null, sourceEpisodeIds: ["ep_feedback" as EpisodeId], sourceFeedbackIds: [], sourceTraceIds: [trace.id], inducedBy: "feedback.experience.v1", - decisionGuidance: { preference: ["prefer validated issuer field"], antiPattern: [] }, - skillEligible: true, + decisionGuidance: { preference: [], antiPattern: ["avoid wrong issuer field"] }, + skillEligible: false, createdAt: NOW, updatedAt: NOW, vec: vec([1, 0, 0]), @@ -320,8 +366,8 @@ describe("feedback experience builder", () => { { feedback: feedback({ id: "fb_merge_family" as FeedbackRow["id"], - rationale: "Verifier failed: avoid wrong SEC 13F issuer field.", - raw: { source: "verifier", score: -1 }, + rationale: "Avoid wrong SEC 13F issuer field.", + raw: {}, }), episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: -1 }, trace, @@ -331,8 +377,103 @@ describe("feedback experience 
builder", () => { expect(merged.policyId).toBe("po_merge_family"); const row = handle.repos.policies.getById("po_merge_family" as never); - expect(row?.evidencePolarity).toBe("mixed"); - expect(row?.mergeFamily).toBe("failure_corrective"); + expect(row?.support).toBe(2); + expect(row?.evidencePolarity).toBe("negative"); + expect(row?.mergeFamily).toBe("failure_avoidance"); + expect(row?.sourceFeedbackIds).toContain("fb_merge_family"); + }); + + it("does not split compatible manual feedback just because policy source kind is not persisted", async () => { + handle.repos.policies.insert({ + id: "po_manual_merge" as never, + ownerAgentKind: "hermes", + ownerProfileId: "default", + ownerWorkspaceId: "workspace", + title: "SEC 13F extraction rule", + trigger: "when parsing 13F", + procedure: "avoid wrong issuer field", + verification: "issuer matches filing", + boundary: "", + support: 1, + gain: 0.2, + status: "candidate", + experienceType: "failure_avoidance", + evidencePolarity: "negative", + mergeFamily: null, + sourceEpisodeIds: ["ep_feedback" as EpisodeId], + sourceFeedbackIds: [], + sourceTraceIds: [trace.id], + inducedBy: "feedback.experience.v1", + decisionGuidance: { preference: [], antiPattern: ["avoid wrong issuer field"] }, + skillEligible: false, + createdAt: NOW, + updatedAt: NOW, + vec: vec([1, 0, 0]), + }); + + const merged = await runFeedbackExperience( + { + feedback: feedback({ + id: "fb_manual_merge" as FeedbackRow["id"], + rationale: "Avoid wrong SEC 13F issuer field.", + raw: { source: "manual" }, + }), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: -1 }, + trace, + }, + { repos: handle.repos, embedder: fakeEmbedder(), namespace, now: () => NOW + 4 }, + ); + + expect(merged.policyId).toBe("po_manual_merge"); + const row = handle.repos.policies.getById("po_manual_merge" as never); + expect(row?.support).toBe(2); + expect(row?.sourceFeedbackIds).toContain("fb_manual_merge"); + }); + + it("uses the stricter threshold when 
task or issue key parts are missing", async () => { + handle.repos.policies.insert({ + id: "po_strict_missing_key" as never, + ownerAgentKind: "hermes", + ownerProfileId: "default", + ownerWorkspaceId: "workspace", + title: "Avoid", + trigger: "Avoid", + procedure: "Avoid", + verification: "", + boundary: "", + support: 1, + gain: 0.2, + status: "candidate", + experienceType: "failure_avoidance", + evidencePolarity: "negative", + mergeFamily: null, + sourceEpisodeIds: ["ep_feedback" as EpisodeId], + sourceFeedbackIds: [], + sourceTraceIds: [trace.id], + inducedBy: "feedback.experience.v1", + decisionGuidance: { preference: [], antiPattern: ["avoid"] }, + skillEligible: false, + createdAt: NOW, + updatedAt: NOW, + vec: vec([1, 0]), + }); + + const result = await runFeedbackExperience( + { + feedback: feedback({ + id: "fb_strict_missing_key" as FeedbackRow["id"], + rationale: "Avoid wrong.", + raw: {}, + }), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: -1 }, + trace, + }, + { repos: handle.repos, embedder: fakeEmbedder(vec([0.8, 0.6])), namespace, now: () => NOW + 5 }, + ); + + expect(result.policyId).not.toBe("po_strict_missing_key"); + const original = handle.repos.policies.getById("po_strict_missing_key" as never); + expect(original?.support).toBe(1); }); it("passes compressed all-trace context to refiner in trace order", async () => { diff --git a/apps/memos-local-plugin/tests/unit/reward/human-scorer.test.ts b/apps/memos-local-plugin/tests/unit/reward/human-scorer.test.ts index aea8af200..8d3b8c82c 100644 --- a/apps/memos-local-plugin/tests/unit/reward/human-scorer.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/human-scorer.test.ts @@ -64,7 +64,7 @@ describe("reward/human-scorer", () => { it("LLM mode: happy path, uses the LLM and reports llm source", async () => { const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v4": { + "reward.reward.r_human.v7": { goal_achievement: 0.9, process_quality: 0.5, 
user_satisfaction: 0.8, @@ -89,7 +89,7 @@ describe("reward/human-scorer", () => { it("LLM mode: clamps axes to [-1, 1]", async () => { const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v4": { + "reward.reward.r_human.v7": { goal_achievement: 5, process_quality: -3, user_satisfaction: 2, @@ -112,7 +112,7 @@ describe("reward/human-scorer", () => { it("LLM mode: rejects non-numeric axes (via validate) → falls back to heuristic", async () => { const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v4": { goal_achievement: "yes", process_quality: 0, user_satisfaction: 0 }, + "reward.reward.r_human.v7": { goal_achievement: "yes", process_quality: 0, user_satisfaction: 0 }, }, }); const out = await scoreHuman( @@ -153,7 +153,7 @@ describe("reward/human-scorer", () => { let capturedUserContent = ""; const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v4": (input: unknown) => { + "reward.reward.r_human.v7": (input: unknown) => { const msgs = input as Array<{ role: string; content: string }>; capturedUserContent = msgs.find((m) => m.role === "user")?.content ?? 
""; return { diff --git a/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts b/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts index f0c35e857..253d8b9ac 100644 --- a/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts @@ -202,7 +202,7 @@ describe("reward/integration", () => { const llm = fakeLlm({ completeJson: { - "reward.reward.r_human.v4": { + "reward.reward.r_human.v7": { goal_achievement: 0.9, process_quality: 0.7, user_satisfaction: 0.8, @@ -219,7 +219,7 @@ describe("reward/integration", () => { llm, bus, cfg: cfg(), - outcomeThresholds: { successThreshold: 0.5, failureThreshold: -0.5 }, + outcomeThresholds: { successThreshold: 0.5, failureThreshold: -0.15 }, now: () => NOW, }); diff --git a/apps/memos-local-plugin/tests/unit/server/http.test.ts b/apps/memos-local-plugin/tests/unit/server/http.test.ts index 3a8e70780..0aeb1daf9 100644 --- a/apps/memos-local-plugin/tests/unit/server/http.test.ts +++ b/apps/memos-local-plugin/tests/unit/server/http.test.ts @@ -353,7 +353,7 @@ describe("HTTP server — REST routes", () => { }); it("GET /api/v1/episodes?status=failed filters by derived status", async () => { - // Mix of failed (rTask <= -0.5) and completed/active rows so the + // Mix of failed (rTask <= negative floor) and completed/active rows so the // server has to actually filter — the viewer used to do this in // the browser on top of one paginated page, which broke pagination. 
(core.listEpisodeRows as any).mockResolvedValueOnce([ diff --git a/apps/memos-local-plugin/tests/unit/skill/_helpers.ts b/apps/memos-local-plugin/tests/unit/skill/_helpers.ts index 621e037b5..21ce70b31 100644 --- a/apps/memos-local-plugin/tests/unit/skill/_helpers.ts +++ b/apps/memos-local-plugin/tests/unit/skill/_helpers.ts @@ -45,7 +45,7 @@ export function makeSkillConfig(partial: Partial = {}): SkillConfig archiveEta: 0.1, minEtaForRetrieval: 0.1, outcomeRTaskSuccessThreshold: 0.5, - outcomeRTaskFailureThreshold: -0.5, + outcomeRTaskFailureThreshold: -0.15, failureEpisodeScorePenalty: 0, failureEpisodeMaxRatio: 0.4, ...partial, diff --git a/apps/memos-local-plugin/tests/unit/skill/repair-candidate.test.ts b/apps/memos-local-plugin/tests/unit/skill/repair-candidate.test.ts index e28f65d67..78480a6a1 100644 --- a/apps/memos-local-plugin/tests/unit/skill/repair-candidate.test.ts +++ b/apps/memos-local-plugin/tests/unit/skill/repair-candidate.test.ts @@ -20,6 +20,7 @@ const namespace: RuntimeNamespace = { profileId: "default", workspaceId: "workspace", }; +const repairConfig = { evidenceLimit: 2 }; function feedback(partial: Partial = {}): FeedbackRow { return { @@ -53,6 +54,15 @@ async function makeConstructiveNegative(handle: TmpDbHandle, trace: TraceRow): P return result.policyId!; } +function repairDeps(handle: TmpDbHandle) { + return { + repos: handle.repos, + config: repairConfig, + embedder: null, + now: () => NOW, + }; +} + describe("repair candidate minting", () => { let handle: TmpDbHandle; let trace: TraceRow; @@ -78,11 +88,7 @@ describe("repair candidate minting", () => { const policy = handle.repos.policies.getById(policyId)!; expect(isRepairCandidatePolicy(policy)).toBe(true); - const skillId = mintRepairCandidate(policy, { - repos: handle.repos, - embedder: null, - now: () => NOW, - }); + const skillId = mintRepairCandidate(policy, repairDeps(handle)); expect(skillId).toBeTruthy(); const skill = handle.repos.skills.getById(skillId!)!; @@ -91,17 +97,51 
@@ describe("repair candidate minting", () => { expect(skill.repairOrigin).toBe(true); expect(skill.strictTrial).toBe(true); // verifier origin → full-pass-only trials expect(skill.sourcePolicyIds).toEqual([policyId]); + expect(skill.evidenceAnchors.length).toBeLessThanOrEqual(repairConfig.evidenceLimit); expect(skill.trialsAttempted).toBe(0); expect(skill.invocationGuide.toLowerCase()).toContain("fft"); }); + it("selects bounded representative evidence instead of copying policy sourceTraceIds", async () => { + const traceIds: TraceRow["id"][] = [trace.id]; + for (let i = 1; i <= 4; i++) { + const extra = seedTrace(handle, { + id: `tr_feedback_${i}`, + episodeId: "ep_feedback", + sessionId: "se_feedback", + userText: i === 3 ? "Timeout error from quadratic bitset approach" : `padding turn ${i}`, + agentText: i === 3 ? "PIVOTAL repair should use FFT autocorrelation" : `padding answer ${i}`, + reflection: i === 3 ? "PIVOTAL" : null, + vec: vec([1, 0, 0]), + }); + traceIds.push(extra.id); + } + + const result = await runFeedbackExperience( + { + feedback: feedback({ id: "fb_many" as FeedbackRow["id"] }), + episode: { id: "ep_feedback" as EpisodeId, traceIds, rTask: -0.51 }, + trace, + }, + { repos: handle.repos, embedder: null, namespace, now: () => NOW }, + ); + const policy = handle.repos.policies.getById(result.policyId!)!; + expect(policy.sourceTraceIds).toEqual(traceIds); + + const skillId = mintRepairCandidate(policy, repairDeps(handle)); + const skill = handle.repos.skills.getById(skillId!)!; + expect(skill.evidenceAnchors.length).toBeLessThanOrEqual(repairConfig.evidenceLimit); + expect(skill.evidenceAnchors).not.toEqual(policy.sourceTraceIds); + expect(skill.evidenceAnchors).toContain("tr_feedback_3" as TraceRow["id"]); + }); + it("dedups: a second mint for the same policy returns null (rebuild path owns it)", async () => { const policyId = await makeConstructiveNegative(handle, trace); const policy = handle.repos.policies.getById(policyId)!; - const first = 
mintRepairCandidate(policy, { repos: handle.repos, embedder: null, now: () => NOW }); + const first = mintRepairCandidate(policy, repairDeps(handle)); expect(first).toBeTruthy(); - const second = mintRepairCandidate(policy, { repos: handle.repos, embedder: null, now: () => NOW }); + const second = mintRepairCandidate(policy, repairDeps(handle)); expect(second).toBeNull(); expect(handle.repos.skills.list({ limit: 50 }).length).toBe(1); }); @@ -120,7 +160,7 @@ describe("repair candidate minting", () => { const policy = handle.repos.policies.getById(result.policyId!)!; expect(isRepairCandidatePolicy(policy)).toBe(true); - const skillId = mintRepairCandidate(policy, { repos: handle.repos, embedder: null, now: () => NOW }); + const skillId = mintRepairCandidate(policy, repairDeps(handle)); expect(skillId).toBeTruthy(); const skill = handle.repos.skills.getById(skillId!)!; expect(skill.name).toMatch(/^[a-z0-9_]+$/); @@ -144,6 +184,6 @@ describe("repair candidate minting", () => { ); const policy = handle.repos.policies.getById(result.policyId!)!; expect(isRepairCandidatePolicy(policy)).toBe(false); - expect(mintRepairCandidate(policy, { repos: handle.repos, embedder: null, now: () => NOW })).toBeNull(); + expect(mintRepairCandidate(policy, repairDeps(handle))).toBeNull(); }); }); diff --git a/apps/memos-local-plugin/viewer/src/stores/i18n.ts b/apps/memos-local-plugin/viewer/src/stores/i18n.ts index 6766a4bb1..e4f73773b 100644 --- a/apps/memos-local-plugin/viewer/src/stores/i18n.ts +++ b/apps/memos-local-plugin/viewer/src/stores/i18n.ts @@ -555,7 +555,7 @@ const en = { "tasks.skill.generated": "Skill generated", "tasks.skill.upgraded": "Skill upgraded", "tasks.skill.not_generated": "Below induction threshold", - "tasks.skill.skipped": "Scored as negative example (R ≤ -0.5)", + "tasks.skill.skipped": "Scored as negative example (R ≤ -0.15)", "tasks.skill.openSkill": "Open skill", "tasks.skillReason.queued.inProgress": "Task still in progress; skill pipeline has not started 
yet.", @@ -568,7 +568,7 @@ const en = { "tasks.skillReason.skipped": "Low task score (R={rTask}), recorded as a counterexample; the system will try to derive avoidance guidance from this episode's traces and will not generate an invokable skill.", "tasks.skillReason.not_generated.belowThreshold": - "Task score R={rTask} is below the induction threshold (≥ {threshold}) but above the negative-example floor (≤ -0.50); this task has weak or negative signal, so no L2 experience will be generated yet. Similar future tasks can accumulate into reusable experience if they score higher.", + "Task score R={rTask} is below the induction threshold (≥ {threshold}) but above the negative-example floor (≤ -0.15); this task has weak or negative signal, so no L2 experience will be generated yet. Similar future tasks can accumulate into reusable experience if they score higher.", "tasks.skillReason.not_generated.noPolicy": "No L2 experience is linked to this task yet — induction may still be processing asynchronously. 
Refresh in a moment to see the latest status.", "tasks.skillReason.generated": @@ -1412,7 +1412,7 @@ const zh: Record = { "tasks.skill.generated": "已生成技能", "tasks.skill.upgraded": "已升级技能", "tasks.skill.not_generated": "未达沉淀阈值", - "tasks.skill.skipped": "本任务评为反例 (R ≤ -0.5)", + "tasks.skill.skipped": "本任务评为反例 (R ≤ -0.15)", "tasks.skill.openSkill": "打开技能", "tasks.skillReason.queued.inProgress": "任务仍在进行中,技能流水线尚未启动。", @@ -1425,7 +1425,7 @@ const zh: Record = { "tasks.skillReason.skipped": "任务评分较低 (R={rTask}),记为反例;系统会尝试从本次任务轨迹归纳规避建议,不会生成可调用技能。", "tasks.skillReason.not_generated.belowThreshold": - "任务评分 R={rTask} 低于沉淀阈值 (≥ {threshold}),但未达到反例阈值 (≤ -0.50);本次任务信号偏弱或带有负向反馈,暂不会生成 L2 经验。后续相似任务若获得更高评分,会继续累积为可复用经验。", + "任务评分 R={rTask} 低于沉淀阈值 (≥ {threshold}),但未达到反例阈值 (≤ -0.15);本次任务信号偏弱或带有负向反馈,暂不会生成 L2 经验。后续相似任务若获得更高评分,会继续累积为可复用经验。", "tasks.skillReason.not_generated.noPolicy": "该任务暂未关联到 L2 经验——可能仍在异步归纳处理中。稍后刷新可查看最新状态。", "tasks.skillReason.generated": From 575421180eddd030c957e063f567a9d5a95a1304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Thu, 4 Jun 2026 20:41:58 +0800 Subject: [PATCH 5/5] fix: optimize retrieval pipeline to avoid erroneously dropping candidates --- .../core/config/defaults.ts | 9 +- apps/memos-local-plugin/core/config/schema.ts | 6 +- apps/memos-local-plugin/core/pipeline/deps.ts | 1 + .../core/retrieval/llm-filter.ts | 88 ++++++-- .../core/retrieval/retrieve.ts | 31 +-- .../core/retrieval/tier2-experience.ts | 30 ++- .../core/retrieval/types.ts | 4 +- .../tests/unit/retrieval/integration.test.ts | 10 +- .../tests/unit/retrieval/llm-filter.test.ts | 136 ++++++++++-- .../unit/retrieval/tier2-experience.test.ts | 197 ++++++++++++++++++ 10 files changed, 438 insertions(+), 74 deletions(-) create mode 100644 apps/memos-local-plugin/tests/unit/retrieval/tier2-experience.test.ts diff --git a/apps/memos-local-plugin/core/config/defaults.ts b/apps/memos-local-plugin/core/config/defaults.ts index 01ab88d8a..31853b0c5 
100644 --- a/apps/memos-local-plugin/core/config/defaults.ts +++ b/apps/memos-local-plugin/core/config/defaults.ts @@ -241,10 +241,11 @@ export const DEFAULT_CONFIG: ResolvedConfig = { skillInjectionMode: "summary", skillSummaryChars: 200, llmFilterEnabled: true, - // Tighter than the legacy default (5) so the LLM filter has a - // small budget; combined with the richer prompt (v3) this keeps - // packets concise without over-dropping. - llmFilterMaxKeep: 4, + // Successful LLM filtering can keep a wider set of genuinely + // relevant memories; no-LLM/failure paths stay conservative via + // llmFilterFallbackMaxKeep below. + llmFilterMaxKeep: 8, + llmFilterFallbackMaxKeep: 4, // Set to 2: skip the LLM precision pass when there's only one // candidate (no point ranking a single item). Anything with 2+ // candidates still goes through the filter to drop off-topic diff --git a/apps/memos-local-plugin/core/config/schema.ts b/apps/memos-local-plugin/core/config/schema.ts index c2bb8cb97..b44eb6142 100644 --- a/apps/memos-local-plugin/core/config/schema.ts +++ b/apps/memos-local-plugin/core/config/schema.ts @@ -464,8 +464,10 @@ const AlgorithmSchema = Type.Object({ * small LLM call dramatically cuts down irrelevant injections. */ llmFilterEnabled: Bool(true), - /** Keep at most this many candidates after the LLM filter. */ - llmFilterMaxKeep: NumberInRange(5, 1, 30), + /** Keep at most this many candidates after a successful LLM filter. */ + llmFilterMaxKeep: NumberInRange(8, 1, 30), + /** Keep at most this many candidates when no valid LLM judgement is available. */ + llmFilterFallbackMaxKeep: NumberInRange(4, 0, 30), /** * Skip the filter when the ranked list has fewer than this many * items. 
Default 1 — even a single candidate gets a precision diff --git a/apps/memos-local-plugin/core/pipeline/deps.ts b/apps/memos-local-plugin/core/pipeline/deps.ts index 38308c0ca..12ae95479 100644 --- a/apps/memos-local-plugin/core/pipeline/deps.ts +++ b/apps/memos-local-plugin/core/pipeline/deps.ts @@ -160,6 +160,7 @@ export function extractAlgorithmConfig( decayHalfLifeDays: alg.reward.decayHalfLifeDays, llmFilterEnabled: alg.lightweightMemory.enabled ? true : alg.retrieval.llmFilterEnabled, llmFilterMaxKeep: alg.retrieval.llmFilterMaxKeep, + llmFilterFallbackMaxKeep: alg.retrieval.llmFilterFallbackMaxKeep, llmFilterMinCandidates: alg.lightweightMemory.enabled ? 1 : alg.retrieval.llmFilterMinCandidates, llmFilterCandidateBodyChars: alg.retrieval.llmFilterCandidateBodyChars, lightweightMemory: alg.lightweightMemory.enabled, diff --git a/apps/memos-local-plugin/core/retrieval/llm-filter.ts b/apps/memos-local-plugin/core/retrieval/llm-filter.ts index 10a73cd6d..fe26127a0 100644 --- a/apps/memos-local-plugin/core/retrieval/llm-filter.ts +++ b/apps/memos-local-plugin/core/retrieval/llm-filter.ts @@ -9,8 +9,9 @@ * * Design constraints: * - One LLM call per turn, bounded output (index list + `sufficient`). - * - Totally opt-in: if the LLM is null, or the config flag is off, - * or the candidate list is empty, we pass through unchanged. + * - Totally opt-in: if the LLM is null or the config flag is off, + * we apply a small mechanical fallback cap instead of calling out. + * Empty / below-threshold lists still pass through unchanged. * - On ANY failure (network, schema, timeout) we fall back to a * mechanical cutoff. A broken filter must never crash retrieval. 
* - Returns both kept and dropped candidates so callers can log @@ -51,6 +52,7 @@ export interface FilterDeps { RetrievalConfig, | "llmFilterEnabled" | "llmFilterMaxKeep" + | "llmFilterFallbackMaxKeep" | "llmFilterMinCandidates" | "llmFilterCandidateBodyChars" >; @@ -91,7 +93,7 @@ export async function llmFilterCandidates( ): Promise { const { ranked, query } = input; if (!deps.config.llmFilterEnabled) { - return passthrough(ranked, "disabled"); + return fallbackCap(ranked, deps, "disabled"); } // `llmFilterMinCandidates` is the *minimum* list length required to // RUN the filter. Default is 1, meaning even a single candidate gets @@ -108,7 +110,7 @@ export async function llmFilterCandidates( return passthrough(ranked, "empty_query"); } if (!deps.llm) { - return passthrough(ranked, "no_llm"); + return fallbackCap(ranked, deps, "no_llm"); } const bodyChars = @@ -220,7 +222,7 @@ function passthrough( * apply a relative-relevance cutoff so we don't dump the entire ranked * list into the prompt. Keeps: * 1. items whose score ≥ `topScore · 0.7` - * 2. capped at `llmFilterMaxKeep` so the prompt stays small. + * 2. capped at `llmFilterFallbackMaxKeep` so the prompt stays small. * * The ranker already applied an initial cutoff with the same family of * floors, but the LLM is expected to prune further (because the @@ -246,7 +248,7 @@ function safeCutoff( 0, ); const cutoff = topScore > 0 ? 
topScore * ratio : 0; - const keepCap = Math.max(0, deps.config.llmFilterMaxKeep); + const keepCap = fallbackMaxKeep(deps); if (keepCap === 0) { return { kept: [], @@ -276,6 +278,35 @@ function safeCutoff( }; } +function fallbackCap( + ranked: readonly RankedCandidate[], + deps: FilterDeps, + outcome: Extract, +): FilterResult { + const keepCap = fallbackMaxKeep(deps); + if (keepCap === 0) { + return { + kept: [], + dropped: [...ranked], + outcome, + sufficient: null, + }; + } + return { + kept: ranked.slice(0, keepCap), + dropped: ranked.slice(keepCap), + outcome, + sufficient: null, + }; +} + +function fallbackMaxKeep(deps: FilterDeps): number { + return Math.max( + 0, + deps.config.llmFilterFallbackMaxKeep ?? Math.min(deps.config.llmFilterMaxKeep, 4), + ); +} + function coerceBool(v: unknown): boolean | null { if (typeof v === "boolean") return v; if (v === "true" || v === "yes" || v === 1) return true; @@ -330,16 +361,7 @@ function describeCandidate(r: RankedCandidate, bodyChars: number): string { experienceType?: string; evidencePolarity?: string; }; - const parts = [ - ex.title, - ex.experienceType ? `type=${ex.experienceType}` : null, - ex.evidencePolarity ? `evidence=${ex.evidencePolarity}` : null, - ex.trigger, - ex.procedure, - ex.verification, - ].filter(Boolean).join(" "); - const body = squashBody(parts, bodyChars); - return `[EXPERIENCE] ${body}`; + return describeExperience(ex, bodyChars); } const ep = c as { summary?: string }; const body = squashBody(ep.summary ?? "", bodyChars); @@ -356,6 +378,40 @@ function describeCandidate(r: RankedCandidate, bodyChars: number): string { } } +function describeExperience( + ex: { + title?: string; + trigger?: string; + procedure?: string; + verification?: string; + experienceType?: string; + evidencePolarity?: string; + }, + bodyChars: number, +): string { + const headParts = [ + squashBody(ex.title ?? "(experience)", 80), + ex.experienceType || ex.evidencePolarity + ? 
`(${[ex.experienceType, ex.evidencePolarity].filter(Boolean).join(", ")})` + : null, + ].filter(Boolean); + const lines = [`[EXPERIENCE] ${headParts.join(" ")}`]; + const remaining = Math.max(0, bodyChars - lines[0]!.length); + const triggerBudget = Math.min(160, Math.floor(remaining * 0.38)); + const procedureBudget = Math.min(180, Math.floor(remaining * 0.42)); + const verificationBudget = Math.min(80, Math.floor(remaining * 0.2)); + if (ex.trigger?.trim() && triggerBudget > 0) { + lines.push(` Trigger: ${squashBody(ex.trigger, triggerBudget)}`); + } + if (ex.procedure?.trim() && procedureBudget > 0) { + lines.push(` Do: ${squashBody(ex.procedure, procedureBudget)}`); + } + if (ex.verification?.trim() && verificationBudget > 0) { + lines.push(` Check: ${squashBody(ex.verification, verificationBudget)}`); + } + return lines.join("\n"); +} + function squashBody(s: string, max: number): string { const cleaned = s.replace(/\s+/g, " ").trim(); if (cleaned.length <= max) return cleaned; diff --git a/apps/memos-local-plugin/core/retrieval/retrieve.ts b/apps/memos-local-plugin/core/retrieval/retrieve.ts index e3a25dc31..8fc194120 100644 --- a/apps/memos-local-plugin/core/retrieval/retrieve.ts +++ b/apps/memos-local-plugin/core/retrieval/retrieve.ts @@ -35,7 +35,7 @@ import { dedupeTraceEpisodeByEpisodeId } from "./dedupe-trace-episode.js"; import { toPacket, renderSnippetForDebug } from "./injector.js"; import { llmFilterCandidates } from "./llm-filter.js"; import { STANDALONE_MATH_FINAL_ANSWER_TASK_KIND } from "./math-task.js"; -import { rank, type RankedCandidate } from "./ranker.js"; +import { rank } from "./ranker.js"; import { runTier1 } from "./tier1-skill.js"; import { runTier2Experience } from "./tier2-experience.js"; import { runTier2 } from "./tier2-trace.js"; @@ -375,12 +375,7 @@ async function runAll( config: deps.config, now: deps.now(), }); - const mechanicalRanked = ctx.reason !== "decision_repair" && - requiresKeywordConfirmation(compiled.text) - ? 
ranked.ranked.filter((candidate) => - bypassesKeywordConfirmation(candidate) || hasKeywordChannel(candidate) - ) - : ranked.ranked; + const mechanicalRanked = ranked.ranked; const fuseLatencyMs = Date.now() - fuseStart; // ─── LLM relevance filter ────────────────────────────────────────── @@ -605,28 +600,6 @@ function emptyResult( }; } -function requiresKeywordConfirmation(text: string): boolean { - const tokens = text.match(/[A-Za-z0-9_:-]{12,}/g) ?? []; - return tokens.some((token) => { - const hasIdentifierShape = /[_:-]/.test(token) || /\d/.test(token); - const hasEnoughEntropy = /[A-Za-z]/.test(token) && token.length >= 16; - return hasIdentifierShape && hasEnoughEntropy; - }); -} - -function hasKeywordChannel(candidate: RankedCandidate): boolean { - return (candidate.candidate.channels ?? []).some((channel) => - channel.channel === "fts" || - channel.channel === "pattern" || - channel.channel === "structural" - ); -} - -function bypassesKeywordConfirmation(candidate: RankedCandidate): boolean { - const refKind = candidate.candidate.refKind; - return refKind === "skill" || refKind === "world-model"; -} - function approxTokens(s: string): number { if (!s) return 0; return Math.ceil(s.length / 4); diff --git a/apps/memos-local-plugin/core/retrieval/tier2-experience.ts b/apps/memos-local-plugin/core/retrieval/tier2-experience.ts index f6103de44..f09b4b68e 100644 --- a/apps/memos-local-plugin/core/retrieval/tier2-experience.ts +++ b/apps/memos-local-plugin/core/retrieval/tier2-experience.ts @@ -98,9 +98,9 @@ export async function runTier2Experience( for (const [id, state] of merged) { const row = repo.getById(id); if (!row) continue; - if ((row.sourceFeedbackIds?.length ?? 0) === 0) continue; const status = row.status ?? 
"candidate"; if (status === "archived") continue; + if (!hasExecutableExperienceShape(row)) continue; out.push({ tier: "tier2", refKind: "experience", @@ -131,11 +131,28 @@ export async function runTier2Experience( matchedChannels: state.channels.map((c) => c.channel), experienceType: row.experienceType ?? "success_pattern", evidencePolarity: row.evidencePolarity ?? "positive", + riskFlags: [ + ...(row.sourceFeedbackIds?.length ? [] : ["missing_source_feedback"]), + ...(state.channels.some((c) => c.channel === "vec") ? [] : ["keyword_only"]), + ], }, }); } - out.sort((a, b) => bestChannelScore(b) - bestChannelScore(a)); - const trimmed = out.slice(0, vecPoolSize); + const keywordSupplementSize = Math.min(keywordPoolSize, deps.config.tier2TopK); + const vectorHits = out + .filter((c) => c.channels?.some((ch) => ch.channel === "vec") || c.cosine > 0) + .sort((a, b) => bestChannelScore(b) - bestChannelScore(a)) + .slice(0, vecPoolSize); + const vectorIds = new Set(vectorHits.map((c) => c.refId)); + const keywordOnlyHits = out + .filter((c) => !vectorIds.has(c.refId)) + .filter((c) => !(c.channels?.some((ch) => ch.channel === "vec") || c.cosine > 0)) + .filter((c) => + c.channels?.some((ch) => ch.channel === "fts" || ch.channel === "pattern"), + ) + .sort((a, b) => bestChannelScore(b) - bestChannelScore(a)) + .slice(0, keywordSupplementSize); + const trimmed = [...vectorHits, ...keywordOnlyHits]; log.info("done", { candidates: merged.size, kept: trimmed.length, @@ -156,6 +173,13 @@ export async function runTier2Experience( } } +function hasExecutableExperienceShape(row: { + trigger?: string; + procedure?: string; +}): boolean { + return !!row.trigger?.trim() || !!row.procedure?.trim(); +} + interface CandidateState { cosine: number; channels: ChannelRank[]; diff --git a/apps/memos-local-plugin/core/retrieval/types.ts b/apps/memos-local-plugin/core/retrieval/types.ts index ae1b6f944..27d776604 100644 --- a/apps/memos-local-plugin/core/retrieval/types.ts +++ 
b/apps/memos-local-plugin/core/retrieval/types.ts @@ -299,11 +299,13 @@ export interface RetrievalConfig { * genuinely relevant ones before injection. * * When `llmFilterEnabled` is false or the LLM is unavailable, the - * ranked list is passed through unchanged. + * ranked list is mechanically capped by `llmFilterFallbackMaxKeep`. */ llmFilterEnabled: boolean; /** Keep at most N candidates after the LLM filter. */ llmFilterMaxKeep: number; + /** Keep at most N candidates when no valid LLM judgement is available. */ + llmFilterFallbackMaxKeep?: number; /** Skip the filter entirely when the ranked list has fewer than this many items. */ llmFilterMinCandidates: number; /** diff --git a/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts b/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts index 3d2fb0049..d58bd37b4 100644 --- a/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts +++ b/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts @@ -231,20 +231,18 @@ describe("retrieval/integration", () => { expect(skillIds).not.toContain("sk_weak"); }); - it("keeps abstract memories when long unique identifier queries require keywords", async () => { + it("does not hard-drop vector traces for coding identifiers", async () => { const res = await turnStartRetrieve(makeDeps(handle), { reason: "turn_start", agent: "openclaw", - sessionId: "s1" as SessionId, - userText: "zlxqyz_unique_marker_2026_test_no_such_content", + sessionId: "s_current" as SessionId, + userText: "debug memos-local-plugin create_episode_snapshot", ts: NOW as never, }); const refKinds = res.packet.snippets.map((s) => s.refKind); expect(refKinds).toContain("skill"); - expect(refKinds).toContain("world-model"); - expect(refKinds).not.toContain("trace"); - expect(refKinds).not.toContain("episode"); + expect(refKinds.some((kind) => kind === "trace" || kind === "episode")).toBe(true); }); it("recalls feedback experiences through keyword channels when embeddings 
degrade", async () => { diff --git a/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts b/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts index ddb58c604..bdbdd1439 100644 --- a/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts +++ b/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts @@ -3,6 +3,7 @@ import { describe, expect, it, vi } from "vitest"; import { llmFilterCandidates } from "../../../core/retrieval/llm-filter.js"; import type { RankedCandidate } from "../../../core/retrieval/ranker.js"; import type { + ExperienceCandidate, RetrievalConfig, TraceCandidate, } from "../../../core/retrieval/types.js"; @@ -11,11 +12,13 @@ const cfg: Pick< RetrievalConfig, | "llmFilterEnabled" | "llmFilterMaxKeep" + | "llmFilterFallbackMaxKeep" | "llmFilterMinCandidates" | "llmFilterCandidateBodyChars" > = { llmFilterEnabled: true, - llmFilterMaxKeep: 4, + llmFilterMaxKeep: 8, + llmFilterFallbackMaxKeep: 4, llmFilterMinCandidates: 1, llmFilterCandidateBodyChars: 500, }; @@ -57,14 +60,66 @@ function trace(id: string, score: number): RankedCandidate { }; } +function experience(id: string): RankedCandidate { + const cand: ExperienceCandidate = { + tier: "tier2", + refKind: "experience", + refId: id as never, + cosine: 0, + ts: 1_700_000_000_000 as never, + vec: null, + channels: [{ channel: "fts", rank: 0, score: 1 }], + title: "SEC 13F extraction correction", + trigger: `When the user asks about ${"SEC 13F issuer CUSIP ".repeat(20)}`, + procedure: "Use the holdings table issuer and CUSIP columns directly.", + verification: "Verify issuer and CUSIP match the same holdings row.", + boundary: "", + support: 1, + gain: 0.7, + status: "active", + experienceType: "failure_avoidance", + evidencePolarity: "negative", + salience: 0.9, + confidence: 0.8, + skillEligible: false, + sourceEpisodeIds: [], + sourceFeedbackIds: [], + sourceTraceIds: [], + decisionGuidance: { preference: [], antiPattern: [] }, + updatedAt: 1_700_000_000_000 
as never, + }; + return { + candidate: cand, + relevance: 1, + rrf: 0, + score: 1, + normSq: null, + }; +} + describe("retrieval/llm-filter", () => { - it("disabled → passthrough with null sufficient", async () => { + it("disabled → fallback capped with null sufficient", async () => { + const ranked = [ + trace("a", 0.9), + trace("b", 0.8), + trace("c", 0.7), + trace("d", 0.6), + ]; const result = await llmFilterCandidates( - { query: "anything", ranked: [trace("a", 0.9), trace("b", 0.5)] }, - { llm: null, log, config: { ...cfg, llmFilterEnabled: false } }, + { query: "anything", ranked }, + { + llm: null, + log, + config: { + ...cfg, + llmFilterEnabled: false, + llmFilterFallbackMaxKeep: 2, + }, + }, ); expect(result.outcome).toBe("disabled"); - expect(result.kept.length).toBe(2); + expect(result.kept.map((r) => String(r.candidate.refId))).toEqual(["a", "b"]); + expect(result.dropped.map((r) => String(r.candidate.refId))).toEqual(["c", "d"]); expect(result.sufficient).toBeNull(); }); @@ -199,7 +254,7 @@ describe("retrieval/llm-filter", () => { expect(result.kept.length).toBeGreaterThanOrEqual(1); }); - it("safe-cutoff respects llmFilterMaxKeep cap", async () => { + it("safe-cutoff respects llmFilterFallbackMaxKeep cap", async () => { const llm: any = { completeJson: vi.fn().mockRejectedValue(new Error("boom")), }; @@ -213,38 +268,73 @@ describe("retrieval/llm-filter", () => { ]; const result = await llmFilterCandidates( { query: "q", ranked }, - { llm, log, config: { ...cfg, llmFilterMaxKeep: 2 } }, + { + llm, + log, + config: { ...cfg, llmFilterMaxKeep: 8, llmFilterFallbackMaxKeep: 2 }, + }, ); expect(result.kept.length).toBeLessThanOrEqual(2); expect(result.outcome).toBe("llm_failed_safe_cutoff"); }); - it("safe-cutoff respects a zero llmFilterMaxKeep cap", async () => { + it("safe-cutoff respects a zero llmFilterFallbackMaxKeep cap", async () => { const llm: any = { completeJson: vi.fn().mockRejectedValue(new Error("boom")), }; const result = await 
llmFilterCandidates( { query: "q", ranked: [trace("a", 0.9), trace("b", 0.8)] }, - { llm, log, config: { ...cfg, llmFilterMaxKeep: 0 } }, + { llm, log, config: { ...cfg, llmFilterFallbackMaxKeep: 0 } }, ); expect(result.kept).toEqual([]); expect(result.dropped.length).toBe(2); expect(result.outcome).toBe("llm_failed_safe_cutoff"); }); - it("no LLM at all → passthrough (not safe-cutoff, since the call never happens)", async () => { + it("no LLM at all → fallback capped without full passthrough", async () => { const result = await llmFilterCandidates( { query: "q", - ranked: [trace("a", 0.9), trace("b", 0.8), trace("c", 0.7)], + ranked: [ + trace("a", 0.9), + trace("b", 0.8), + trace("c", 0.7), + trace("d", 0.6), + ], }, - { llm: null, log, config: cfg }, + { llm: null, log, config: { ...cfg, llmFilterFallbackMaxKeep: 2 } }, ); expect(result.outcome).toBe("no_llm"); - expect(result.kept.length).toBe(3); + expect(result.kept.map((r) => String(r.candidate.refId))).toEqual(["a", "b"]); + expect(result.dropped.map((r) => String(r.candidate.refId))).toEqual(["c", "d"]); expect(result.sufficient).toBeNull(); }); + it("malformed LLM output uses fallback cap instead of normal max keep", async () => { + const llm: any = { + completeJson: vi.fn().mockResolvedValue({ + value: { ranked: "not-an-array" }, + servedBy: "fake", + }), + }; + const ranked = [ + trace("a", 0.9), + trace("b", 0.89), + trace("c", 0.88), + trace("d", 0.87), + ]; + const result = await llmFilterCandidates( + { query: "q", ranked }, + { + llm, + log, + config: { ...cfg, llmFilterMaxKeep: 8, llmFilterFallbackMaxKeep: 2 }, + }, + ); + expect(result.outcome).toBe("llm_failed_safe_cutoff"); + expect(result.kept.length).toBeLessThanOrEqual(2); + }); + it("candidate description omits retrieval metadata and keeps semantic content", async () => { const seen: string[] = []; const llm: any = { @@ -265,6 +355,26 @@ describe("retrieval/llm-filter", () => { expect(seen[0]).not.toContain("score="); }); + it("experience 
descriptions preserve procedure and verification despite long triggers", async () => { + const seen: string[] = []; + const llm: any = { + completeJson: vi.fn().mockImplementation(async (messages: any[]) => { + seen.push(messages[1].content); + return { value: { selected: [1], sufficient: true }, servedBy: "fake" }; + }), + }; + await llmFilterCandidates( + { query: "q", ranked: [experience("po_sec13f")] }, + { llm, log, config: { ...cfg, llmFilterCandidateBodyChars: 500 } }, + ); + + expect(seen[0]).toContain("[EXPERIENCE] SEC 13F extraction correction"); + expect(seen[0]).toContain("Trigger:"); + expect(seen[0]).toContain("Do: Use the holdings table issuer and CUSIP columns directly."); + expect(seen[0]).toContain("Check: Verify issuer and CUSIP match the same holdings row."); + expect(seen[0]).not.toContain("sourceFeedbackIds"); + }); + it("LLM output budget scales for large ranked lists", async () => { const llm: any = { completeJson: vi.fn().mockResolvedValue({ diff --git a/apps/memos-local-plugin/tests/unit/retrieval/tier2-experience.test.ts b/apps/memos-local-plugin/tests/unit/retrieval/tier2-experience.test.ts new file mode 100644 index 000000000..cf0636195 --- /dev/null +++ b/apps/memos-local-plugin/tests/unit/retrieval/tier2-experience.test.ts @@ -0,0 +1,197 @@ +import { describe, expect, it } from "vitest"; + +import { runTier2Experience } from "../../../core/retrieval/tier2-experience.js"; +import type { RetrievalConfig, RetrievalRepos } from "../../../core/retrieval/types.js"; +import type { EmbeddingVector, PolicyId } from "../../../core/types.js"; + +const NOW = 1_700_000_000_000 as never; + +function vec(arr: number[]): EmbeddingVector { + return Float32Array.from(arr) as unknown as EmbeddingVector; +} + +const cfg: RetrievalConfig = { + tier1TopK: 3, + tier2TopK: 3, + tier3TopK: 2, + candidatePoolFactor: 2, + weightCosine: 0.6, + weightPriority: 0.4, + mmrLambda: 0.7, + includeLowValue: false, + rrfConstant: 60, + minSkillEta: 0.5, + minTraceSim: 
0.3, + tagFilter: "auto", + keywordTopK: 20, + decayHalfLifeDays: 30, + llmFilterEnabled: false, + llmFilterMaxKeep: 4, + llmFilterMinCandidates: 1, +}; + +type PolicyRow = NonNullable>; + +function makePolicy( + id: string, + partial: Partial = {}, +): PolicyRow { + return { + id, + title: partial.title ?? `policy ${id}`, + trigger: partial.trigger ?? "when parsing SEC 13F", + procedure: partial.procedure ?? "use holdings table fields", + verification: partial.verification ?? "issuer matches row", + boundary: partial.boundary ?? "", + support: partial.support ?? 1, + gain: partial.gain ?? 0.1, + status: partial.status ?? "active", + experienceType: partial.experienceType ?? "failure_avoidance", + evidencePolarity: partial.evidencePolarity ?? "negative", + salience: partial.salience ?? 0.5, + confidence: partial.confidence ?? 0.7, + skillEligible: partial.skillEligible ?? false, + sourceEpisodeIds: partial.sourceEpisodeIds ?? [], + sourceFeedbackIds: partial.sourceFeedbackIds ?? ["fb1" as never], + sourceTraceIds: partial.sourceTraceIds ?? [], + decisionGuidance: partial.decisionGuidance ?? { preference: [], antiPattern: [] }, + vec: partial.vec ?? vec([1, 0, 0]), + updatedAt: partial.updatedAt ?? NOW, + }; +} + +function makeRepo( + rows: PolicyRow[], + hits: { + vector?: Array<{ id: string; score: number }>; + text?: Array<{ id: string; score: number }>; + pattern?: Array<{ id: string; score: number }>; + }, +): RetrievalRepos["policies"] { + const byId = new Map(rows.map((row) => [row.id, row])); + return { + searchByVector(_query, k) { + return (hits.vector ?? []).slice(0, k).map((hit) => ({ + id: hit.id, + score: hit.score, + meta: { + title: byId.get(hit.id)?.title ?? hit.id, + status: byId.get(hit.id)?.status ?? "active", + support: byId.get(hit.id)?.support ?? 1, + gain: byId.get(hit.id)?.gain ?? 0, + }, + })); + }, + searchByText(_query, k) { + return (hits.text ?? 
[]).slice(0, k).map((hit) => ({ + id: hit.id, + score: hit.score, + })); + }, + searchByPattern(_terms, k) { + return (hits.pattern ?? []).slice(0, k).map((hit) => ({ + id: hit.id, + score: hit.score, + })); + }, + list() { + return rows; + }, + getById(id) { + return byId.get(id) ?? null; + }, + }; +} + +describe("retrieval/tier2-experience", () => { + it("recalls executable experiences without sourceFeedbackIds", async () => { + const row = makePolicy("po_no_feedback", { sourceFeedbackIds: [] }); + const out = await runTier2Experience( + { repos: { policies: makeRepo([row], { text: [{ id: row.id, score: 1 }] }) }, config: cfg }, + { queryVec: null, ftsMatch: "SEC 13F" }, + ); + + expect(out.map((c) => String(c.refId))).toEqual(["po_no_feedback"]); + expect(out[0]?.sourceFeedbackIds).toEqual([]); + }); + + it("drops title-only and verification-only experiences as non-executable", async () => { + const titleOnly = makePolicy("po_title_only", { + title: "looks relevant", + trigger: "", + procedure: "", + verification: "", + sourceFeedbackIds: [], + }); + const verificationOnly = makePolicy("po_check_only", { + title: "", + trigger: "", + procedure: "", + verification: "check something", + sourceFeedbackIds: [], + }); + const runnable = makePolicy("po_runnable", { + trigger: "when parsing SEC 13F", + procedure: "", + sourceFeedbackIds: [], + }); + const out = await runTier2Experience( + { + repos: { + policies: makeRepo([titleOnly, verificationOnly, runnable], { + text: [ + { id: titleOnly.id, score: 1 }, + { id: verificationOnly.id, score: 0.9 }, + { id: runnable.id, score: 0.8 }, + ], + }), + }, + config: cfg, + }, + { queryVec: null, ftsMatch: "SEC 13F" }, + ); + + expect(out.map((c) => String(c.refId))).toEqual(["po_runnable"]); + }); + + it("keeps a bounded keyword-only supplement when vector hits fill the pool", async () => { + const vectorRows = Array.from({ length: 6 }, (_, i) => + makePolicy(`po_vec_${i}`, { + trigger: `vector trigger ${i}`, + procedure: 
"vector procedure", + vec: vec([1, 0, 0]), + }), + ); + const keywordRows = Array.from({ length: 20 }, (_, i) => + makePolicy(`po_kw_${i}`, { + trigger: `keyword trigger ${i}`, + procedure: "keyword procedure", + sourceFeedbackIds: [], + vec: null, + }), + ); + const out = await runTier2Experience( + { + repos: { + policies: makeRepo([...vectorRows, ...keywordRows], { + vector: vectorRows.map((row, i) => ({ id: row.id, score: 0.9 - i * 0.01 })), + text: keywordRows.map((row, i) => ({ id: row.id, score: 1 - i * 0.01 })), + }), + }, + config: cfg, + }, + { queryVec: vec([1, 0, 0]), ftsMatch: "keyword" }, + ); + + const keywordOnly = out.filter((c) => + c.channels.some((ch) => ch.channel === "fts") && + !c.channels.some((ch) => ch.channel === "vec"), + ); + expect(out.length).toBeLessThanOrEqual(9); + expect(keywordOnly.map((c) => String(c.refId))).toEqual([ + "po_kw_0", + "po_kw_1", + "po_kw_2", + ]); + }); +});