diff --git a/.claude/skills/forge.md b/.claude/skills/forge.md index 16ed2e7..0eb9ef3 100644 --- a/.claude/skills/forge.md +++ b/.claude/skills/forge.md @@ -660,10 +660,22 @@ Forge-specific attributes use the `forge.*` namespace (`forge.task.id`, `forge.task.final_state`, `forge.tool.name`, `forge.workflow.id`, ...). -Phase 3 ships **metadata-only** spans. `capture_content` is plumbed -through the config schema but not yet honored by the instrumentation; -content capture is a follow-up that will reuse the FWS-8 audit -redactor. +**Default posture is metadata-only.** Prompts, completions, tool +args, and tool results are NOT stamped on spans unless +`observability.tracing.capture_content: true` is set (Phase 3.5 / +#130). When opted-in: `llm.completion` gains `gen_ai.input.messages` +(JSON array of role+content sent to the model) + +`gen_ai.output.messages` (JSON single-element array for the response, +current OTel GenAI semconv; supersedes the deprecated flat-string +`gen_ai.prompt` / `gen_ai.completion`); +`tool.<name>` gains `forge.tool.args` + `forge.tool.result`. +Captured values pass through a redactor (vendor secret-token shapes: +Anthropic / OpenAI / GitHub / AWS / Slack / private keys / Telegram) +when `redact: true` (default with capture). Each value is byte-capped +at 4 KiB with a `…[truncated:N]` marker byte-identical to the audit +payload-capture marker, so an operator grepping `[truncated:` across +spans and audit rows sees aligned output. `redact: false` is the +enterprise raw-capture path. 
**Read**: `docs/core-concepts/observability-tracing.md`, `docs/reference/forge-yaml-schema.md` § `observability.tracing`, @@ -790,8 +802,8 @@ observability: # OTel Tracing v1 (#108) — off by default service_name: "" # default: agent_id headers: { x-tenant: demo } resource_attrs: { deployment.environment: prod } - redact: true - capture_content: false # Phase 3 ships metadata-only + redact: true # scrub vendor secrets when capture is on + capture_content: false # off by default; opt in to span content skills: path: SKILL.md # main agent skill file diff --git a/docs/core-concepts/observability-tracing.md b/docs/core-concepts/observability-tracing.md index 7f3befb..b341ec6 100644 --- a/docs/core-concepts/observability-tracing.md +++ b/docs/core-concepts/observability-tracing.md @@ -45,8 +45,8 @@ observability: x-tenant: demo resource_attrs: # extra OTel resource attributes deployment.environment: prod - redact: true # default true — Phase 3 metadata-only ships now - capture_content: false # enterprise opt-in for prompt/completion content + redact: true # scrub vendor secret tokens when capture_content is on + capture_content: false # opt-in: stamp prompt/completion/tool I/O on spans ``` | Field | Type | Default | Notes | @@ -60,8 +60,8 @@ observability: | `service_name` | string | `agent_id` | `OTEL_SERVICE_NAME` env wins if set. | | `headers` | map | — | OTLP HTTP/gRPC headers. Env is the preferred path for secrets. | | `resource_attrs` | map | — | Merged with the auto-stamped `service.*` + `forge.runtime.version`. | -| `redact` | bool | `true` | PII redaction posture flag (consumed by Phase 3+ instrumentation). | -| `capture_content` | bool | `false` | Reserved — metadata-only spans ship now; content capture is a follow-up. | +| `redact` | bool | `true` | When `capture_content: true`, scrub vendor secret tokens (Anthropic / OpenAI / GitHub / AWS / Slack / private keys / Telegram) before stamping content attributes. See [Span content capture](#span-content-capture). 
| +| `capture_content` | bool | `false` | Stamp prompt / completion / tool I/O as span attributes. Off by default; metadata-only spans ship. See [Span content capture](#span-content-capture). | ## Config precedence @@ -152,9 +152,24 @@ Forge mixes OTel GenAI semconv with Forge-specific `forge.*` namespaced attribut Tool errors do **not** fail the outer `agent.execute` span — they surface to the LLM as text and the loop continues. The tool span carries the failure detail so operators can pivot from a trace to the specific failed invocation. -### Phase 3 is metadata-only +### Span content capture -Tool args / results, prompts, completions are **not** recorded as span attributes today. The `capture_content` + `redact` knobs are plumbed but not yet honored by the instrumentation — content capture is a follow-up that will reuse the FWS-8 audit redactor. +Prompts, completions, tool args, and tool results are **off by default** — Phase 3 spans ship metadata only (provider, model, usage, finish reasons, tool name). Operators who need content attributes for in-trace debugging or supervised-learning corpora opt in via `observability.tracing.capture_content: true` (Phase 3.5 / issue #130). 
+ +| `forge.yaml` knob | Span | Attribute keys | +|---|---|---| +| (always) | `llm.completion` | `gen_ai.system`, `gen_ai.request.model`, `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, `gen_ai.response.finish_reasons` | +| `capture_content: true` | `llm.completion` | `gen_ai.input.messages` (JSON array of role+content pairs sent to the model), `gen_ai.output.messages` (JSON single-element array of role+content for the model's response) — current OTel GenAI semconv, supersedes the deprecated flat-string `gen_ai.prompt` / `gen_ai.completion` | +| (always) | `tool.<name>` | `forge.tool.name`, `forge.tool.error` (on failure) | +| `capture_content: true` | `tool.<name>` | `forge.tool.args` (raw arguments JSON), `forge.tool.result` (raw output) | + +When `capture_content: true` and `redact: true` (the default when capture is on), attribute values pass through a redactor that scrubs the same vendor secret-token shapes the runtime guardrails default rules cover (Anthropic `sk-ant-…`, OpenAI `sk-…`, GitHub `ghp_/gho_/ghs_/github_pat_…`, AWS `AKIA…`, Slack `xoxb-/xoxp-…`, RSA/EC/OPENSSH/PRIVATE key blocks, Telegram bot tokens). Matched values become `[REDACTED]`. Setting `redact: false` is the enterprise raw-capture path — content is stamped verbatim with the byte cap still applied. + +Every captured value is byte-capped at **4 KiB** (below the 5 KiB attribute soft-cap most backends apply). When the input exceeds the cap, the value ends with a `…[truncated:N]` marker where `N` is the original byte length. The marker is **byte-identical** to what the audit payload-capture path emits for the same input, so an operator grepping `[truncated:` across span attributes and audit rows sees aligned output. + +**Default posture** (no opt-in): the `gen_ai.input.messages`, `gen_ai.output.messages`, `forge.tool.args`, `forge.tool.result` keys are **absent** from spans — not set to empty string. Backends that gate dashboards on "is this key present?" 
can distinguish "metadata-only by default" from "operator opted in but the field happened to be empty." + +**OTel semconv versioning note**: the GenAI semantic conventions moved from flat-string (`gen_ai.prompt`, `gen_ai.completion`) to structured (`gen_ai.input.messages`, `gen_ai.output.messages`) attributes. Forge emits only the **current** structured keys. Backends that only recognize the deprecated flat-string attributes will not show prompt / completion text on Forge spans — upgrade the backend's semconv mapping or use a span processor to translate. ## End-to-end propagation (Phase 5) diff --git a/docs/security/audit-logging.md b/docs/security/audit-logging.md index c8896d8..1a14847 100644 --- a/docs/security/audit-logging.md +++ b/docs/security/audit-logging.md @@ -330,6 +330,37 @@ shape — no `trace_id` / `span_id` keys appear. The `AuditSchemaVersion` is NOT bumped: adding optional fields is a schema-compatible change per the policy above. +### Content-capture parity + +When `observability.tracing.capture_content: true` is set, prompt / +completion / tool-args / tool-result content appears on **both** the +linked OTel span and the FWS-8 audit row for the same logical event. +The two pipelines run the captured content through the same redact- +then-truncate helper (`runtime.PrepareSpanContent` / +`runtime.TruncateForAudit`) so: + +- The redaction marker is identical (`[REDACTED]`) — operators + grepping either sink for vendor secret-token shapes see the same + match. +- The truncation marker is byte-identical (`…[truncated:N]` where + `N` is the original byte length of the input). Grepping + `[truncated:` across audit rows and span attributes returns + aligned, comparable results. +- The redact patterns mirror the runtime guardrails CustomRules + defaults (Anthropic / OpenAI / GitHub / AWS / Slack / private key + blocks / Telegram bot tokens). Adding a new vendor pattern to one + pipeline implies adding it to the other. 
+ +The audit pipeline's byte cap (16 KiB per field, see +`AuditPayloadCapture.Cap*Bytes`) is intentionally larger than the +span cap (4 KiB — below the soft attribute-length limit most +observability backends apply). The two caps are independent: a single +event may be truncated on the span side and survive intact on the +audit side. The trailing marker shape is the same either way. + +See [Observability — Span content capture](../core-concepts/observability-tracing.md#span-content-capture) for the +span-side attribute keys and opt-in switches. + ## Streams (FWS-9) `forge run` / `forge serve` use the OS streams as a stream-level diff --git a/forge-cli/runtime/runner.go b/forge-cli/runtime/runner.go index 6055c74..ff06489 100644 --- a/forge-cli/runtime/runner.go +++ b/forge-cli/runtime/runner.go @@ -834,6 +834,15 @@ func (r *Runner) Run(ctx context.Context) error { MaxIterations: 100, CharBudget: charBudget, FilesDir: filepath.Join(r.cfg.WorkDir, ".forge", "files"), + // Issue #130 — the same resolved TracingConfig + // already passed to NewTracerProvider drives Phase + // 3.5 span-content capture inside the executor + // loop. Disabled state (Enabled=false + + // CaptureContent=false) is the zero-value default, + // so missing this on an older config schema is + // equivalent to "metadata-only spans" — the + // posture this initiative preserves. + TracingConfig: tracingCfg, } if r.derivedCLIConfig != nil { execCfg.WorkflowPhases = r.derivedCLIConfig.WorkflowPhases diff --git a/forge-core/observability/attrs.go b/forge-core/observability/attrs.go index 1d122ee..0649466 100644 --- a/forge-core/observability/attrs.go +++ b/forge-core/observability/attrs.go @@ -88,9 +88,7 @@ const ( AttrForgeLoopIteration = "forge.loop.iteration" // AttrForgeToolName / AttrForgeToolError name the tool call - // instrumentation. Tool args / results are NOT recorded here — - // Phase 3 is metadata-only. 
A future "capture_content=true with - // PII redaction" phase will add args/result attribute keys. + // instrumentation. AttrForgeToolName = "forge.tool.name" AttrForgeToolError = "forge.tool.error" @@ -98,4 +96,34 @@ const ( // resolved to — "completed", "failed", "canceled". Set on the // agent.execute span just before End. AttrForgeTaskFinalState = "forge.task.final_state" + + // ─── Content-capture attributes (Phase 3.5 / issue #130) ───── + // + // These attributes are set only when TracingConfig.CaptureContent + // is true. The default posture remains metadata-only: an absent + // attribute is the signal that an operator did not opt in. Set + // values pass through PrepareSpanContent (redact-then-truncate) + // so the same scrub applies to both the OTel pipeline and (in the + // future) the audit payload-capture path. + + // AttrGenAIInputMessages is the structured inbound message array + // the agent sent to the LLM — a JSON array of role+content pairs. + // Per OTel GenAI semantic conventions (current). Supersedes the + // deprecated `gen_ai.prompt` flat-string attribute. + AttrGenAIInputMessages = "gen_ai.input.messages" + + // AttrGenAIOutputMessages is the structured response array from + // the model — a JSON array of role+content pairs (single element + // for a non-streaming, single-choice completion). Per OTel GenAI + // semantic conventions (current). Supersedes the deprecated + // `gen_ai.completion` flat-string attribute. + AttrGenAIOutputMessages = "gen_ai.output.messages" + + // AttrForgeToolArgs is the raw arguments JSON the agent passed to + // a tool. Set on tool.<name> spans. + AttrForgeToolArgs = "forge.tool.args" + + // AttrForgeToolResult is the raw output the tool returned. Set on + // tool.<name> spans. 
+ AttrForgeToolResult = "forge.tool.result" ) diff --git a/forge-core/runtime/content_redact.go b/forge-core/runtime/content_redact.go new file mode 100644 index 0000000..3fcb449 --- /dev/null +++ b/forge-core/runtime/content_redact.go @@ -0,0 +1,147 @@ +package runtime + +import ( + "encoding/json" + "regexp" + + "github.com/initializ/forge/forge-core/llm" +) + +// Span-attribute content capture (issue #130 / Phase 3.5). +// +// Phase 3 of the OTel Tracing v1 initiative (#108, PR #125) shipped +// span instrumentation across the executor loop and tool calls but +// kept it metadata-only — span attributes carried provider, model, +// usage tokens, finish reasons, but no prompt / completion / tool I/O +// text. Phase 2 (#103, PR #124) plumbed two operator-facing knobs +// (`capture_content`, `redact`) through the config schema but the +// runtime never read them. This file is the redact-and-cap pipeline +// that Phase 3 sites call into when `CaptureContent=true` so the same +// PII / secret scrub applies to both the OTel attribute path and (in the +// future) the audit payload-capture path. +// +// Pattern parity: RedactSecrets's regex list mirrors the runtime +// guardrails CustomRule defaults in forge-cli/runtime/guardrails_loader.go's +// DefaultStructuredGuardrails. The two should evolve together — when +// a new secret token shape is added to the guardrails list, add it +// here. The parity test in content_redact_parity_test.go inside +// forge-cli/runtime/ enforces this at CI time. +// +// Order matters: redact runs BEFORE truncate so the byte cap can +// never cut a secret in half and leave an unmatched fragment behind. +// +// The functions are designed to be called on hot paths +// (every LLM call, every tool call) so the regex set is pre-compiled +// at package init and the empty-input fast path skips the pattern +// loop entirely. + +// RedactionMarker is the placeholder substituted for any matched +// secret. 
Operators grepping audit logs and traces for "[REDACTED]" +// can correlate scrub events across both pipelines. +const RedactionMarker = "[REDACTED]" + +// DefaultSpanContentCapBytes is the per-attribute byte cap for span +// content. 4 KiB stays comfortably under common observability backend +// limits (Datadog caps attributes around 5 KiB; Tempo's default attr +// length limit is 4 KiB) so a long prompt doesn't get re-truncated by +// the backend with a different marker shape, breaking the +// correlate-by-marker grep flow. +const DefaultSpanContentCapBytes = 4 << 10 + +// redactPattern is a single regex applied to span / audit content +// before storage. Each entry's regex is pre-compiled at init. +type redactPattern struct { + name string + re *regexp.Regexp +} + +// redactPatterns covers token shapes operators have asked us to scrub +// from prompts / completions / tool I/O. The shapes are drawn from +// runtime-observed secrets in vendor SDKs — same list as the +// guardrails CustomRules defaults. See the package-doc note above on +// parity with forge-cli/runtime/guardrails_loader.go. +var redactPatterns = []redactPattern{ + {name: "anthropic_key", re: regexp.MustCompile(`sk-ant-[A-Za-z0-9\-]{20,}`)}, + {name: "openai_key", re: regexp.MustCompile(`sk-[A-Za-z0-9]{20,}`)}, + {name: "github_pat", re: regexp.MustCompile(`ghp_[A-Za-z0-9]{36}`)}, + {name: "github_oauth", re: regexp.MustCompile(`gho_[A-Za-z0-9]{36}`)}, + {name: "github_server", re: regexp.MustCompile(`ghs_[A-Za-z0-9]{36}`)}, + {name: "github_fine", re: regexp.MustCompile(`github_pat_[A-Za-z0-9_]{22,}`)}, + {name: "aws_access", re: regexp.MustCompile(`AKIA[0-9A-Z]{16}`)}, + {name: "slack_bot", re: regexp.MustCompile(`xoxb-[0-9]{10,}-[A-Za-z0-9-]+`)}, + {name: "slack_user", re: regexp.MustCompile(`xoxp-[0-9]{10,}-[A-Za-z0-9-]+`)}, + // Private-key block: anchored to both BEGIN and END markers so we + // scrub the entire payload at once. (?s) makes . match newlines. 
+ {name: "private_key", re: regexp.MustCompile(`(?s)-----BEGIN (RSA|EC|OPENSSH|PRIVATE) [^-]*KEY-----.*?-----END (RSA|EC|OPENSSH|PRIVATE) [^-]*KEY-----`)}, + {name: "telegram_bot", re: regexp.MustCompile(`[0-9]{8,10}:[A-Za-z0-9_-]{35,}`)}, +} + +// RedactSecrets returns s with every known secret token shape replaced +// by RedactionMarker. Empty input is returned unchanged (fast path). +// +// Applied in pattern-list order; overlap is fine because +// ReplaceAllString rewrites the string left-to-right and subsequent +// patterns operate on the post-replacement output. A run that matches +// multiple shapes (e.g. an `sk-` prefix that also starts a longer +// vendor key) is scrubbed once — RedactionMarker doesn't satisfy any +// other pattern, so re-applying patterns is idempotent. +func RedactSecrets(s string) string { + if s == "" { + return s + } + for _, p := range redactPatterns { + s = p.re.ReplaceAllString(s, RedactionMarker) + } + return s +} + +// serializeChatMessages JSON-encodes a chat message list for use +// as a gen_ai.input.messages / gen_ai.output.messages span +// attribute. Returns the empty string for nil / empty input or on +// marshal failure — an empty return signals the caller to skip +// stamping the attribute, preserving the "absent attribute = no +// opt-in" contract. +// +// Lives next to PrepareSpanContent because both are pure +// content-shaping helpers for the span-capture pipeline; the audit +// pipeline uses the same input but emits it as native event fields, +// not a JSON blob. +func serializeChatMessages(messages []llm.ChatMessage) string { + if len(messages) == 0 { + return "" + } + b, err := json.Marshal(messages) + if err != nil { + return "" + } + return string(b) +} + +// PrepareSpanContent runs the redact (when redact=true) and +// byte-cap-with-truncation-marker pipeline for content destined for +// an OTel span attribute. The pipeline is: +// +// 1. Apply RedactSecrets when redact=true. +// 2. 
TruncateForAudit (the same byte-cap helper the audit path uses) +// so a runaway prompt can't blow past the backend attribute limit +// and silently drop the marker. +// +// maxBytes <= 0 falls back to DefaultSpanContentCapBytes. The +// truncation marker is identical to what AuditPayloadCapture writes, +// so an operator who sees a `…[truncated:N]` suffix on an audit +// payload-captured field sees the same suffix on the linked span +// attribute for the same logical event. +// +// Returns the empty string when s is empty (skipping the pipeline). +func PrepareSpanContent(s string, redact bool, maxBytes int) string { + if s == "" { + return s + } + if redact { + s = RedactSecrets(s) + } + if maxBytes <= 0 { + maxBytes = DefaultSpanContentCapBytes + } + return TruncateForAudit(s, maxBytes) +} diff --git a/forge-core/runtime/content_redact_test.go b/forge-core/runtime/content_redact_test.go new file mode 100644 index 0000000..ffe1138 --- /dev/null +++ b/forge-core/runtime/content_redact_test.go @@ -0,0 +1,141 @@ +package runtime + +import ( + "strings" + "testing" +) + +func TestRedactSecrets_KnownPatterns(t *testing.T) { + cases := []struct { + name string + input string + wantNot string // a substring that MUST NOT appear in the output + }{ + {"anthropic_key", "key=sk-ant-12345abcdef67890abcdefXYZ end", "sk-ant-12345abcdef67890abcdefXYZ"}, + {"openai_key", "auth: sk-1234567890abcdefghijABCDEF tail", "sk-1234567890abcdefghijABCDEF"}, + {"github_pat", "token ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa val", "ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}, + {"github_oauth", "auth gho_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb x", "gho_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"}, + {"github_server", "header ghs_ssssssssssssssssssssssssssssssssssss y", "ghs_ssssssssssssssssssssssssssssssssssss"}, + {"github_fine", "pat github_pat_aaaaaaaaaaaaaaaaaaaaaa1234 z", "github_pat_aaaaaaaaaaaaaaaaaaaaaa1234"}, + {"aws_access", "AKIAIOSFODNN7EXAMPLE production-leak", "AKIAIOSFODNN7EXAMPLE"}, + 
{"slack_bot", "xoxb-1234567890-abcdef-bot-token-here ok", "xoxb-1234567890-abcdef-bot-token-here"}, + {"slack_user", "xoxp-9876543210-abcdef-user-token-here !", "xoxp-9876543210-abcdef-user-token-here"}, + {"telegram_bot", "tg=123456789:AAEhBP9-Klm-this-is-a-very-long-tg-bot-token-here", "123456789:AAEhBP9-Klm-this-is-a-very-long-tg-bot-token-here"}, + { + "private_key", + "-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEAvDdt2g\n-----END RSA PRIVATE KEY-----", + "MIIEowIBAAKCAQEAvDdt2g", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + out := RedactSecrets(tc.input) + if strings.Contains(out, tc.wantNot) { + t.Errorf("secret leaked into redacted output\n input: %q\n output: %q", tc.input, out) + } + if !strings.Contains(out, RedactionMarker) { + t.Errorf("expected redaction marker %q in output, got %q", RedactionMarker, out) + } + }) + } +} + +func TestRedactSecrets_PreservesSurroundingText(t *testing.T) { + in := "Please use the key sk-ant-abcdefghij1234567890XYZ for testing" + out := RedactSecrets(in) + + if !strings.HasPrefix(out, "Please use the key ") { + t.Errorf("prefix lost; got %q", out) + } + if !strings.HasSuffix(out, " for testing") { + t.Errorf("suffix lost; got %q", out) + } + if strings.Contains(out, "sk-ant-abcdefghij1234567890XYZ") { + t.Errorf("secret survived redaction; got %q", out) + } +} + +func TestRedactSecrets_EmptyInput(t *testing.T) { + if RedactSecrets("") != "" { + t.Error("empty input must round-trip") + } +} + +func TestRedactSecrets_NoSecrets_NoOp(t *testing.T) { + in := "What is the weather in Paris?" + out := RedactSecrets(in) + if out != in { + t.Errorf("non-secret content was modified: %q -> %q", in, out) + } +} + +// TestPrepareSpanContent_RedactThenTruncate pins the ordering invariant: +// redact runs first, then byte-cap. If truncate ran first, a secret +// that straddled the cap boundary could survive in the truncated tail +// after the marker. The chosen order makes that impossible. 
+func TestPrepareSpanContent_RedactThenTruncate(t *testing.T) { + // Build input that ends with a secret near the cap boundary. + prefix := strings.Repeat("x", DefaultSpanContentCapBytes-30) + secret := "AKIAIOSFODNN7EXAMPLE" + in := prefix + " " + secret + + out := PrepareSpanContent(in, true, DefaultSpanContentCapBytes) + if strings.Contains(out, secret) { + t.Errorf("secret survived the redact-then-truncate pipeline near the cap boundary: %q", out) + } +} + +// TestPrepareSpanContent_RedactFalse_KeepsRawContent confirms the +// enterprise raw-capture path leaves the content untouched up to the +// byte cap. The cap still fires. +func TestPrepareSpanContent_RedactFalse_KeepsRawContent(t *testing.T) { + in := "sk-ant-abcdefghij1234567890XYZ" + out := PrepareSpanContent(in, false, DefaultSpanContentCapBytes) + if out != in { + t.Errorf("redact=false must not scrub; got %q want %q", out, in) + } +} + +// TestPrepareSpanContent_EmptyContent_FastPath confirms empty input +// short-circuits to empty output, so callers using a non-empty return +// to gate attribute stamping see "no opt-in" semantics for empty +// content. +func TestPrepareSpanContent_EmptyContent_FastPath(t *testing.T) { + if got := PrepareSpanContent("", true, 100); got != "" { + t.Errorf("empty input must return empty; got %q", got) + } +} + +// TestPrepareSpanContent_MaxBytesZero_FallsBackToDefault checks the +// caller-friendly default fallback. Operators / tests passing 0 get +// the package default rather than "no cap" (which would defeat the +// backend-attr-limit motivation). +func TestPrepareSpanContent_MaxBytesZero_FallsBackToDefault(t *testing.T) { + // 5 KiB of content with 0 cap → truncated to the 4 KiB default. 
+ in := strings.Repeat("a", 5<<10) + out := PrepareSpanContent(in, false, 0) + if len(out) > DefaultSpanContentCapBytes+64 { + t.Errorf("maxBytes=0 must default to DefaultSpanContentCapBytes; got len=%d", len(out)) + } +} + +// TestPrepareSpanContent_TruncationMarkerMatchesAuditPipeline is the +// cross-pipeline parity check the issue called out by name. The +// marker shape on span content MUST be byte-identical to what the +// audit payload-capture path produces for the same input, so an +// operator grepping for "[truncated:" across both sinks sees aligned +// output. +func TestPrepareSpanContent_TruncationMarkerMatchesAuditPipeline(t *testing.T) { + in := strings.Repeat("z", DefaultSpanContentCapBytes*2) + + spanOut := PrepareSpanContent(in, false, DefaultSpanContentCapBytes) + auditOut := TruncateForAudit(in, DefaultSpanContentCapBytes) + + if spanOut != auditOut { + t.Errorf("span and audit truncation outputs diverged for the same input:\n span: %q\n audit: %q", + spanOut, auditOut) + } + if !strings.Contains(spanOut, "[truncated:") { + t.Errorf("expected truncation marker in span output; got %q", spanOut) + } +} diff --git a/forge-core/runtime/loop.go b/forge-core/runtime/loop.go index 66d3f18..aa75369 100644 --- a/forge-core/runtime/loop.go +++ b/forge-core/runtime/loop.go @@ -56,6 +56,12 @@ type LLMExecutor struct { filesDir string // directory for file_create output sessionMaxAge time.Duration // max age for session recovery (0 = no limit) workflowPhases []string // workflow phases from skills (edit, finalize, query) + // tracingCfg governs Phase 3.5 span-attribute content capture + // (issue #130). Only CaptureContent + Redact are consumed here; + // the rest of the struct is honored by the cli runner's tracer + // setup. Zero value (CaptureContent=false) means metadata-only + // spans — the default posture. + tracingCfg observability.TracingConfig } // LLMExecutorConfig configures the LLM executor. 
@@ -74,6 +80,12 @@ type LLMExecutorConfig struct { FilesDir string // directory for file_create output (default: $TMPDIR/forge-files) SessionMaxAge time.Duration // max idle time before session recovery is skipped (0 = 30m default) WorkflowPhases []string // workflow phases from skills (edit, finalize, query) + // TracingConfig is the same observability.TracingConfig the cli + // runner resolves and passes to NewTracerProvider. The executor + // reads CaptureContent + Redact to decide whether to stamp + // prompt / completion / tool I/O content on Phase 3 spans + // (issue #130). Zero value disables content capture. + TracingConfig observability.TracingConfig } // NewLLMExecutor creates a new LLMExecutor with the given configuration. @@ -131,6 +143,7 @@ func NewLLMExecutor(cfg LLMExecutorConfig) *LLMExecutor { filesDir: cfg.FilesDir, sessionMaxAge: sessionMaxAge, workflowPhases: cfg.WorkflowPhases, + tracingCfg: cfg.TracingConfig, } } @@ -355,6 +368,21 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess attribute.String(observability.AttrGenAISystem, e.provider), attribute.String(observability.AttrGenAIRequestModel, e.modelName), ) + // Phase 3.5 (#130) — stamp the structured input messages on the + // span when CaptureContent is enabled. Runs through the + // redact-then-truncate pipeline (PrepareSpanContent) so PII + // scrubbing is identical to what audit payload-capture will + // emit for the same event. The attribute key matches the + // current OTel GenAI semconv (gen_ai.input.messages) which + // supersedes the deprecated flat-string `gen_ai.prompt`. 
+ if e.tracingCfg.CaptureContent { + if prompt := serializeChatMessages(req.Messages); prompt != "" { + llmSpan.SetAttributes(attribute.String( + observability.AttrGenAIInputMessages, + PrepareSpanContent(prompt, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } + } resp, err := e.client.Chat(llmCtx, req) llmDuration := time.Since(llmStart) if err != nil { @@ -394,6 +422,19 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess if resp.FinishReason != "" { llmSpan.SetAttributes(attribute.StringSlice(observability.AttrGenAIResponseFinishReasons, []string{resp.FinishReason})) } + // Phase 3.5 (#130) — completion stamped after success as a + // structured single-element messages array, matching the + // current OTel GenAI semconv (gen_ai.output.messages). Empty + // content (e.g. tool-call-only assistant turns) yields no + // attribute so an absent key remains the "no opt-in" signal. + if e.tracingCfg.CaptureContent && resp.Message.Content != "" { + if out := serializeChatMessages([]llm.ChatMessage{resp.Message}); out != "" { + llmSpan.SetAttributes(attribute.String( + observability.AttrGenAIOutputMessages, + PrepareSpanContent(out, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } + } llmSpan.End() // Fire AfterLLMCall hook @@ -622,12 +663,16 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess toolStart := time.Now() // Phase 3 (#104) — child span around the tool call. Span // name is "tool." so a flame graph groups tools - // by kind without a query. Tool args / results are NOT - // recorded as attributes here — Phase 3 is metadata-only; - // content capture lands when the FWS-8 redactor can be - // reused for spans. + // by kind without a query. Phase 3.5 (#130) added optional + // args/result content capture under CaptureContent + Redact. 
toolCtx, toolSpan := Tracer().Start(ctx, "tool."+tc.Function.Name) toolSpan.SetAttributes(attribute.String(observability.AttrForgeToolName, tc.Function.Name)) + if e.tracingCfg.CaptureContent && tc.Function.Arguments != "" { + toolSpan.SetAttributes(attribute.String( + observability.AttrForgeToolArgs, + PrepareSpanContent(tc.Function.Arguments, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } result, execErr := e.tools.Execute(toolCtx, tc.Function.Name, json.RawMessage(tc.Function.Arguments)) toolDuration := time.Since(toolStart) if execErr != nil { @@ -636,6 +681,17 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess toolSpan.SetAttributes(attribute.String(observability.AttrForgeToolError, execErr.Error())) result = fmt.Sprintf("Error executing tool %s: %s", tc.Function.Name, execErr.Error()) } + // Phase 3.5 (#130) — tool result content capture. The + // error-path `result` (set above) is the synthetic + // "Error executing tool" string the loop returns to the + // LLM; capturing it gives backends the same view the LLM + // will see on the next iteration. 
+ if e.tracingCfg.CaptureContent && result != "" { + toolSpan.SetAttributes(attribute.String( + observability.AttrForgeToolResult, + PrepareSpanContent(result, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } toolSpan.End() iterResults = append(iterResults, toolIterResult{ Name: tc.Function.Name, diff --git a/forge-core/runtime/loop_spans_content_test.go b/forge-core/runtime/loop_spans_content_test.go new file mode 100644 index 0000000..24cd7ab --- /dev/null +++ b/forge-core/runtime/loop_spans_content_test.go @@ -0,0 +1,331 @@ +package runtime + +import ( + "context" + "encoding/json" + "strings" + "testing" + + "github.com/initializ/forge/forge-core/a2a" + "github.com/initializ/forge/forge-core/llm" + "github.com/initializ/forge/forge-core/observability" + sdktrace "go.opentelemetry.io/otel/sdk/trace" +) + +// awsKeyFixture is a deliberately obvious secret shape used across the +// capture-content tests so the scrub-vs-raw assertions stay readable. +const awsKeyFixture = "AKIAIOSFODNN7EXAMPLE" + +// findAttr returns the string value of a span attribute by key, with +// `ok=true` only when the key was set on the span. Distinguishes +// "attribute absent" (the metadata-only signal) from "attribute set +// to the empty string" (which the executor should never produce). +func findAttr(span sdktrace.ReadOnlySpan, key string) (string, bool) { + for _, kv := range span.Attributes() { + if string(kv.Key) == key { + return kv.Value.AsString(), true + } + } + return "", false +} + +// runOnePromptOneCompletion exercises the executor with a single-turn +// task (no tool calls) so the test focuses on the LLM-span content +// attributes. Returns the recorded llm.completion span. 
+func runOnePromptOneCompletion(t *testing.T, tracingCfg observability.TracingConfig, prompt, completion string) sdktrace.ReadOnlySpan {
+	t.Helper()
+
+	// Install a test tracer provider for this test only; the recorder
+	// is what we query for spans by name once the executor has run.
+	provider, recorder := observability.NewTestTracerProvider()
+	SetTracerProvider(provider)
+	t.Cleanup(func() {
+		ResetTracerProviderForTest()
+		// Shutdown error is deliberately discarded: teardown is best-effort.
+		_ = provider.Shutdown(context.Background())
+	})
+
+	// Stub chat function: always answers with `completion` as an
+	// assistant message, fixed token usage, and finish reason "stop".
+	reply := func(_ context.Context, _ *llm.ChatRequest) (*llm.ChatResponse, error) {
+		resp := &llm.ChatResponse{
+			Message:      llm.ChatMessage{Role: llm.RoleAssistant, Content: completion},
+			Usage:        llm.UsageInfo{InputTokens: 50, OutputTokens: 10},
+			FinishReason: "stop",
+		}
+		return resp, nil
+	}
+
+	executor := NewLLMExecutor(LLMExecutorConfig{
+		Client:        &mockLLMClient{chatFunc: reply},
+		Tools:         &mockToolExecutor{},
+		MaxIterations: 3,
+		ModelName:     "test-model",
+		Provider:      "anthropic",
+		TracingConfig: tracingCfg,
+	})
+
+	// Single user turn carrying the caller-supplied prompt text.
+	userMsg := &a2a.Message{
+		Role:  a2a.MessageRoleUser,
+		Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: prompt}},
+	}
+	_, err := executor.Execute(context.Background(), &a2a.Task{ID: "task-content"}, userMsg)
+	if err != nil {
+		t.Fatalf("Execute: %v", err)
+	}
+
+	llmSpan, found := recorder.FindSpan("llm.completion")
+	if !found {
+		t.Fatal("missing llm.completion span")
+	}
+	return llmSpan
+}
+
+// TestExecute_CaptureContentTrue_StampsRedactedPromptOnLLMSpan —
+// issue #130 acceptance case. Operator opts in (CaptureContent=true,
+// Redact=true), sends a prompt containing an AWS access key shape.
+// The gen_ai.input.messages attribute must be present and the raw key must
+// NOT appear in its value.
+func TestExecute_CaptureContentTrue_StampsRedactedPromptOnLLMSpan(t *testing.T) {
+	// Capture on, redaction on.
+	tracing := observability.TracingConfig{CaptureContent: true, Redact: true}
+	prompt := "deploy with key " + awsKeyFixture + " and reboot"
+	llmSpan := runOnePromptOneCompletion(t, tracing, prompt, "ok")
+
+	captured, present := findAttr(llmSpan, observability.AttrGenAIInputMessages)
+	if !present {
+		t.Fatal("gen_ai.input.messages attribute missing — CaptureContent=true did not stamp")
+	}
+
+	// The fixture key must be gone and the redaction marker must be
+	// there in its place.
+	leaked := strings.Contains(captured, awsKeyFixture)
+	marked := strings.Contains(captured, RedactionMarker)
+	if leaked {
+		t.Errorf("raw AWS key survived redaction on gen_ai.input.messages:\n%s", captured)
+	}
+	if !marked {
+		t.Errorf("expected redaction marker %q in gen_ai.input.messages; got %q", RedactionMarker, captured)
+	}
+}
+
+// TestExecute_CaptureContentTrue_RedactFalse_StampsRawPromptOnLLMSpan
+// covers the raw-capture configuration (CaptureContent=true,
+// Redact=false): the prompt attribute must be stamped, must still
+// contain the fixture key verbatim, and must not contain the
+// redaction marker.
+func TestExecute_CaptureContentTrue_RedactFalse_StampsRawPromptOnLLMSpan(t *testing.T) {
+	tracing := observability.TracingConfig{CaptureContent: true, Redact: false}
+	prompt := "key=" + awsKeyFixture
+	llmSpan := runOnePromptOneCompletion(t, tracing, prompt, "done")
+
+	captured, present := findAttr(llmSpan, observability.AttrGenAIInputMessages)
+	if !present {
+		t.Fatal("gen_ai.input.messages missing")
+	}
+
+	keptRaw := strings.Contains(captured, awsKeyFixture)
+	marked := strings.Contains(captured, RedactionMarker)
+	if !keptRaw {
+		t.Errorf("Redact=false must preserve raw content; expected raw key in span attribute, got %q", captured)
+	}
+	if marked {
+		t.Errorf("Redact=false must not insert redaction marker; got %q", captured)
+	}
+}
+
+// TestExecute_CaptureContentFalse_NoContentAttribute pins the default
+// posture. With no opt-in, the gen_ai.input.messages / gen_ai.output.messages
+// attributes are absent — not set to empty string. Backends that
+// look for "is the key present?" must see "no" so the
+// metadata-only contract holds.
+func TestExecute_CaptureContentFalse_NoContentAttribute(t *testing.T) {
+	cfg := observability.TracingConfig{CaptureContent: false}
+	span := runOnePromptOneCompletion(t, cfg,
+		"any prompt with "+awsKeyFixture,
+		"any completion",
+	)
+
+	// Presence, not value, is what is being checked: findAttr reports
+	// ok=true only when the key exists on the span.
+	if _, ok := findAttr(span, observability.AttrGenAIInputMessages); ok {
+		t.Errorf("CaptureContent=false must not set gen_ai.input.messages")
+	}
+	if _, ok := findAttr(span, observability.AttrGenAIOutputMessages); ok {
+		t.Errorf("CaptureContent=false must not set gen_ai.output.messages")
+	}
+}
+
+// TestExecute_LargePrompt_TruncatesWithSameMarkerAsAudit verifies the
+// cross-pipeline parity invariant — span content trimmed by the cap
+// produces a marker byte-identical to what the audit
+// payload-capture path produces for the same input.
+func TestExecute_LargePrompt_TruncatesWithSameMarkerAsAudit(t *testing.T) {
+	// 8 KiB of recognizable filler — twice the default span cap.
+	bigPrompt := strings.Repeat("payload-", 1024)
+
+	cfg := observability.TracingConfig{CaptureContent: true, Redact: false}
+	span := runOnePromptOneCompletion(t, cfg, bigPrompt, "ok")
+
+	got, ok := findAttr(span, observability.AttrGenAIInputMessages)
+	if !ok {
+		t.Fatal("gen_ai.input.messages missing")
+	}
+
+	// The executor serializes the messages into a JSON array, so the
+	// audited input we compare against is the same JSON form. We use
+	// the same helper the executor uses to produce that string, then
+	// apply the same cap, and assert byte equality with the span
+	// attribute's value.
+	serialized := serializeChatMessages([]llm.ChatMessage{{Role: llm.RoleUser, Content: bigPrompt}})
+	wantTruncated := TruncateForAudit(serialized, DefaultSpanContentCapBytes)
+
+	if got != wantTruncated {
+		t.Errorf("span content truncation diverged from audit truncation for the same input\n span: %q\n audit: %q",
+			got, wantTruncated)
+	}
+	if !strings.Contains(got, "[truncated:") {
+		// The check above is non-fatal, so `got` can be arbitrarily
+		// short here. Clamp the preview length so a regression is
+		// reported as a test failure instead of a slice-bounds panic.
+		prefix := got
+		if len(prefix) > 64 {
+			prefix = prefix[:64]
+		}
+		t.Errorf("expected truncation marker in gen_ai.input.messages; got prefix %q…", prefix)
+	}
+}
+
+// TestExecute_CaptureContentTrue_StampsCompletionOnLLMSpan covers the
+// happy completion path — when CaptureContent=true, the model's
+// response text appears on the llm.completion span (post-success,
+// before End) as a structured single-element messages array
+// matching the current OTel GenAI semconv.
+func TestExecute_CaptureContentTrue_StampsCompletionOnLLMSpan(t *testing.T) {
+	cfg := observability.TracingConfig{CaptureContent: true, Redact: false}
+	span := runOnePromptOneCompletion(t, cfg, "question", "the answer is 42")
+
+	got, ok := findAttr(span, observability.AttrGenAIOutputMessages)
+	if !ok {
+		t.Fatal("gen_ai.output.messages missing")
+	}
+
+	// Must be JSON: a single-element [{role,content}] array.
+	var msgs []llm.ChatMessage
+	if err := json.Unmarshal([]byte(got), &msgs); err != nil {
+		t.Fatalf("gen_ai.output.messages is not a JSON message array: %v\nvalue: %s", err, got)
+	}
+	// Fatal on a wrong length: the index expressions below need msgs[0].
+	if len(msgs) != 1 {
+		t.Fatalf("expected exactly 1 output message, got %d: %v", len(msgs), msgs)
+	}
+	if msgs[0].Role != llm.RoleAssistant {
+		t.Errorf("output message role = %q; want %q", msgs[0].Role, llm.RoleAssistant)
+	}
+	if msgs[0].Content != "the answer is 42" {
+		t.Errorf("output message content = %q; want %q", msgs[0].Content, "the answer is 42")
+	}
+}
+
+// TestExecute_CaptureContentTrue_EmptyCompletion_SkipsAttribute
+// matches the empty-content fast path in PrepareSpanContent and the
+// `resp.Message.Content != ""` guard in loop.go — an assistant turn
+// that returns no text (e.g. a tool-call-only response) should NOT
+// stamp an empty completion attribute. Empty completion = no
+// completion = no attribute.
+func TestExecute_CaptureContentTrue_EmptyCompletion_SkipsAttribute(t *testing.T) {
+	cfg := observability.TracingConfig{CaptureContent: true, Redact: false}
+	span := runOnePromptOneCompletion(t, cfg, "question", "")
+
+	if _, ok := findAttr(span, observability.AttrGenAIOutputMessages); ok {
+		t.Errorf("empty completion must not stamp gen_ai.output.messages attribute")
+	}
+}
+
+// runOneToolCall exercises the executor with one tool-call iteration
+// so the test focuses on the tool.<name> span attributes. Returns the
+// recorded tool span.
+func runOneToolCall(t *testing.T, tracingCfg observability.TracingConfig, toolArgs, toolResult string) sdktrace.ReadOnlySpan {
+	t.Helper()
+
+	// Install a test tracer provider for this test only; the recorder
+	// is queried for the tool span after the run.
+	provider, recorder := observability.NewTestTracerProvider()
+	SetTracerProvider(provider)
+	t.Cleanup(func() {
+		ResetTracerProviderForTest()
+		// Shutdown error is deliberately discarded: teardown is best-effort.
+		_ = provider.Shutdown(context.Background())
+	})
+
+	// Scripted chat function: the first reply asks for one "echo" tool
+	// call carrying toolArgs; every later reply is a plain "done"
+	// assistant message with finish reason "stop".
+	toolRequested := false
+	scripted := func(_ context.Context, _ *llm.ChatRequest) (*llm.ChatResponse, error) {
+		if toolRequested {
+			return &llm.ChatResponse{
+				Message:      llm.ChatMessage{Role: llm.RoleAssistant, Content: "done"},
+				FinishReason: "stop",
+			}, nil
+		}
+		toolRequested = true
+		call := llm.ToolCall{
+			ID:       "tc-1",
+			Type:     "function",
+			Function: llm.FunctionCall{Name: "echo", Arguments: toolArgs},
+		}
+		return &llm.ChatResponse{
+			Message: llm.ChatMessage{
+				Role:      llm.RoleAssistant,
+				ToolCalls: []llm.ToolCall{call},
+			},
+			FinishReason: "tool_calls",
+		}, nil
+	}
+
+	// Tool stub: returns toolResult whatever name and arguments it is given.
+	echo := func(_ context.Context, _ string, _ json.RawMessage) (string, error) {
+		return toolResult, nil
+	}
+
+	executor := NewLLMExecutor(LLMExecutorConfig{
+		Client:        &mockLLMClient{chatFunc: scripted},
+		Tools:         &mockToolExecutor{executeFunc: echo},
+		MaxIterations: 3,
+		ModelName:     "test-model",
+		Provider:      "anthropic",
+		TracingConfig: tracingCfg,
+	})
+
+	task := &a2a.Task{ID: "task-tool-content"}
+	userMsg := &a2a.Message{
+		Role:  a2a.MessageRoleUser,
+		Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: "go"}},
+	}
+	_, err := executor.Execute(context.Background(), task, userMsg)
+	if err != nil {
+		t.Fatalf("Execute: %v", err)
+	}
+
+	toolSpan, found := recorder.FindSpan("tool.echo")
+	if !found {
+		t.Fatal("missing tool.echo span")
+	}
+	return toolSpan
+}
+
+// TestExecute_CaptureContentTrue_StampsToolArgsAndResult exercises the
+// tool-side mirror of the LLM-span content tests. Args from the
+// LLM-emitted tool call land at forge.tool.args; tool stdout/return
+// lands at forge.tool.result; both go through the same redact +
+// truncate pipeline.
+func TestExecute_CaptureContentTrue_StampsToolArgsAndResult(t *testing.T) {
+	// Capture on, redaction on; the fixture key is planted in both the
+	// tool arguments and the tool result.
+	tracing := observability.TracingConfig{CaptureContent: true, Redact: true}
+	argsJSON := `{"target":"` + awsKeyFixture + `"}`
+	toolOutput := "deleted " + awsKeyFixture
+	toolSpan := runOneToolCall(t, tracing, argsJSON, toolOutput)
+
+	// A missing attribute is reported with Error rather than Fatal so
+	// the other attribute is still checked in the same run.
+	gotArgs, haveArgs := findAttr(toolSpan, observability.AttrForgeToolArgs)
+	switch {
+	case !haveArgs:
+		t.Error("forge.tool.args missing when CaptureContent=true")
+	default:
+		if strings.Contains(gotArgs, awsKeyFixture) {
+			t.Errorf("raw AWS key survived redaction on forge.tool.args: %q", gotArgs)
+		}
+		if !strings.Contains(gotArgs, RedactionMarker) {
+			t.Errorf("expected redaction marker in forge.tool.args; got %q", gotArgs)
+		}
+	}
+
+	gotResult, haveResult := findAttr(toolSpan, observability.AttrForgeToolResult)
+	switch {
+	case !haveResult:
+		t.Error("forge.tool.result missing when CaptureContent=true")
+	default:
+		if strings.Contains(gotResult, awsKeyFixture) {
+			t.Errorf("raw AWS key survived redaction on forge.tool.result: %q", gotResult)
+		}
+		if !strings.Contains(gotResult, RedactionMarker) {
+			t.Errorf("expected redaction marker in forge.tool.result; got %q", gotResult)
+		}
+	}
+}
+
+// TestExecute_CaptureContentFalse_NoToolContentAttributes is the
+// tool-span counterpart of the default-posture check: with
+// CaptureContent=false neither forge.tool.args nor forge.tool.result
+// may be present on the tool span.
+func TestExecute_CaptureContentFalse_NoToolContentAttributes(t *testing.T) {
+	tracing := observability.TracingConfig{CaptureContent: false}
+	toolSpan := runOneToolCall(t, tracing, `{"target":"foo"}`, "ok")
+
+	_, haveArgs := findAttr(toolSpan, observability.AttrForgeToolArgs)
+	if haveArgs {
+		t.Errorf("CaptureContent=false must not set forge.tool.args")
+	}
+	_, haveResult := findAttr(toolSpan, observability.AttrForgeToolResult)
+	if haveResult {
+		t.Errorf("CaptureContent=false must not set forge.tool.result")
+	}
+}