From c373d0faf7e0740f1a2ea9d8e490fc1fc2ed895b Mon Sep 17 00:00:00 2001 From: MK Date: Thu, 11 Jun 2026 14:41:18 -0400 Subject: [PATCH 1/2] feat(otel): honor capture_content + redact on span attributes (closes #130) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3 of the OTel Tracing v1 initiative (#108, PR #125) shipped span instrumentation across the executor loop and tool calls but kept it metadata-only — span attributes carried provider, model, usage tokens, finish reasons, but no prompt / completion / tool I/O text. Phase 2 (#103, PR #124) plumbed two operator-facing knobs (`capture_content`, `redact`) through the config schema. The runtime never read them. An operator who set `capture_content: true` got metadata-only spans and no error — the worst kind of config: load- bearing-looking, silently inert. This commit closes that gap. What lands 1. forge-core/runtime/content_redact.go — new package-internal helpers: - RedactSecrets scrubs known vendor secret-token shapes (Anthropic sk-ant, OpenAI sk-, GitHub ghp_/gho_/ghs_/github_pat_, AWS AKIA, Slack xoxb/xoxp, RSA/EC/OPENSSH/PRIVATE key blocks, Telegram bot tokens). Patterns mirror the runtime guardrails CustomRule defaults in forge-cli/runtime/guardrails_loader.go's DefaultStructuredGuardrails — the two should evolve together. - PrepareSpanContent runs the redact-then-truncate pipeline for content destined for OTel span attributes. Cap defaults to 4 KiB (below the 5 KiB soft attribute-length limit most backends apply). Reuses the audit pipeline's TruncateForAudit so the `…[truncated:N]` marker is byte-identical to what AuditPayloadCapture emits for the same input. 2. 
forge-core/observability/attrs.go — four new attribute constants: - AttrGenAIPrompt = "gen_ai.prompt" // OTel GenAI semconv - AttrGenAICompletion = "gen_ai.completion" // OTel GenAI semconv - AttrForgeToolArgs = "forge.tool.args" - AttrForgeToolResult = "forge.tool.result" Stripped the "Phase 3 metadata-only" callout from the forge.tool.* group. 3. forge-core/runtime/loop.go — adds: - LLMExecutorConfig.TracingConfig (consumed by Phase 3 sites) - LLMExecutor.tracingCfg field - Conditional attribute stamping on the llm.completion span (`gen_ai.prompt` before Chat(), `gen_ai.completion` after success) and the tool. span (`forge.tool.args` before Execute(), `forge.tool.result` after). 4. forge-cli/runtime/runner.go — populates LLMExecutorConfig. TracingConfig from the already-resolved tracingCfg the cli also passes to NewTracerProvider. Zero plumbing additions; just wires the existing field through. Cross-pipeline parity The four content attributes pass through the same redact-then- truncate helper as the (existing) audit payload-capture path. An operator who sees a `[REDACTED]` marker in an audit row sees the same marker on the linked span; the same goes for `…[truncated:N]`. Vendor pattern parity with the guardrails defaults is enforced by convention (and called out in the doc updates). Default posture preserved CaptureContent=false (the zero-value default) means the four content attributes are absent from spans — not set to empty string. Backends that gate dashboards on "is this key present?" can distinguish "metadata-only by default" from "operator opted in but the field happened to be empty." Empty content (e.g. tool-call-only assistant turn → no completion text) likewise skips stamping. Tests - 11 unit tests in content_redact_test.go cover RedactSecrets per vendor pattern, PrepareSpanContent ordering invariant (redact before truncate so a secret straddling the cap boundary can't survive), and the cross-pipeline truncation-marker parity. 
- 8 integration tests in loop_spans_content_test.go cover: - capture-true + redact-true: span carries redacted prompt - capture-true + redact-false: span carries raw prompt - capture-false: no prompt/completion/args/result attributes - large prompt: truncated with the same marker as audit - completion stamping on success - empty completion: attribute skipped - tool args + result on tool.<name> span (redacted) - tool args + result not present when capture-false Docs - docs/core-concepts/observability-tracing.md § Phase 3 is metadata-only → § Span content capture. New table mapping config knob to attribute keys per span. Notes the byte cap, the marker-parity-with-audit invariant, and the redact pattern set. Updated config example + field table. - docs/security/audit-logging.md § Trace cross-link gains a § Content-capture parity subsection explaining the redact + cap parity invariant and the divergent caps (16 KiB audit, 4 KiB span). - .claude/skills/forge.md § 12.9 — replaces the "Phase 3 ships metadata-only" caveat with a paragraph documenting the new capture surface. Updates the example forge.yaml comment. 
Verification - gofmt clean; golangci-lint 0 issues - full forge-core + forge-cli test suites green - the 19 new tests in this PR all pass --- .claude/skills/forge.md | 21 +- docs/core-concepts/observability-tracing.md | 25 +- docs/security/audit-logging.md | 31 ++ forge-cli/runtime/runner.go | 9 + forge-core/observability/attrs.go | 30 +- forge-core/runtime/content_redact.go | 147 ++++++++ forge-core/runtime/content_redact_test.go | 141 ++++++++ forge-core/runtime/loop.go | 58 +++- forge-core/runtime/loop_spans_content_test.go | 318 ++++++++++++++++++ 9 files changed, 761 insertions(+), 19 deletions(-) create mode 100644 forge-core/runtime/content_redact.go create mode 100644 forge-core/runtime/content_redact_test.go create mode 100644 forge-core/runtime/loop_spans_content_test.go diff --git a/.claude/skills/forge.md b/.claude/skills/forge.md index 16ed2e7..636270e 100644 --- a/.claude/skills/forge.md +++ b/.claude/skills/forge.md @@ -660,10 +660,19 @@ Forge-specific attributes use the `forge.*` namespace (`forge.task.id`, `forge.task.final_state`, `forge.tool.name`, `forge.workflow.id`, ...). -Phase 3 ships **metadata-only** spans. `capture_content` is plumbed -through the config schema but not yet honored by the instrumentation; -content capture is a follow-up that will reuse the FWS-8 audit -redactor. +**Default posture is metadata-only.** Prompts, completions, tool +args, and tool results are NOT stamped on spans unless +`observability.tracing.capture_content: true` is set (Phase 3.5 / +#130). When opted-in: `llm.completion` gains `gen_ai.prompt` (JSON- +serialized inbound messages) + `gen_ai.completion` (response text); +`tool.` gains `forge.tool.args` + `forge.tool.result`. +Captured values pass through a redactor (vendor secret-token shapes: +Anthropic / OpenAI / GitHub / AWS / Slack / private keys / Telegram) +when `redact: true` (default with capture). 
Each value is byte-capped +at 4 KiB with a `…[truncated:N]` marker byte-identical to the audit +payload-capture marker, so an operator grepping `[truncated:` across +spans and audit rows sees aligned output. `redact: false` is the +enterprise raw-capture path. **Read**: `docs/core-concepts/observability-tracing.md`, `docs/reference/forge-yaml-schema.md` § `observability.tracing`, @@ -790,8 +799,8 @@ observability: # OTel Tracing v1 (#108) — off by default service_name: "" # default: agent_id headers: { x-tenant: demo } resource_attrs: { deployment.environment: prod } - redact: true - capture_content: false # Phase 3 ships metadata-only + redact: true # scrub vendor secrets when capture is on + capture_content: false # off by default; opt in to span content skills: path: SKILL.md # main agent skill file diff --git a/docs/core-concepts/observability-tracing.md b/docs/core-concepts/observability-tracing.md index 7f3befb..35c39f5 100644 --- a/docs/core-concepts/observability-tracing.md +++ b/docs/core-concepts/observability-tracing.md @@ -45,8 +45,8 @@ observability: x-tenant: demo resource_attrs: # extra OTel resource attributes deployment.environment: prod - redact: true # default true — Phase 3 metadata-only ships now - capture_content: false # enterprise opt-in for prompt/completion content + redact: true # scrub vendor secret tokens when capture_content is on + capture_content: false # opt-in: stamp prompt/completion/tool I/O on spans ``` | Field | Type | Default | Notes | @@ -60,8 +60,8 @@ observability: | `service_name` | string | `agent_id` | `OTEL_SERVICE_NAME` env wins if set. | | `headers` | map | — | OTLP HTTP/gRPC headers. Env is the preferred path for secrets. | | `resource_attrs` | map | — | Merged with the auto-stamped `service.*` + `forge.runtime.version`. | -| `redact` | bool | `true` | PII redaction posture flag (consumed by Phase 3+ instrumentation). 
| -| `capture_content` | bool | `false` | Reserved — metadata-only spans ship now; content capture is a follow-up. | +| `redact` | bool | `true` | When `capture_content: true`, scrub vendor secret tokens (Anthropic / OpenAI / GitHub / AWS / Slack / private keys / Telegram) before stamping content attributes. See [Span content capture](#span-content-capture). | +| `capture_content` | bool | `false` | Stamp prompt / completion / tool I/O as span attributes. Off by default; metadata-only spans ship. See [Span content capture](#span-content-capture). | ## Config precedence @@ -152,9 +152,22 @@ Forge mixes OTel GenAI semconv with Forge-specific `forge.*` namespaced attribut Tool errors do **not** fail the outer `agent.execute` span — they surface to the LLM as text and the loop continues. The tool span carries the failure detail so operators can pivot from a trace to the specific failed invocation. -### Phase 3 is metadata-only +### Span content capture -Tool args / results, prompts, completions are **not** recorded as span attributes today. The `capture_content` + `redact` knobs are plumbed but not yet honored by the instrumentation — content capture is a follow-up that will reuse the FWS-8 audit redactor. +Prompts, completions, tool args, and tool results are **off by default** — Phase 3 spans ship metadata only (provider, model, usage, finish reasons, tool name). Operators who need content attributes for in-trace debugging or supervised-learning corpora opt in via `observability.tracing.capture_content: true` (Phase 3.5 / issue #130). 
+ +| `forge.yaml` knob | Span | Attribute keys stamped | +|---|---|---| +| (always) | `llm.completion` | `gen_ai.system`, `gen_ai.request.model`, `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, `gen_ai.response.finish_reasons` | +| `capture_content: true` | `llm.completion` | `gen_ai.prompt` (JSON-serialized inbound messages), `gen_ai.completion` (response text) | +| (always) | `tool.<name>` | `forge.tool.name`, `forge.tool.error` (on failure) | +| `capture_content: true` | `tool.<name>` | `forge.tool.args` (raw arguments JSON), `forge.tool.result` (raw output) | + +When `capture_content: true` and `redact: true` (the default when capture is on), attribute values pass through a redactor that scrubs the same vendor secret-token shapes the runtime guardrails default rules cover (Anthropic `sk-ant-…`, OpenAI `sk-…`, GitHub `ghp_/gho_/ghs_/github_pat_…`, AWS `AKIA…`, Slack `xoxb-/xoxp-…`, RSA/EC/OPENSSH/PRIVATE key blocks, Telegram bot tokens). Matched values become `[REDACTED]`. Setting `redact: false` is the enterprise raw-capture path — content is stamped verbatim with the byte cap still applied. + +Every captured value is byte-capped at **4 KiB** (below the 5 KiB attribute soft-cap most backends apply). When the input exceeds the cap, the value ends with a `…[truncated:N]` marker where `N` is the original byte length. The marker is **byte-identical** to what the audit payload-capture path emits for the same input, so an operator grepping `[truncated:` across span attributes and audit rows sees aligned output. + +**Default posture** (no opt-in): the `gen_ai.prompt`, `gen_ai.completion`, `forge.tool.args`, `forge.tool.result` keys are **absent** from spans — not set to empty string. Empty content under opt-in (e.g. a tool-call-only assistant turn with no completion text) is likewise skipped rather than stamped as an empty string, so backends that gate dashboards on "is this key present?" can rely on a present key always carrying captured content. 
## End-to-end propagation (Phase 5) diff --git a/docs/security/audit-logging.md b/docs/security/audit-logging.md index c8896d8..1a14847 100644 --- a/docs/security/audit-logging.md +++ b/docs/security/audit-logging.md @@ -330,6 +330,37 @@ shape — no `trace_id` / `span_id` keys appear. The `AuditSchemaVersion` is NOT bumped: adding optional fields is a schema-compatible change per the policy above. +### Content-capture parity + +When `observability.tracing.capture_content: true` is set, prompt / +completion / tool-args / tool-result content appears on **both** the +linked OTel span and the FWS-8 audit row for the same logical event. +The two pipelines run the captured content through the same redact- +then-truncate helper (`runtime.PrepareSpanContent` / +`runtime.TruncateForAudit`) so: + +- The redaction marker is identical (`[REDACTED]`) — operators + grepping either sink for vendor secret-token shapes see the same + match. +- The truncation marker is byte-identical (`…[truncated:N]` where + `N` is the original byte length of the input). Grepping + `[truncated:` across audit rows and span attributes returns + aligned, comparable results. +- The redact patterns mirror the runtime guardrails CustomRules + defaults (Anthropic / OpenAI / GitHub / AWS / Slack / private key + blocks / Telegram bot tokens). Adding a new vendor pattern to one + pipeline implies adding it to the other. + +The audit pipeline's byte cap (16 KiB per field, see +`AuditPayloadCapture.Cap*Bytes`) is intentionally larger than the +span cap (4 KiB — below the soft attribute-length limit most +observability backends apply). The two caps are independent: a single +event may be truncated on the span side and survive intact on the +audit side. The trailing marker shape is the same either way. + +See [Observability — Span content capture](../core-concepts/observability-tracing.md#span-content-capture) for the +span-side attribute keys and opt-in switches. 
+ ## Streams (FWS-9) `forge run` / `forge serve` use the OS streams as a stream-level diff --git a/forge-cli/runtime/runner.go b/forge-cli/runtime/runner.go index 6055c74..ff06489 100644 --- a/forge-cli/runtime/runner.go +++ b/forge-cli/runtime/runner.go @@ -834,6 +834,15 @@ func (r *Runner) Run(ctx context.Context) error { MaxIterations: 100, CharBudget: charBudget, FilesDir: filepath.Join(r.cfg.WorkDir, ".forge", "files"), + // Issue #130 — the same resolved TracingConfig + // already passed to NewTracerProvider drives Phase + // 3.5 span-content capture inside the executor + // loop. Disabled state (Enabled=false + + // CaptureContent=false) is the zero-value default, + // so missing this on an older config schema is + // equivalent to "metadata-only spans" — the + // posture this initiative preserves. + TracingConfig: tracingCfg, } if r.derivedCLIConfig != nil { execCfg.WorkflowPhases = r.derivedCLIConfig.WorkflowPhases diff --git a/forge-core/observability/attrs.go b/forge-core/observability/attrs.go index 1d122ee..1b0e419 100644 --- a/forge-core/observability/attrs.go +++ b/forge-core/observability/attrs.go @@ -88,9 +88,7 @@ const ( AttrForgeLoopIteration = "forge.loop.iteration" // AttrForgeToolName / AttrForgeToolError name the tool call - // instrumentation. Tool args / results are NOT recorded here — - // Phase 3 is metadata-only. A future "capture_content=true with - // PII redaction" phase will add args/result attribute keys. + // instrumentation. AttrForgeToolName = "forge.tool.name" AttrForgeToolError = "forge.tool.error" @@ -98,4 +96,30 @@ const ( // resolved to — "completed", "failed", "canceled". Set on the // agent.execute span just before End. AttrForgeTaskFinalState = "forge.task.final_state" + + // ─── Content-capture attributes (Phase 3.5 / issue #130) ───── + // + // These attributes are set only when TracingConfig.CaptureContent + // is true. 
The default posture remains metadata-only: an absent + // attribute is the signal that an operator did not opt in. Set + // values pass through PrepareSpanContent (redact-then-truncate) + // so the same scrub applies to both the OTel pipeline and (in the + // future) the audit payload-capture path. + + // AttrGenAIPrompt is the serialized inbound chat-messages array + // the agent sent to the LLM (JSON-encoded role+content pairs). + // Per the OTel GenAI semantic conventions. + AttrGenAIPrompt = "gen_ai.prompt" + + // AttrGenAICompletion is the model's response text on success. + // Per the OTel GenAI semantic conventions. + AttrGenAICompletion = "gen_ai.completion" + + // AttrForgeToolArgs is the raw arguments JSON the agent passed to + // a tool. Set on tool.<name> spans. + AttrForgeToolArgs = "forge.tool.args" + + // AttrForgeToolResult is the raw output the tool returned. Set on + // tool.<name> spans. + AttrForgeToolResult = "forge.tool.result" ) diff --git a/forge-core/runtime/content_redact.go b/forge-core/runtime/content_redact.go new file mode 100644 index 0000000..3fcb449 --- /dev/null +++ b/forge-core/runtime/content_redact.go @@ -0,0 +1,147 @@ +package runtime + +import ( + "encoding/json" + "regexp" + + "github.com/initializ/forge/forge-core/llm" +) + +// Span-attribute content capture (issue #130 / Phase 3.5). +// +// Phase 3 of the OTel Tracing v1 initiative (#108, PR #125) shipped +// span instrumentation across the executor loop and tool calls but +// kept it metadata-only — span attributes carried provider, model, +// usage tokens, finish reasons, but no prompt / completion / tool I/O +// text. Phase 2 (#103, PR #124) plumbed two operator-facing knobs +// (`capture_content`, `redact`) through the config schema but the +// runtime never read them. 
This file is the redact-and-cap pipeline +// that Phase 3 sites call into when `CaptureContent=true` so the same +// PII / secret scrub applies to both the OTel attribute path and (in the +// future) the audit payload-capture path. +// +// Pattern parity: RedactSecrets's regex list mirrors the runtime +// guardrails CustomRule defaults in forge-cli/runtime/guardrails_loader.go's +// DefaultStructuredGuardrails. The two should evolve together — when +// a new secret token shape is added to the guardrails list, add it +// here. The parity test in content_redact_parity_test.go inside +// forge-cli/runtime/ enforces this at CI time. +// +// Order matters: redact runs BEFORE truncate so a secret straddling +// the cap boundary can never survive as an unmatched partial token. +// +// The functions are designed to be called on hot paths +// (every LLM call, every tool call) so the regex set is pre-compiled +// at package init and the empty-input fast path skips the pattern +// loop entirely. + +// RedactionMarker is the placeholder substituted for any matched +// secret. Operators grepping audit logs and traces for "[REDACTED]" +// can correlate scrub events across both pipelines. +const RedactionMarker = "[REDACTED]" + +// DefaultSpanContentCapBytes is the per-attribute byte cap for span +// content. 4 KiB stays comfortably under common observability backend +// limits (Datadog caps attributes around 5 KiB; Tempo's default attr +// length limit is 4 KiB) so a long prompt doesn't get re-truncated by +// the backend with a different marker shape, breaking the +// correlate-by-marker grep flow. +const DefaultSpanContentCapBytes = 4 << 10 + +// redactPattern is a single regex applied to span / audit content +// before storage. Each entry's regex is pre-compiled at init. +type redactPattern struct { + name string + re *regexp.Regexp +} + +// redactPatterns covers token shapes operators have asked us to scrub +// from prompts / completions / tool I/O. 
The shapes are drawn from +// runtime-observed secrets in vendor SDKs — same list as the +// guardrails CustomRules defaults. See the package-doc note above on +// parity with forge-cli/runtime/guardrails_loader.go. +var redactPatterns = []redactPattern{ + {name: "anthropic_key", re: regexp.MustCompile(`sk-ant-[A-Za-z0-9\-]{20,}`)}, + {name: "openai_key", re: regexp.MustCompile(`sk-[A-Za-z0-9]{20,}`)}, + {name: "github_pat", re: regexp.MustCompile(`ghp_[A-Za-z0-9]{36}`)}, + {name: "github_oauth", re: regexp.MustCompile(`gho_[A-Za-z0-9]{36}`)}, + {name: "github_server", re: regexp.MustCompile(`ghs_[A-Za-z0-9]{36}`)}, + {name: "github_fine", re: regexp.MustCompile(`github_pat_[A-Za-z0-9_]{22,}`)}, + {name: "aws_access", re: regexp.MustCompile(`AKIA[0-9A-Z]{16}`)}, + {name: "slack_bot", re: regexp.MustCompile(`xoxb-[0-9]{10,}-[A-Za-z0-9-]+`)}, + {name: "slack_user", re: regexp.MustCompile(`xoxp-[0-9]{10,}-[A-Za-z0-9-]+`)}, + // Private-key block: anchored to both BEGIN and END markers so we + // scrub the entire payload at once. (?s) makes . match newlines. + {name: "private_key", re: regexp.MustCompile(`(?s)-----BEGIN (RSA|EC|OPENSSH|PRIVATE) [^-]*KEY-----.*?-----END (RSA|EC|OPENSSH|PRIVATE) [^-]*KEY-----`)}, + {name: "telegram_bot", re: regexp.MustCompile(`[0-9]{8,10}:[A-Za-z0-9_-]{35,}`)}, +} + +// RedactSecrets returns s with every known secret token shape replaced +// by RedactionMarker. Empty input is returned unchanged (fast path). +// +// Applied in pattern-list order; overlap is fine because +// ReplaceAllString rewrites the string left-to-right and subsequent +// patterns operate on the post-replacement output. A run that matches +// multiple shapes (e.g. an `sk-` prefix that also starts a longer +// vendor key) is scrubbed once — RedactionMarker doesn't satisfy any +// other pattern, so re-applying patterns is idempotent. 
+func RedactSecrets(s string) string { + if s == "" { + return s + } + for _, p := range redactPatterns { + s = p.re.ReplaceAllString(s, RedactionMarker) + } + return s +} + +// serializeChatMessages JSON-encodes the inbound chat messages list +// for use as the gen_ai.prompt span attribute (OTel GenAI semantic +// conventions). Returns the empty string for nil / empty input or on +// marshal failure — an empty return signals the caller to skip +// stamping the attribute, preserving the "absent attribute = no +// opt-in" contract. +// +// Lives next to PrepareSpanContent because both are pure +// content-shaping helpers for the span-capture pipeline; the audit +// pipeline uses the same input but emits it as native event fields, +// not a JSON blob. +func serializeChatMessages(messages []llm.ChatMessage) string { + if len(messages) == 0 { + return "" + } + b, err := json.Marshal(messages) + if err != nil { + return "" + } + return string(b) +} + +// PrepareSpanContent runs the redact (when redact=true) and +// byte-cap-with-truncation-marker pipeline for content destined for +// an OTel span attribute. The pipeline is: +// +// 1. Apply RedactSecrets when redact=true. +// 2. TruncateForAudit (the same byte-cap helper the audit path uses) +// so a runaway prompt can't blow past the backend attribute limit +// and silently drop the marker. +// +// maxBytes <= 0 falls back to DefaultSpanContentCapBytes. The +// truncation marker is identical to what AuditPayloadCapture writes, +// so an operator who sees a `…[truncated:N]` suffix on an audit +// payload-captured field sees the same suffix on the linked span +// attribute for the same logical event. +// +// Returns the empty string when s is empty (skipping the pipeline). 
+func PrepareSpanContent(s string, redact bool, maxBytes int) string { + if s == "" { + return s + } + if redact { + s = RedactSecrets(s) + } + if maxBytes <= 0 { + maxBytes = DefaultSpanContentCapBytes + } + return TruncateForAudit(s, maxBytes) +} diff --git a/forge-core/runtime/content_redact_test.go b/forge-core/runtime/content_redact_test.go new file mode 100644 index 0000000..ffe1138 --- /dev/null +++ b/forge-core/runtime/content_redact_test.go @@ -0,0 +1,141 @@ +package runtime + +import ( + "strings" + "testing" +) + +func TestRedactSecrets_KnownPatterns(t *testing.T) { + cases := []struct { + name string + input string + wantNot string // a substring that MUST NOT appear in the output + }{ + {"anthropic_key", "key=sk-ant-12345abcdef67890abcdefXYZ end", "sk-ant-12345abcdef67890abcdefXYZ"}, + {"openai_key", "auth: sk-1234567890abcdefghijABCDEF tail", "sk-1234567890abcdefghijABCDEF"}, + {"github_pat", "token ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa val", "ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}, + {"github_oauth", "auth gho_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb x", "gho_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"}, + {"github_server", "header ghs_ssssssssssssssssssssssssssssssssssss y", "ghs_ssssssssssssssssssssssssssssssssssss"}, + {"github_fine", "pat github_pat_aaaaaaaaaaaaaaaaaaaaaa1234 z", "github_pat_aaaaaaaaaaaaaaaaaaaaaa1234"}, + {"aws_access", "AKIAIOSFODNN7EXAMPLE production-leak", "AKIAIOSFODNN7EXAMPLE"}, + {"slack_bot", "xoxb-1234567890-abcdef-bot-token-here ok", "xoxb-1234567890-abcdef-bot-token-here"}, + {"slack_user", "xoxp-9876543210-abcdef-user-token-here !", "xoxp-9876543210-abcdef-user-token-here"}, + {"telegram_bot", "tg=123456789:AAEhBP9-Klm-this-is-a-very-long-tg-bot-token-here", "123456789:AAEhBP9-Klm-this-is-a-very-long-tg-bot-token-here"}, + { + "private_key", + "-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEAvDdt2g\n-----END RSA PRIVATE KEY-----", + "MIIEowIBAAKCAQEAvDdt2g", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t 
*testing.T) { + out := RedactSecrets(tc.input) + if strings.Contains(out, tc.wantNot) { + t.Errorf("secret leaked into redacted output\n input: %q\n output: %q", tc.input, out) + } + if !strings.Contains(out, RedactionMarker) { + t.Errorf("expected redaction marker %q in output, got %q", RedactionMarker, out) + } + }) + } +} + +func TestRedactSecrets_PreservesSurroundingText(t *testing.T) { + in := "Please use the key sk-ant-abcdefghij1234567890XYZ for testing" + out := RedactSecrets(in) + + if !strings.HasPrefix(out, "Please use the key ") { + t.Errorf("prefix lost; got %q", out) + } + if !strings.HasSuffix(out, " for testing") { + t.Errorf("suffix lost; got %q", out) + } + if strings.Contains(out, "sk-ant-abcdefghij1234567890XYZ") { + t.Errorf("secret survived redaction; got %q", out) + } +} + +func TestRedactSecrets_EmptyInput(t *testing.T) { + if RedactSecrets("") != "" { + t.Error("empty input must round-trip") + } +} + +func TestRedactSecrets_NoSecrets_NoOp(t *testing.T) { + in := "What is the weather in Paris?" + out := RedactSecrets(in) + if out != in { + t.Errorf("non-secret content was modified: %q -> %q", in, out) + } +} + +// TestPrepareSpanContent_RedactThenTruncate pins the ordering invariant: +// redact runs first, then byte-cap. If truncate ran first, a secret +// that straddled the cap boundary could survive in the truncated tail +// after the marker. The chosen order makes that impossible. +func TestPrepareSpanContent_RedactThenTruncate(t *testing.T) { + // Build input that ends with a secret near the cap boundary. 
+ prefix := strings.Repeat("x", DefaultSpanContentCapBytes-30) + secret := "AKIAIOSFODNN7EXAMPLE" + in := prefix + " " + secret + + out := PrepareSpanContent(in, true, DefaultSpanContentCapBytes) + if strings.Contains(out, secret) { + t.Errorf("secret survived the redact-then-truncate pipeline near the cap boundary: %q", out) + } +} + +// TestPrepareSpanContent_RedactFalse_KeepsRawContent confirms the +// enterprise raw-capture path leaves the content untouched up to the +// byte cap. The cap still fires. +func TestPrepareSpanContent_RedactFalse_KeepsRawContent(t *testing.T) { + in := "sk-ant-abcdefghij1234567890XYZ" + out := PrepareSpanContent(in, false, DefaultSpanContentCapBytes) + if out != in { + t.Errorf("redact=false must not scrub; got %q want %q", out, in) + } +} + +// TestPrepareSpanContent_EmptyContent_FastPath confirms empty input +// short-circuits to empty output, so callers using a non-empty return +// to gate attribute stamping see "no opt-in" semantics for empty +// content. +func TestPrepareSpanContent_EmptyContent_FastPath(t *testing.T) { + if got := PrepareSpanContent("", true, 100); got != "" { + t.Errorf("empty input must return empty; got %q", got) + } +} + +// TestPrepareSpanContent_MaxBytesZero_FallsBackToDefault checks the +// caller-friendly default fallback. Operators / tests passing 0 get +// the package default rather than "no cap" (which would defeat the +// backend-attr-limit motivation). +func TestPrepareSpanContent_MaxBytesZero_FallsBackToDefault(t *testing.T) { + // 5 KiB of content with 0 cap → truncated to the 4 KiB default. + in := strings.Repeat("a", 5<<10) + out := PrepareSpanContent(in, false, 0) + if len(out) > DefaultSpanContentCapBytes+64 { + t.Errorf("maxBytes=0 must default to DefaultSpanContentCapBytes; got len=%d", len(out)) + } +} + +// TestPrepareSpanContent_TruncationMarkerMatchesAuditPipeline is the +// cross-pipeline parity check the issue called out by name. 
The +// marker shape on span content MUST be byte-identical to what the +// audit payload-capture path produces for the same input, so an +// operator grepping for "[truncated:" across both sinks sees aligned +// output. +func TestPrepareSpanContent_TruncationMarkerMatchesAuditPipeline(t *testing.T) { + in := strings.Repeat("z", DefaultSpanContentCapBytes*2) + + spanOut := PrepareSpanContent(in, false, DefaultSpanContentCapBytes) + auditOut := TruncateForAudit(in, DefaultSpanContentCapBytes) + + if spanOut != auditOut { + t.Errorf("span and audit truncation outputs diverged for the same input:\n span: %q\n audit: %q", + spanOut, auditOut) + } + if !strings.Contains(spanOut, "[truncated:") { + t.Errorf("expected truncation marker in span output; got %q", spanOut) + } +} diff --git a/forge-core/runtime/loop.go b/forge-core/runtime/loop.go index 66d3f18..0679d21 100644 --- a/forge-core/runtime/loop.go +++ b/forge-core/runtime/loop.go @@ -56,6 +56,12 @@ type LLMExecutor struct { filesDir string // directory for file_create output sessionMaxAge time.Duration // max age for session recovery (0 = no limit) workflowPhases []string // workflow phases from skills (edit, finalize, query) + // tracingCfg governs Phase 3.5 span-attribute content capture + // (issue #130). Only CaptureContent + Redact are consumed here; + // the rest of the struct is honored by the cli runner's tracer + // setup. Zero value (CaptureContent=false) means metadata-only + // spans — the default posture. + tracingCfg observability.TracingConfig } // LLMExecutorConfig configures the LLM executor. 
@@ -74,6 +80,12 @@ type LLMExecutorConfig struct { FilesDir string // directory for file_create output (default: $TMPDIR/forge-files) SessionMaxAge time.Duration // max idle time before session recovery is skipped (0 = 30m default) WorkflowPhases []string // workflow phases from skills (edit, finalize, query) + // TracingConfig is the same observability.TracingConfig the cli + // runner resolves and passes to NewTracerProvider. The executor + // reads CaptureContent + Redact to decide whether to stamp + // prompt / completion / tool I/O content on Phase 3 spans + // (issue #130). Zero value disables content capture. + TracingConfig observability.TracingConfig } // NewLLMExecutor creates a new LLMExecutor with the given configuration. @@ -131,6 +143,7 @@ func NewLLMExecutor(cfg LLMExecutorConfig) *LLMExecutor { filesDir: cfg.FilesDir, sessionMaxAge: sessionMaxAge, workflowPhases: cfg.WorkflowPhases, + tracingCfg: cfg.TracingConfig, } } @@ -355,6 +368,19 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess attribute.String(observability.AttrGenAISystem, e.provider), attribute.String(observability.AttrGenAIRequestModel, e.modelName), ) + // Phase 3.5 (#130) — stamp the serialized request messages on + // the span when CaptureContent is enabled. Runs through the + // redact-then-truncate pipeline (PrepareSpanContent) so PII + // scrubbing is identical to what audit payload-capture will + // emit for the same event. 
+ if e.tracingCfg.CaptureContent { + if prompt := serializeChatMessages(req.Messages); prompt != "" { + llmSpan.SetAttributes(attribute.String( + observability.AttrGenAIPrompt, + PrepareSpanContent(prompt, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } + } resp, err := e.client.Chat(llmCtx, req) llmDuration := time.Since(llmStart) if err != nil { @@ -394,6 +420,15 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess if resp.FinishReason != "" { llmSpan.SetAttributes(attribute.StringSlice(observability.AttrGenAIResponseFinishReasons, []string{resp.FinishReason})) } + // Phase 3.5 (#130) — completion text stamped after success. + // Empty content (e.g. tool-call-only assistant turns) yields + // no attribute so an absent key remains the "no opt-in" signal. + if e.tracingCfg.CaptureContent && resp.Message.Content != "" { + llmSpan.SetAttributes(attribute.String( + observability.AttrGenAICompletion, + PrepareSpanContent(resp.Message.Content, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } llmSpan.End() // Fire AfterLLMCall hook @@ -622,12 +657,16 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess toolStart := time.Now() // Phase 3 (#104) — child span around the tool call. Span // name is "tool." so a flame graph groups tools - // by kind without a query. Tool args / results are NOT - // recorded as attributes here — Phase 3 is metadata-only; - // content capture lands when the FWS-8 redactor can be - // reused for spans. + // by kind without a query. Phase 3.5 (#130) added optional + // args/result content capture under CaptureContent + Redact. 
toolCtx, toolSpan := Tracer().Start(ctx, "tool."+tc.Function.Name) toolSpan.SetAttributes(attribute.String(observability.AttrForgeToolName, tc.Function.Name)) + if e.tracingCfg.CaptureContent && tc.Function.Arguments != "" { + toolSpan.SetAttributes(attribute.String( + observability.AttrForgeToolArgs, + PrepareSpanContent(tc.Function.Arguments, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } result, execErr := e.tools.Execute(toolCtx, tc.Function.Name, json.RawMessage(tc.Function.Arguments)) toolDuration := time.Since(toolStart) if execErr != nil { @@ -636,6 +675,17 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess toolSpan.SetAttributes(attribute.String(observability.AttrForgeToolError, execErr.Error())) result = fmt.Sprintf("Error executing tool %s: %s", tc.Function.Name, execErr.Error()) } + // Phase 3.5 (#130) — tool result content capture. The + // error-path `result` (set above) is the synthetic + // "Error executing tool" string the loop returns to the + // LLM; capturing it gives backends the same view the LLM + // will see on the next iteration. 
+ if e.tracingCfg.CaptureContent && result != "" { + toolSpan.SetAttributes(attribute.String( + observability.AttrForgeToolResult, + PrepareSpanContent(result, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } toolSpan.End() iterResults = append(iterResults, toolIterResult{ Name: tc.Function.Name, diff --git a/forge-core/runtime/loop_spans_content_test.go b/forge-core/runtime/loop_spans_content_test.go new file mode 100644 index 0000000..9643071 --- /dev/null +++ b/forge-core/runtime/loop_spans_content_test.go @@ -0,0 +1,318 @@ +package runtime + +import ( + "context" + "encoding/json" + "strings" + "testing" + + "github.com/initializ/forge/forge-core/a2a" + "github.com/initializ/forge/forge-core/llm" + "github.com/initializ/forge/forge-core/observability" + sdktrace "go.opentelemetry.io/otel/sdk/trace" +) + +// awsKeyFixture is a deliberately obvious secret shape used across the +// capture-content tests so the scrub-vs-raw assertions stay readable. +const awsKeyFixture = "AKIAIOSFODNN7EXAMPLE" + +// findAttr returns the string value of a span attribute by key, with +// `ok=true` only when the key was set on the span. Distinguishes +// "attribute absent" (the metadata-only signal) from "attribute set +// to the empty string" (which the executor should never produce). +func findAttr(span sdktrace.ReadOnlySpan, key string) (string, bool) { + for _, kv := range span.Attributes() { + if string(kv.Key) == key { + return kv.Value.AsString(), true + } + } + return "", false +} + +// runOnePromptOneCompletion exercises the executor with a single-turn +// task (no tool calls) so the test focuses on the LLM-span content +// attributes. Returns the recorded llm.completion span. 
+func runOnePromptOneCompletion(t *testing.T, tracingCfg observability.TracingConfig, prompt, completion string) sdktrace.ReadOnlySpan { + t.Helper() + + tp, rec := observability.NewTestTracerProvider() + SetTracerProvider(tp) + t.Cleanup(func() { + ResetTracerProviderForTest() + _ = tp.Shutdown(context.Background()) + }) + + client := &mockLLMClient{ + chatFunc: func(_ context.Context, _ *llm.ChatRequest) (*llm.ChatResponse, error) { + return &llm.ChatResponse{ + Message: llm.ChatMessage{Role: llm.RoleAssistant, Content: completion}, + Usage: llm.UsageInfo{InputTokens: 50, OutputTokens: 10}, + FinishReason: "stop", + }, nil + }, + } + + exec := NewLLMExecutor(LLMExecutorConfig{ + Client: client, + Tools: &mockToolExecutor{}, + MaxIterations: 3, + ModelName: "test-model", + Provider: "anthropic", + TracingConfig: tracingCfg, + }) + + task := &a2a.Task{ID: "task-content"} + msg := &a2a.Message{Role: a2a.MessageRoleUser, Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: prompt}}} + if _, err := exec.Execute(context.Background(), task, msg); err != nil { + t.Fatalf("Execute: %v", err) + } + + span, ok := rec.FindSpan("llm.completion") + if !ok { + t.Fatal("missing llm.completion span") + } + return span +} + +// TestExecute_CaptureContentTrue_StampsRedactedPromptOnLLMSpan — +// issue #130 acceptance case. Operator opts in (CaptureContent=true, +// Redact=true), sends a prompt containing an AWS access key shape. +// The gen_ai.prompt attribute must be present and the raw key must +// NOT appear in its value. 
+func TestExecute_CaptureContentTrue_StampsRedactedPromptOnLLMSpan(t *testing.T) { + cfg := observability.TracingConfig{CaptureContent: true, Redact: true} + span := runOnePromptOneCompletion(t, cfg, + "deploy with key "+awsKeyFixture+" and reboot", + "ok", + ) + + got, ok := findAttr(span, observability.AttrGenAIPrompt) + if !ok { + t.Fatal("gen_ai.prompt attribute missing — CaptureContent=true did not stamp") + } + if strings.Contains(got, awsKeyFixture) { + t.Errorf("raw AWS key survived redaction on gen_ai.prompt:\n%s", got) + } + if !strings.Contains(got, RedactionMarker) { + t.Errorf("expected redaction marker %q in gen_ai.prompt; got %q", RedactionMarker, got) + } +} + +// TestExecute_CaptureContentTrue_RedactFalse_StampsRawPromptOnLLMSpan +// is the enterprise raw-capture path — operator explicitly turned +// redaction off, accepting that span attributes will carry verbatim +// content. The cap is still applied but the secret is NOT scrubbed. +func TestExecute_CaptureContentTrue_RedactFalse_StampsRawPromptOnLLMSpan(t *testing.T) { + cfg := observability.TracingConfig{CaptureContent: true, Redact: false} + span := runOnePromptOneCompletion(t, cfg, + "key="+awsKeyFixture, + "done", + ) + + got, ok := findAttr(span, observability.AttrGenAIPrompt) + if !ok { + t.Fatal("gen_ai.prompt missing") + } + if !strings.Contains(got, awsKeyFixture) { + t.Errorf("Redact=false must preserve raw content; expected raw key in span attribute, got %q", got) + } + if strings.Contains(got, RedactionMarker) { + t.Errorf("Redact=false must not insert redaction marker; got %q", got) + } +} + +// TestExecute_CaptureContentFalse_NoContentAttribute pins the default +// posture. With no opt-in, the gen_ai.prompt / gen_ai.completion +// attributes are absent — not set to empty string. Backends that +// look for "is the key present?" must see "no" so the +// metadata-only contract holds. 
+func TestExecute_CaptureContentFalse_NoContentAttribute(t *testing.T) { + cfg := observability.TracingConfig{CaptureContent: false} + span := runOnePromptOneCompletion(t, cfg, + "any prompt with "+awsKeyFixture, + "any completion", + ) + + if _, ok := findAttr(span, observability.AttrGenAIPrompt); ok { + t.Errorf("CaptureContent=false must not set gen_ai.prompt") + } + if _, ok := findAttr(span, observability.AttrGenAICompletion); ok { + t.Errorf("CaptureContent=false must not set gen_ai.completion") + } +} + +// TestExecute_LargePrompt_TruncatesWithSameMarkerAsAudit verifies the +// cross-pipeline parity invariant — span content trimmed by the cap +// produces a marker byte-identical to what the audit +// payload-capture path produces for the same input. +func TestExecute_LargePrompt_TruncatesWithSameMarkerAsAudit(t *testing.T) { + // 8 KiB of recognizable filler — twice the default span cap. + bigPrompt := strings.Repeat("payload-", 1024) + + cfg := observability.TracingConfig{CaptureContent: true, Redact: false} + span := runOnePromptOneCompletion(t, cfg, bigPrompt, "ok") + + got, ok := findAttr(span, observability.AttrGenAIPrompt) + if !ok { + t.Fatal("gen_ai.prompt missing") + } + + // The executor serializes the messages into a JSON array, so the + // audited input we compare against is the same JSON form. We use + // the same helper the executor uses to produce that string, then + // apply the same cap, and assert byte equality with the span + // attribute's value. 
+ serialized := serializeChatMessages([]llm.ChatMessage{{Role: llm.RoleUser, Content: bigPrompt}}) + wantTruncated := TruncateForAudit(serialized, DefaultSpanContentCapBytes) + + if got != wantTruncated { + t.Errorf("span content truncation diverged from audit truncation for the same input\n span: %q\n audit: %q", + got, wantTruncated) + } + if !strings.Contains(got, "[truncated:") { + t.Errorf("expected truncation marker in gen_ai.prompt; got prefix %q…", got[:64]) + } +} + +// TestExecute_CaptureContentTrue_StampsCompletionOnLLMSpan covers the +// happy completion path — when CaptureContent=true, the model's +// response text appears on the llm.completion span (post-success, +// before End). +func TestExecute_CaptureContentTrue_StampsCompletionOnLLMSpan(t *testing.T) { + cfg := observability.TracingConfig{CaptureContent: true, Redact: false} + span := runOnePromptOneCompletion(t, cfg, "question", "the answer is 42") + + got, ok := findAttr(span, observability.AttrGenAICompletion) + if !ok { + t.Fatal("gen_ai.completion missing") + } + if got != "the answer is 42" { + t.Errorf("gen_ai.completion = %q; want %q", got, "the answer is 42") + } +} + +// TestExecute_CaptureContentTrue_EmptyCompletion_SkipsAttribute +// matches the empty-content fast path in PrepareSpanContent and the +// `resp.Message.Content != ""` guard in loop.go — an assistant turn +// that returns no text (e.g. a tool-call-only response) should NOT +// stamp an empty completion attribute. Empty completion = no +// completion = no attribute. 
+func TestExecute_CaptureContentTrue_EmptyCompletion_SkipsAttribute(t *testing.T) { + cfg := observability.TracingConfig{CaptureContent: true, Redact: false} + span := runOnePromptOneCompletion(t, cfg, "question", "") + + if _, ok := findAttr(span, observability.AttrGenAICompletion); ok { + t.Errorf("empty completion must not stamp gen_ai.completion attribute") + } +} + +// runOneToolCall exercises the executor with one tool-call iteration +// so the test focuses on the tool. span attributes. Returns the +// recorded tool span. +func runOneToolCall(t *testing.T, tracingCfg observability.TracingConfig, toolArgs, toolResult string) sdktrace.ReadOnlySpan { + t.Helper() + + tp, rec := observability.NewTestTracerProvider() + SetTracerProvider(tp) + t.Cleanup(func() { + ResetTracerProviderForTest() + _ = tp.Shutdown(context.Background()) + }) + + turn := 0 + client := &mockLLMClient{ + chatFunc: func(_ context.Context, _ *llm.ChatRequest) (*llm.ChatResponse, error) { + turn++ + if turn == 1 { + return &llm.ChatResponse{ + Message: llm.ChatMessage{ + Role: llm.RoleAssistant, + ToolCalls: []llm.ToolCall{{ + ID: "tc-1", + Type: "function", + Function: llm.FunctionCall{Name: "echo", Arguments: toolArgs}, + }}, + }, + FinishReason: "tool_calls", + }, nil + } + return &llm.ChatResponse{ + Message: llm.ChatMessage{Role: llm.RoleAssistant, Content: "done"}, + FinishReason: "stop", + }, nil + }, + } + tools := &mockToolExecutor{ + executeFunc: func(_ context.Context, _ string, _ json.RawMessage) (string, error) { + return toolResult, nil + }, + } + + exec := NewLLMExecutor(LLMExecutorConfig{ + Client: client, + Tools: tools, + MaxIterations: 3, + ModelName: "test-model", + Provider: "anthropic", + TracingConfig: tracingCfg, + }) + if _, err := exec.Execute(context.Background(), + &a2a.Task{ID: "task-tool-content"}, + &a2a.Message{Role: a2a.MessageRoleUser, Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: "go"}}}); err != nil { + t.Fatalf("Execute: %v", err) + } + + span, ok := 
rec.FindSpan("tool.echo") + if !ok { + t.Fatal("missing tool.echo span") + } + return span +} + +// TestExecute_CaptureContentTrue_StampsToolArgsAndResult exercises the +// tool-side mirror of the LLM-span content tests. Args from the +// LLM-emitted tool call land at forge.tool.args; tool stdout/return +// lands at forge.tool.result; both go through the same redact + +// truncate pipeline. +func TestExecute_CaptureContentTrue_StampsToolArgsAndResult(t *testing.T) { + cfg := observability.TracingConfig{CaptureContent: true, Redact: true} + span := runOneToolCall(t, cfg, `{"target":"`+awsKeyFixture+`"}`, "deleted "+awsKeyFixture) + + args, ok := findAttr(span, observability.AttrForgeToolArgs) + if !ok { + t.Error("forge.tool.args missing when CaptureContent=true") + } else { + if strings.Contains(args, awsKeyFixture) { + t.Errorf("raw AWS key survived redaction on forge.tool.args: %q", args) + } + if !strings.Contains(args, RedactionMarker) { + t.Errorf("expected redaction marker in forge.tool.args; got %q", args) + } + } + + result, ok := findAttr(span, observability.AttrForgeToolResult) + if !ok { + t.Error("forge.tool.result missing when CaptureContent=true") + } else { + if strings.Contains(result, awsKeyFixture) { + t.Errorf("raw AWS key survived redaction on forge.tool.result: %q", result) + } + if !strings.Contains(result, RedactionMarker) { + t.Errorf("expected redaction marker in forge.tool.result; got %q", result) + } + } +} + +// TestExecute_CaptureContentFalse_NoToolContentAttributes mirrors the +// default-posture check on the tool-call side. 
+func TestExecute_CaptureContentFalse_NoToolContentAttributes(t *testing.T) { + cfg := observability.TracingConfig{CaptureContent: false} + span := runOneToolCall(t, cfg, `{"target":"foo"}`, "ok") + + if _, ok := findAttr(span, observability.AttrForgeToolArgs); ok { + t.Errorf("CaptureContent=false must not set forge.tool.args") + } + if _, ok := findAttr(span, observability.AttrForgeToolResult); ok { + t.Errorf("CaptureContent=false must not set forge.tool.result") + } +} From 03724d2da8279032d8838af01717000f5f8a18a9 Mon Sep 17 00:00:00 2001 From: MK Date: Thu, 11 Jun 2026 14:47:05 -0400 Subject: [PATCH 2/2] feat(otel): use current GenAI semconv keys (input.messages / output.messages) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OTel GenAI semantic conventions moved the prompt + completion attributes from flat-string (gen_ai.prompt, gen_ai.completion) to structured (gen_ai.input.messages, gen_ai.output.messages) — arrays of role+content message objects. For a feature landing in v0.15.0 we should ship the current keys, not the deprecated ones. Changes 1. attrs.go — AttrGenAIPrompt → AttrGenAIInputMessages (value: gen_ai.input.messages); AttrGenAICompletion → AttrGenAIOutputMessages (value: gen_ai.output.messages). Doc comments call out the supersedence. 2. loop.go — completion attribute now stamps a single-element [{role,content}] array (via the existing serializeChatMessages helper) instead of the raw response string, matching the structured-shape contract the new key implies. The prompt path already emitted a message array — only the key name changed. 3. Tests — TestExecute_CaptureContentTrue_StampsCompletionOnLLMSpan now asserts the value is JSON-parseable as []llm.ChatMessage{{Role: assistant, Content: …}} instead of the bare response string. Other tests still pass unchanged because their assertions look for substring presence (the secret in redact tests, the truncation marker, etc.) 
and the JSON wrapper doesn't affect those. 4. Docs — observability-tracing.md attribute table updated with the new keys and a note about backends that only recognize the deprecated flat-string attributes (operators should upgrade the backend's semconv mapping or use a span processor to translate). .claude/skills/forge.md § 12.9 updated with the same note. Verification - gofmt + golangci-lint clean - forge-core/runtime + forge-core/observability test suites pass - the 8 integration tests still cover the same four logical sites (LLM prompt, LLM completion, tool args, tool result) under the new key names --- .claude/skills/forge.md | 7 ++- docs/core-concepts/observability-tracing.md | 6 +- forge-core/observability/attrs.go | 20 ++++--- forge-core/runtime/loop.go | 28 +++++---- forge-core/runtime/loop_spans_content_test.go | 57 ++++++++++++------- 5 files changed, 73 insertions(+), 45 deletions(-) diff --git a/.claude/skills/forge.md b/.claude/skills/forge.md index 636270e..0eb9ef3 100644 --- a/.claude/skills/forge.md +++ b/.claude/skills/forge.md @@ -663,8 +663,11 @@ Forge-specific attributes use the `forge.*` namespace **Default posture is metadata-only.** Prompts, completions, tool args, and tool results are NOT stamped on spans unless `observability.tracing.capture_content: true` is set (Phase 3.5 / -#130). When opted-in: `llm.completion` gains `gen_ai.prompt` (JSON- -serialized inbound messages) + `gen_ai.completion` (response text); +#130). When opted-in: `llm.completion` gains `gen_ai.input.messages` +(JSON array of role+content sent to the model) + +`gen_ai.output.messages` (JSON single-element array for the response, +current OTel GenAI semconv; supersedes the deprecated flat-string +`gen_ai.prompt` / `gen_ai.completion`); `tool.<name>` gains `forge.tool.args` + `forge.tool.result`.
Captured values pass through a redactor (vendor secret-token shapes: Anthropic / OpenAI / GitHub / AWS / Slack / private keys / Telegram) diff --git a/docs/core-concepts/observability-tracing.md b/docs/core-concepts/observability-tracing.md index 35c39f5..b341ec6 100644 --- a/docs/core-concepts/observability-tracing.md +++ b/docs/core-concepts/observability-tracing.md @@ -159,7 +159,7 @@ Prompts, completions, tool args, and tool results are **off by default** — Pha | `forge.yaml` knob | Span | Attribute keys added when `capture_content: true` | |---|---|---| | (always) | `llm.completion` | `gen_ai.system`, `gen_ai.request.model`, `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, `gen_ai.response.finish_reasons` | -| `capture_content: true` | `llm.completion` | `gen_ai.prompt` (JSON-serialized inbound messages), `gen_ai.completion` (response text) | +| `capture_content: true` | `llm.completion` | `gen_ai.input.messages` (JSON array of role+content pairs sent to the model), `gen_ai.output.messages` (JSON single-element array of role+content for the model's response) — current OTel GenAI semconv, supersedes the deprecated flat-string `gen_ai.prompt` / `gen_ai.completion` | | (always) | `tool.<name>` | `forge.tool.name`, `forge.tool.error` (on failure) | | `capture_content: true` | `tool.<name>` | `forge.tool.args` (raw arguments JSON), `forge.tool.result` (raw output) | @@ -167,7 +167,9 @@ When `capture_content: true` and `redact: true` (the default when capture is on) Every captured value is byte-capped at **4 KiB** (below the 5 KiB attribute soft-cap most backends apply). When the input exceeds the cap, the value ends with a `…[truncated:N]` marker where `N` is the original byte length. The marker is **byte-identical** to what the audit payload-capture path emits for the same input, so an operator grepping `[truncated:` across span attributes and audit rows sees aligned output.
-**Default posture** (no opt-in): the `gen_ai.prompt`, `gen_ai.completion`, `forge.tool.args`, `forge.tool.result` keys are **absent** from spans — not set to empty string. Backends that gate dashboards on "is this key present?" can distinguish "metadata-only by default" from "operator opted in but the field happened to be empty." +**Default posture** (no opt-in): the `gen_ai.input.messages`, `gen_ai.output.messages`, `forge.tool.args`, `forge.tool.result` keys are **absent** from spans — not set to empty string. Backends that gate dashboards on "is this key present?" can distinguish "metadata-only by default" from "operator opted in but the field happened to be empty." + +**OTel semconv versioning note**: the GenAI semantic conventions moved from flat-string (`gen_ai.prompt`, `gen_ai.completion`) to structured (`gen_ai.input.messages`, `gen_ai.output.messages`) attributes. Forge emits only the **current** structured keys. Backends that only recognize the deprecated flat-string attributes will not show prompt / completion text on Forge spans — upgrade the backend's semconv mapping or use a span processor to translate. ## End-to-end propagation (Phase 5) diff --git a/forge-core/observability/attrs.go b/forge-core/observability/attrs.go index 1b0e419..0649466 100644 --- a/forge-core/observability/attrs.go +++ b/forge-core/observability/attrs.go @@ -106,14 +106,18 @@ const ( // so the same scrub passes both the OTel pipeline and (in the // future) the audit payload-capture path. - // AttrGenAIPrompt is the serialized inbound chat-messages array - // the agent sent to the LLM (JSON-encoded role+content pairs). - // Per the OTel GenAI semantic conventions. - AttrGenAIPrompt = "gen_ai.prompt" - - // AttrGenAICompletion is the model's response text on success. - // Per the OTel GenAI semantic conventions. 
- AttrGenAICompletion = "gen_ai.completion" + // AttrGenAIInputMessages is the structured inbound message array + // the agent sent to the LLM — a JSON array of role+content pairs. + // Per OTel GenAI semantic conventions (current). Supersedes the + // deprecated `gen_ai.prompt` flat-string attribute. + AttrGenAIInputMessages = "gen_ai.input.messages" + + // AttrGenAIOutputMessages is the structured response array from + // the model — a JSON array of role+content pairs (single element + // for a non-streaming, single-choice completion). Per OTel GenAI + // semantic conventions (current). Supersedes the deprecated + // `gen_ai.completion` flat-string attribute. + AttrGenAIOutputMessages = "gen_ai.output.messages" // AttrForgeToolArgs is the raw arguments JSON the agent passed to // a tool. Set on tool. spans. diff --git a/forge-core/runtime/loop.go b/forge-core/runtime/loop.go index 0679d21..aa75369 100644 --- a/forge-core/runtime/loop.go +++ b/forge-core/runtime/loop.go @@ -368,15 +368,17 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess attribute.String(observability.AttrGenAISystem, e.provider), attribute.String(observability.AttrGenAIRequestModel, e.modelName), ) - // Phase 3.5 (#130) — stamp the serialized request messages on - // the span when CaptureContent is enabled. Runs through the + // Phase 3.5 (#130) — stamp the structured input messages on the + // span when CaptureContent is enabled. Runs through the // redact-then-truncate pipeline (PrepareSpanContent) so PII // scrubbing is identical to what audit payload-capture will - // emit for the same event. + // emit for the same event. The attribute key matches the + // current OTel GenAI semconv (gen_ai.input.messages) which + // supersedes the deprecated flat-string `gen_ai.prompt`. 
if e.tracingCfg.CaptureContent { if prompt := serializeChatMessages(req.Messages); prompt != "" { llmSpan.SetAttributes(attribute.String( - observability.AttrGenAIPrompt, + observability.AttrGenAIInputMessages, PrepareSpanContent(prompt, e.tracingCfg.Redact, DefaultSpanContentCapBytes), )) } @@ -420,14 +422,18 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess if resp.FinishReason != "" { llmSpan.SetAttributes(attribute.StringSlice(observability.AttrGenAIResponseFinishReasons, []string{resp.FinishReason})) } - // Phase 3.5 (#130) — completion text stamped after success. - // Empty content (e.g. tool-call-only assistant turns) yields - // no attribute so an absent key remains the "no opt-in" signal. + // Phase 3.5 (#130) — completion stamped after success as a + // structured single-element messages array, matching the + // current OTel GenAI semconv (gen_ai.output.messages). Empty + // content (e.g. tool-call-only assistant turns) yields no + // attribute so an absent key remains the "no opt-in" signal. if e.tracingCfg.CaptureContent && resp.Message.Content != "" { - llmSpan.SetAttributes(attribute.String( - observability.AttrGenAICompletion, - PrepareSpanContent(resp.Message.Content, e.tracingCfg.Redact, DefaultSpanContentCapBytes), - )) + if out := serializeChatMessages([]llm.ChatMessage{resp.Message}); out != "" { + llmSpan.SetAttributes(attribute.String( + observability.AttrGenAIOutputMessages, + PrepareSpanContent(out, e.tracingCfg.Redact, DefaultSpanContentCapBytes), + )) + } } llmSpan.End() diff --git a/forge-core/runtime/loop_spans_content_test.go b/forge-core/runtime/loop_spans_content_test.go index 9643071..24cd7ab 100644 --- a/forge-core/runtime/loop_spans_content_test.go +++ b/forge-core/runtime/loop_spans_content_test.go @@ -77,7 +77,7 @@ func runOnePromptOneCompletion(t *testing.T, tracingCfg observability.TracingCon // TestExecute_CaptureContentTrue_StampsRedactedPromptOnLLMSpan — // issue #130 acceptance case. 
Operator opts in (CaptureContent=true, // Redact=true), sends a prompt containing an AWS access key shape. -// The gen_ai.prompt attribute must be present and the raw key must +// The gen_ai.input.messages attribute must be present and the raw key must // NOT appear in its value. func TestExecute_CaptureContentTrue_StampsRedactedPromptOnLLMSpan(t *testing.T) { cfg := observability.TracingConfig{CaptureContent: true, Redact: true} @@ -86,15 +86,15 @@ func TestExecute_CaptureContentTrue_StampsRedactedPromptOnLLMSpan(t *testing.T) "ok", ) - got, ok := findAttr(span, observability.AttrGenAIPrompt) + got, ok := findAttr(span, observability.AttrGenAIInputMessages) if !ok { - t.Fatal("gen_ai.prompt attribute missing — CaptureContent=true did not stamp") + t.Fatal("gen_ai.input.messages attribute missing — CaptureContent=true did not stamp") } if strings.Contains(got, awsKeyFixture) { - t.Errorf("raw AWS key survived redaction on gen_ai.prompt:\n%s", got) + t.Errorf("raw AWS key survived redaction on gen_ai.input.messages:\n%s", got) } if !strings.Contains(got, RedactionMarker) { - t.Errorf("expected redaction marker %q in gen_ai.prompt; got %q", RedactionMarker, got) + t.Errorf("expected redaction marker %q in gen_ai.input.messages; got %q", RedactionMarker, got) } } @@ -109,9 +109,9 @@ func TestExecute_CaptureContentTrue_RedactFalse_StampsRawPromptOnLLMSpan(t *test "done", ) - got, ok := findAttr(span, observability.AttrGenAIPrompt) + got, ok := findAttr(span, observability.AttrGenAIInputMessages) if !ok { - t.Fatal("gen_ai.prompt missing") + t.Fatal("gen_ai.input.messages missing") } if !strings.Contains(got, awsKeyFixture) { t.Errorf("Redact=false must preserve raw content; expected raw key in span attribute, got %q", got) @@ -122,7 +122,7 @@ func TestExecute_CaptureContentTrue_RedactFalse_StampsRawPromptOnLLMSpan(t *test } // TestExecute_CaptureContentFalse_NoContentAttribute pins the default -// posture. 
With no opt-in, the gen_ai.prompt / gen_ai.completion +// posture. With no opt-in, the gen_ai.input.messages / gen_ai.output.messages // attributes are absent — not set to empty string. Backends that // look for "is the key present?" must see "no" so the // metadata-only contract holds. @@ -133,11 +133,11 @@ func TestExecute_CaptureContentFalse_NoContentAttribute(t *testing.T) { "any completion", ) - if _, ok := findAttr(span, observability.AttrGenAIPrompt); ok { - t.Errorf("CaptureContent=false must not set gen_ai.prompt") + if _, ok := findAttr(span, observability.AttrGenAIInputMessages); ok { + t.Errorf("CaptureContent=false must not set gen_ai.input.messages") } - if _, ok := findAttr(span, observability.AttrGenAICompletion); ok { - t.Errorf("CaptureContent=false must not set gen_ai.completion") + if _, ok := findAttr(span, observability.AttrGenAIOutputMessages); ok { + t.Errorf("CaptureContent=false must not set gen_ai.output.messages") } } @@ -152,9 +152,9 @@ func TestExecute_LargePrompt_TruncatesWithSameMarkerAsAudit(t *testing.T) { cfg := observability.TracingConfig{CaptureContent: true, Redact: false} span := runOnePromptOneCompletion(t, cfg, bigPrompt, "ok") - got, ok := findAttr(span, observability.AttrGenAIPrompt) + got, ok := findAttr(span, observability.AttrGenAIInputMessages) if !ok { - t.Fatal("gen_ai.prompt missing") + t.Fatal("gen_ai.input.messages missing") } // The executor serializes the messages into a JSON array, so the @@ -170,24 +170,37 @@ func TestExecute_LargePrompt_TruncatesWithSameMarkerAsAudit(t *testing.T) { got, wantTruncated) } if !strings.Contains(got, "[truncated:") { - t.Errorf("expected truncation marker in gen_ai.prompt; got prefix %q…", got[:64]) + t.Errorf("expected truncation marker in gen_ai.input.messages; got prefix %q…", got[:64]) } } // TestExecute_CaptureContentTrue_StampsCompletionOnLLMSpan covers the // happy completion path — when CaptureContent=true, the model's // response text appears on the llm.completion span 
(post-success, -// before End). +// before End) as a structured single-element messages array +// matching the current OTel GenAI semconv. func TestExecute_CaptureContentTrue_StampsCompletionOnLLMSpan(t *testing.T) { cfg := observability.TracingConfig{CaptureContent: true, Redact: false} span := runOnePromptOneCompletion(t, cfg, "question", "the answer is 42") - got, ok := findAttr(span, observability.AttrGenAICompletion) + got, ok := findAttr(span, observability.AttrGenAIOutputMessages) if !ok { - t.Fatal("gen_ai.completion missing") + t.Fatal("gen_ai.output.messages missing") } - if got != "the answer is 42" { - t.Errorf("gen_ai.completion = %q; want %q", got, "the answer is 42") + + // Must be JSON: a single-element [{role,content}] array. + var msgs []llm.ChatMessage + if err := json.Unmarshal([]byte(got), &msgs); err != nil { + t.Fatalf("gen_ai.output.messages is not a JSON message array: %v\nvalue: %s", err, got) + } + if len(msgs) != 1 { + t.Fatalf("expected exactly 1 output message, got %d: %v", len(msgs), msgs) + } + if msgs[0].Role != llm.RoleAssistant { + t.Errorf("output message role = %q; want %q", msgs[0].Role, llm.RoleAssistant) + } + if msgs[0].Content != "the answer is 42" { + t.Errorf("output message content = %q; want %q", msgs[0].Content, "the answer is 42") } } @@ -201,8 +214,8 @@ func TestExecute_CaptureContentTrue_EmptyCompletion_SkipsAttribute(t *testing.T) cfg := observability.TracingConfig{CaptureContent: true, Redact: false} span := runOnePromptOneCompletion(t, cfg, "question", "") - if _, ok := findAttr(span, observability.AttrGenAICompletion); ok { - t.Errorf("empty completion must not stamp gen_ai.completion attribute") + if _, ok := findAttr(span, observability.AttrGenAIOutputMessages); ok { + t.Errorf("empty completion must not stamp gen_ai.output.messages attribute") } }