diff --git a/crates/buzz-agent/src/agent.rs b/crates/buzz-agent/src/agent.rs index bee51a625..a955e959b 100644 --- a/crates/buzz-agent/src/agent.rs +++ b/crates/buzz-agent/src/agent.rs @@ -133,13 +133,21 @@ impl RunCtx<'_> { // exactly the history that was just sent to `complete()` (the // assistant response is appended below, after this point). Pairing // them lets the gate add a conservative estimate for any history - // appended before the next request. Preserve both when a response - // omits usage (`None`) rather than clobbering — a one-off missing - // field shouldn't blind the gate or zero the growth baseline. + // appended before the next request. Uses `context_pressure_bytes` + // (the same measure the gate's `current_bytes` uses) so the + // `grown` delta is coherent — an image contributes its visual- + // token equivalent here, not its base64 length. Preserve both when + // a response omits usage (`None`) rather than clobbering — a + // one-off missing field shouldn't blind the gate or zero the + // growth baseline. if let Some(tokens) = response.input_tokens { *self.last_request_input_tokens = Some(tokens); - *self.last_request_history_bytes = - Some(self.history.iter().map(HistoryItem::estimated_bytes).sum()); + *self.last_request_history_bytes = Some( + self.history + .iter() + .map(HistoryItem::context_pressure_bytes) + .sum(), + ); } if !response.text.is_empty() { diff --git a/crates/buzz-agent/src/handoff.rs b/crates/buzz-agent/src/handoff.rs index 65b733538..d4e396f7d 100644 --- a/crates/buzz-agent/src/handoff.rs +++ b/crates/buzz-agent/src/handoff.rs @@ -100,8 +100,11 @@ impl RunCtx<'_> { // token estimate of the bytes added since the measurement. 
Some(measured_tokens) => { let measured_bytes = self.last_request_history_bytes.unwrap_or(0); - let current_bytes: usize = - self.history.iter().map(HistoryItem::estimated_bytes).sum(); + let current_bytes: usize = self + .history + .iter() + .map(HistoryItem::context_pressure_bytes) + .sum(); let grown = current_bytes.saturating_sub(measured_bytes); let projected = measured_tokens.saturating_add(estimate_tokens_from_bytes(grown)); projected @@ -119,7 +122,11 @@ impl RunCtx<'_> { // since a handoff re-adds the current prompt verbatim — that is a // prompt-cap concern (MAX_PROMPT_BYTES), not this gate. None => { - let bytes: usize = self.history.iter().map(HistoryItem::estimated_bytes).sum(); + let bytes: usize = self + .history + .iter() + .map(HistoryItem::context_pressure_bytes) + .sum(); bytes > byte_fallback_threshold( self.cfg.max_context_tokens, diff --git a/crates/buzz-agent/src/types.rs b/crates/buzz-agent/src/types.rs index a0e77f83e..0b90f9b51 100644 --- a/crates/buzz-agent/src/types.rs +++ b/crates/buzz-agent/src/types.rs @@ -1,6 +1,18 @@ use serde::Deserialize; use serde_json::Value; +/// Byte-equivalent charged to the handoff/context-pressure gate for a single +/// image tool result. The gate maps bytes to tokens at 1 byte/token (see +/// `handoff::CONSERVATIVE_BYTES_PER_TOKEN`), so this is also the per-image +/// token budget. Providers bill an image as visual *tiles*, not its base64 +/// length: Anthropic caps at ~1600 tokens/image and OpenAI high-detail lands +/// ~1.1K–1.5K. We charge 16 KiB — a generous ceiling that still over-counts +/// the real ~2K cost, while being ~190× smaller than the base64 length of a +/// typical multi-MiB screenshot. Charging `data.len()` to the gate instead +/// made a single `view_image` (~3.1M base64 bytes) trip the handoff gate on a +/// fresh context. 
+const IMAGE_CONTEXT_TOKEN_EQUIV: usize = 16 * 1024; + #[derive(Debug, Clone)] pub enum ToolResultContent { Text(String), @@ -8,16 +20,32 @@ pub enum ToolResultContent { } impl ToolResultContent { + /// Real serialized size in bytes. Used by `truncate_history` to keep the + /// outgoing request body under `max_history_bytes` — an image rides the + /// wire as its full base64 string, so that string's length is what counts + /// here. For context-window/handoff pressure use + /// [`Self::context_pressure_bytes`] instead, which charges an image its + /// (far smaller) visual-token equivalent. pub fn estimated_bytes(&self) -> usize { match self { Self::Text(s) => s.len(), - // This is request-size pressure accounting, not a visual-token - // estimate. Count the base64 bytes we will actually serialize so - // image-heavy sessions cannot silently exceed provider/body caps. Self::Image { data, mime_type } => data.len() + mime_type.len(), } } + /// Token-equivalent context-window pressure, in bytes (the handoff gate + /// maps bytes→tokens at 1:1). Identical to [`Self::estimated_bytes`] for + /// text, but an image is charged a flat [`IMAGE_CONTEXT_TOKEN_EQUIV`] + /// budget rather than its base64 length — providers bill it as visual + /// tiles (~2K tokens), so counting `data.len()` over-counts by ~1500× and + /// forces a handoff on a single image. + pub fn context_pressure_bytes(&self) -> usize { + match self { + Self::Text(s) => s.len(), + Self::Image { data: _, mime_type } => IMAGE_CONTEXT_TOKEN_EQUIV + mime_type.len(), + } + } + pub fn as_text_lossy(&self) -> String { match self { Self::Text(s) => s.clone(), @@ -40,6 +68,19 @@ pub enum HistoryItem { impl HistoryItem { pub fn estimated_bytes(&self) -> usize { + self.size_with(ToolResultContent::estimated_bytes) + } + + /// Token-equivalent context-window pressure, in bytes. 
Mirrors + /// [`Self::estimated_bytes`] but charges image tool results their visual- + /// token equivalent rather than their base64 length — see + /// [`ToolResultContent::context_pressure_bytes`]. The handoff gate uses + /// this; `truncate_history` (request-body sizing) uses `estimated_bytes`. + pub fn context_pressure_bytes(&self) -> usize { + self.size_with(ToolResultContent::context_pressure_bytes) + } + + fn size_with(&self, content_size: fn(&ToolResultContent) -> usize) -> usize { match self { Self::User(s) => s.len(), Self::Assistant { text, tool_calls } => { @@ -56,11 +97,7 @@ impl HistoryItem { .sum::<usize>() } Self::ToolResult(r) => { - r.provider_id.len() - + r.content - .iter() - .map(ToolResultContent::estimated_bytes) - .sum::<usize>() + r.provider_id.len() + r.content.iter().map(content_size).sum::<usize>() } } } @@ -219,3 +256,82 @@ pub fn clamp(mut s: String, max: usize) -> String { } s } + +#[cfg(test)] +mod tests { + use super::*; + + fn image_item(base64_len: usize) -> HistoryItem { + HistoryItem::ToolResult(ToolResult { + provider_id: "call_1".into(), + content: vec![ToolResultContent::Image { + data: "A".repeat(base64_len), + mime_type: "image/png".into(), + }], + is_error: false, + }) + } + + #[test] + fn image_estimated_bytes_is_real_wire_size() { + // `truncate_history` relies on this to keep the request body under + // `max_history_bytes`, so an image must report its full base64 length. + let img = ToolResultContent::Image { + data: "A".repeat(3_000_000), + mime_type: "image/png".into(), + }; + assert_eq!(img.estimated_bytes(), 3_000_000 + "image/png".len()); + } + + #[test] + fn image_context_pressure_is_token_equivalent_not_base64_len() { + // The handoff gate must charge an image its visual-token equivalent, + // not its base64 length — otherwise one screenshot trips the gate.
+ let img = ToolResultContent::Image { + data: "A".repeat(3_000_000), + mime_type: "image/png".into(), + }; + assert_eq!( + img.context_pressure_bytes(), + IMAGE_CONTEXT_TOKEN_EQUIV + "image/png".len() + ); + // And it must be independent of the (huge) base64 payload length. + let bigger = ToolResultContent::Image { + data: "A".repeat(10_000_000), + mime_type: "image/png".into(), + }; + assert_eq!( + img.context_pressure_bytes(), + bigger.context_pressure_bytes() + ); + } + + #[test] + fn single_image_does_not_trip_default_handoff_threshold() { + // Regression: a single ~3.1M-base64-byte `view_image` result on an + // otherwise-empty history must NOT exceed the default pre-usage + // handoff cap. The gate's byte-fallback threshold with the shipped + // defaults (max_context_tokens=200_000, max_output_tokens=32_768) is + // min(200_000*9/10, 200_000-32_768) = 167_232 "bytes". Before the fix + // this item counted ~3.1M and tripped instantly. + let item = image_item(3_118_884); + const DEFAULT_PRE_USAGE_THRESHOLD: usize = 167_232; + assert!( + item.context_pressure_bytes() <= DEFAULT_PRE_USAGE_THRESHOLD, + "one image charged {} bytes of context pressure, over the {} threshold", + item.context_pressure_bytes(), + DEFAULT_PRE_USAGE_THRESHOLD + ); + // The real wire size, by contrast, is still the full base64 payload. + assert!(item.estimated_bytes() >= 3_118_884); + } + + #[test] + fn text_content_size_is_identical_for_both_measures() { + // Only images diverge; text must size the same under both paths. + let text = ToolResultContent::Text("hello world".into()); + assert_eq!(text.estimated_bytes(), text.context_pressure_bytes()); + let item = HistoryItem::User("a user message".into()); + assert_eq!(item.estimated_bytes(), item.context_pressure_bytes()); + } +}