From 129b8c3ec11afc0055bc272b7f79dd907302343d Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Tue, 10 Mar 2026 23:57:33 +0530 Subject: [PATCH 1/9] fix(chat-agent): improve context handling --- agents/analyze_agent_v2/agent.py | 6 +- .../architecture_endpoints_skill/SKILL.md | 20 -- .../references/architecture_and_endpoints.md | 101 ------ .../references/calling_flow.md | 11 - .../references/contact_center_flow.md | 24 -- .../skills/mobius_error_id_skill/SKILL.md | 20 -- .../references/mobius_error_ids.md | 325 ------------------ .../skills/sip_flow_skill/SKILL.md | 19 - .../sip_flow_skill/references/sip_flows.md | 250 -------------- agents/chat_agent/agent.py | 159 +++++++-- agents/query_router/agent.py | 1 + agents/search_agent_v2/agent.py | 10 + 12 files changed, 148 insertions(+), 798 deletions(-) delete mode 100644 agents/analyze_agent_v2/skills/architecture_endpoints_skill/SKILL.md delete mode 100644 agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/architecture_and_endpoints.md delete mode 100644 agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/calling_flow.md delete mode 100644 agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/contact_center_flow.md delete mode 100644 agents/analyze_agent_v2/skills/mobius_error_id_skill/SKILL.md delete mode 100644 agents/analyze_agent_v2/skills/mobius_error_id_skill/references/mobius_error_ids.md delete mode 100644 agents/analyze_agent_v2/skills/sip_flow_skill/SKILL.md delete mode 100644 agents/analyze_agent_v2/skills/sip_flow_skill/references/sip_flows.md diff --git a/agents/analyze_agent_v2/agent.py b/agents/analyze_agent_v2/agent.py index 8122859..8b1ceb0 100644 --- a/agents/analyze_agent_v2/agent.py +++ b/agents/analyze_agent_v2/agent.py @@ -221,17 +221,17 @@ def _make_model() -> LiteLlm: # ═══════════════════════════════════════════════════════════════════════════════ mobius_error_skill = load_skill_from_dir( - Path(__file__).parent / "skills" / 
"mobius_error_id_skill" + Path(__file__).parent / "skills" / "mobius-error-id-skill" ) mobius_skill_toolset = skill_toolset.SkillToolset(skills=[mobius_error_skill]) architecture_endpoints_skill = load_skill_from_dir( - Path(__file__).parent / "skills" / "architecture_endpoints_skill" + Path(__file__).parent / "skills" / "architecture-endpoints-skill" ) architecture_skill_toolset = skill_toolset.SkillToolset(skills=[architecture_endpoints_skill]) sip_flow_skill = load_skill_from_dir( - Path(__file__).parent / "skills" / "sip_flow_skill" + Path(__file__).parent / "skills" / "sip-flow-skill" ) sip_flow_skill_toolset = skill_toolset.SkillToolset(skills=[sip_flow_skill]) diff --git a/agents/analyze_agent_v2/skills/architecture_endpoints_skill/SKILL.md b/agents/analyze_agent_v2/skills/architecture_endpoints_skill/SKILL.md deleted file mode 100644 index 231201a..0000000 --- a/agents/analyze_agent_v2/skills/architecture_endpoints_skill/SKILL.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: architecture-endpoints-skill -description: Reference for Webex Calling and Contact Center service architecture, endpoint roles, signaling paths, and media paths. Use when analyzing logs to understand which service does what and how traffic flows. ---- - -# Architecture and Endpoints - -When analyzing Mobius, SSE, MSE, or WxCAS logs, use this skill to look up: - -- **Service roles**: What each endpoint (Mobius, SSE, MSE, WxCAS, CPAPI, CXAPI, U2C, WDM, Mercury, Kamailio, RTMS, RAS) does and how they interact. -- **Signaling and media paths**: End-to-end flow for WebRTC Calling vs. Contact Center (browser → Mobius → SSE → …). -- **Call types and routing**: WebRTC-to-WebRTC, WebRTC-to-PSTN, WebRTC-to-Desk Phone, and Contact Center flows. - -Consult the reference documents: - -- **references/architecture_and_endpoints.md** — service roles and endpoint descriptions. -- **references/calling_flow.md** — WebRTC Calling end-to-end architecture (signaling, media, call types). 
-- **references/contact_center_flow.md** — Contact Center end-to-end architecture (signaling, media, Kamailio/RTMS/RAS, health ping, timers, failover). - -Use them to attribute log lines to the correct service and to explain signaling/media paths in your analysis. diff --git a/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/architecture_and_endpoints.md b/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/architecture_and_endpoints.md deleted file mode 100644 index 9245699..0000000 --- a/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/architecture_and_endpoints.md +++ /dev/null @@ -1,101 +0,0 @@ -# Architecture and Endpoints Reference - -## Endpoints — service roles and descriptions - -- **Webex SDK/Client** (Web or native app making the request): Chrome extension or any third-party web application that consumes the Webex Calling SDK. - -- **Mobius**: Microservice that interworks between WebRTC and SIP to enable Webex Calling users to register and make calls using a web browser. It translates browser-originated signaling (HTTP/WSS) into SIP for backend communication. - - **Mobius Multi-Instance Architecture**: Multiple Mobius servers are deployed across different geographic regions (e.g., US, EU, APAC). When a user initiates a call from their browser, their geolocation (based on IP) is used to route them to the nearest Mobius instance using a GeoDNS or load balancer. - - Mobius talks to the following components: - - **CPAPI** (Cisco Platform API): User entitlement and application metadata. - - **CXAPI** (Webex Calling Call Control API): Stateless micro-service that implements the messaging logic behind the Webex Calling Call Control API. When incoming requests are received, it validates that the customer/user the request is made on behalf of belongs to Webex Calling and has the appropriate scope and roles. 
It then converts the request to the appropriate OCI requests and routes it to the OCIRouter to route to the correct WxC deployment. - -- **U2C (User to Cluster)**: Microservice that helps services find other service instances across multiple Data Centers. It takes a user's email or UUID and optional service name, and returns the catalog containing the service URLs. - -- **WDM (Webex squared Device Manager)**: Microservice responsible for registering a device and proxying feature toggles and settings for the user to bootstrap the Webex clients. - - If WDM shows many 502 responses, possible failing dependencies: Common Identity CI, Mercury API, U2C, Feature Service, Cassandra, Redis. - - If WDM is generating errors, either Locus will produce 502 responses or the clients will show an error. - -- **SSE (Signalling Service Edge)**: Edge component for SIP signalling. It communicates with two endpoints — Mobius and the application server WxCAS. - -- **MSE (Media Service Engine)**: Edge component for media relay that handles RTP for WebRTC clients. - -- **Webex Calling Application Server (WxCAS)**: Core control application in Webex Calling responsible for enabling communication between source SSE and destination SSE. - -- **Mercury**: Webex's real-time messaging and signaling service that establishes WebSocket connections and exchanges information in the form of events. Mobius uses Mercury to send events to the SDK. The SDK establishes a Mercury connection (WebSocket) to receive communication from Mobius. 
- ---- - -## High-Level Component Topology (MSE Option with Mobius in WxC) - -``` - ┌─────────────────────────────────────────────────┐ - │ Webex Cloud │ - │ │ - │ ┌─────────┐ ┌──────────┐ │ - WebSocket │ │ Mercury │ │ Webex CI │ │ -Registered ┌──────────────────────┼──>│ │ │ │ │ -User │ │ └─────────┘ └──────────┘ │ - [Browser │ │ │ - Extension] │ │ Async events ┌────────┐ │ - │ │ REST API (HTTPS) │ │ │ CXAPI │ │ - │ │ │ ▼ │ 3PCC │ │ - │ └──────────────────────┼──> ┌──────────────┐ └───┬────┘ │ - │ │ │ Mobius │ │ Supplementary │ - │ │ │ Micro Service │ │ Services for calls│ - │ Enterprise │ └──────┬───────┘ │ │ - │ Web Server │ │ Provisioning │ │ - │ │ │ data query ┌──┴─────┐ ┌────────┐│ - │ │ └────────────>│ CPAPI │ │ CH UI ││ - │ │ └────────┘ │(Control││ - │ │ │ Hub) ││ - │ └──────────────────────────────────────┴────────┘│ - │ │ - │ Media (DTLS-SRTP) ┌─────────────────────────────────────────────────┐ - │ STUN consent, ICE │ Webex Calling │ - │ │ │ - │ ┌───────────────────┼──> ┌───────┐ mTLS SIP persistent ┌─────────┐ │ - │ │ │ │ MSE │ connection w/ │ SSE │ │ - │ │ │ │ │ webRTC domain in │ │ │ - │ │ │ └───┬───┘ cert SAN └────┬────┘ │ - │ │ │ │ RTP │ │ - │ │ │ ┌───┴───┐ ┌────┴──────┐│ - ▼ ▼ │ │ MSE │ │WxC Call ││ -Peer User [Peer Device] │ │ │◄────── RTP ─────────>│Control ││ - Media (SRTP) │ └───────┘ │(AS/WxCAS)││ - │ └────┬──────┘│ - │ ┌───────┐ ┌────┴──────┐│ - │ │ SSE │ │OCI Router ││ - │ └───────┘ └───────────┘│ - └─────────────────────────────────────────────────┘ - -Legend: - ───────> signaling - ═══════> media - - - - -> media control done by Mobius - [dark] new components - [light] existing components, need change -``` - -### Connection Types Between Components - -| From | To | Protocol | Purpose | -|------|----|----------|---------| -| Browser/Extension | Mercury | WebSocket | Async event delivery (call notifications, status updates) | -| Browser/Extension | Mobius | REST API (HTTPS) | Call control signaling (register, call, hold, transfer) | -| 
Browser/Extension | MSE | DTLS-SRTP | Encrypted media (audio/video), STUN consent, ICE | -| Mobius | Mercury | Internal | Async event push to SDK/Client | -| Mobius | CPAPI | HTTPS | Provisioning data query, user entitlements | -| Mobius | CXAPI (3PCC) | HTTPS | Supplementary call services | -| Mobius | SSE | mTLS SIP | SIP signaling (persistent connection, webRTC domain in cert SAN) | -| MSE | MSE | RTP | Media relay between caller and callee media engines | -| SSE | WxC Call Control (WxCAS) | SIP | Call routing, destination resolution | -| WxCAS | OCI Router | Internal | Route OCI requests to correct WxC deployment | -| Admin | Control Hub (CH UI) | HTTPS | Administration and configuration | -| Control Hub | CPAPI | HTTPS | Provisioning and configuration | - ---- - -For **WebRTC Calling** flow details (signaling path, media path, call types & routing), see **references/calling_flow.md**. - -For **Contact Center** flow details (signaling path, media path, Kamailio/RTMS/RAS, health ping, timers, Kafka failover, inter-regional failover), see **references/contact_center_flow.md**. diff --git a/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/calling_flow.md b/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/calling_flow.md deleted file mode 100644 index ddb731e..0000000 --- a/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/calling_flow.md +++ /dev/null @@ -1,11 +0,0 @@ -# WebRTC Calling — End-to-End Architecture - -**Signaling Path**: Browser → Mobius (HTTP/WSS→SIP translation) → SSE (SIP edge) → WxCAS (Application Server) → Destination - -**Media Path**: Browser ↔ MSE (DTLS-SRTP) ↔ Destination - -**Call Types & Routing:** - -- **WebRTC to WebRTC**: WxCAS resolves destination browser → Mobius notifies Browser 2 → Both browsers establish DTLS-SRTP with their local MSE. 
-- **WebRTC to PSTN**: WxCAS resolves PSTN destination → SSE signals toward Local Gateway → Browser↔MSE1 (DTLS-SRTP), MSE1↔MSE2 (RTP), MSE2→LGW (RTP→PSTN). -- **WebRTC to Desk Phone**: WxCAS resolves desk phone → SSE coordinates with MSE → Browser↔MSE1 (DTLS-SRTP), MSE1↔MSE2 (RTP), MSE2→Desk Phone. diff --git a/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/contact_center_flow.md b/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/contact_center_flow.md deleted file mode 100644 index 9f5f82c..0000000 --- a/agents/analyze_agent_v2/skills/architecture_endpoints_skill/references/contact_center_flow.md +++ /dev/null @@ -1,24 +0,0 @@ -# Contact Center — End-to-End Architecture - -**Signaling Path**: Browser → Mobius → SSE → Kamailio (SIP proxy) → Destination - -**Media Path**: Browser ↔ MSE ↔ Destination - -**Additional Contact Center Components:** - -- **Kamailio**: SIP proxy for Contact Center — handles SIP REGISTER, stores registration details on RTMS Application Server, routes calls to the appropriate destination. -- **RTMS**: Real-time microservice enabling persistent WebSocket connections between clients and backend services. -- **RAS** (Registration, Activation, and provisioning Service): Stores SIP REGISTER Contact and Path headers with expiry, maintains metrics for WebRTC active sessions and calls to WebRTC phones. - -**Health ping:** Mobius exposes `/api/v1/ping` (and internal variants). Response should be 200 OK with body `{"message":"Healthy"}`. Per-region endpoints exist for INT and PROD (e.g. US, CA, EU, UAE, UK, AU Sydney/Melbourne, South Africa, Saudi Arabia, Singapore). Backup clusters exist in some regions (e.g. UK, AU, South Africa, Saudi). Use the Mobius Runbook for the full health-check endpoint table and cluster list. - -**Timers in Mobius:** - -- **Registration keepalive (browser):** Browser sends keepalive periodically to keep registration active. 
Every **30 seconds**; after **5 missed** keepalives, Mobius triggers unregistration. -- **Call keepalive:** Browser sends keepalive during a call to keep it active; valid **within 15 minutes**. -- **SIP APP – Registration refresh:** Depends on Expires header from REGISTER. Mobius refreshes registration with SSE at **3/4 of the Expires value** before expiry. -- **SIP APP – Options scheduler:** Mobius sends OPTIONS ping to SSE to check connectivity; interval **35 seconds**. - -**Kafka (call failover):** When Mobius cannot find registration in local cache but finds it in global DB (e.g. after pod/region failover), the new pod communicates with the old pod via Kafka for owner change. Topic: `MOBIUS_REG_HANDOVER_TOPIC = "RegistrationHandover"`, group: `KAFKA_GROUP_ID = "registration_handover"`. - -**Inter-regional failover (updated):** In regions with a local backup (e.g. AU: SYD + MEL), if both primary and backup Mobius clusters are down (e.g. both SYD and MEL cannot reach SSE), Client/SDK will **not** failover to US. All WebRTC calls (WxC, WxCC, Guest Calling) in that region can be non-operational. If only the primary cluster is down (SSEs up), Client/SDK fails over to the backup in the same region (e.g. SYD → MEL). Singapore (SIN) and Canada (CA) have a single Mobius cluster each; they continue to use backup regions (e.g. SYD, US-East) when needed. diff --git a/agents/analyze_agent_v2/skills/mobius_error_id_skill/SKILL.md b/agents/analyze_agent_v2/skills/mobius_error_id_skill/SKILL.md deleted file mode 100644 index 8a5d048..0000000 --- a/agents/analyze_agent_v2/skills/mobius_error_id_skill/SKILL.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: mobius-error-id-skill -description: Interpret Mobius error and call IDs from logs using the reference documentation. ---- - -# Mobius Error ID Lookup - -When analyzing Mobius logs, use this skill to interpret error IDs and call-related identifiers. - -## Instructions - -1. 
**Identify Mobius error IDs and call IDs** in the logs you are analyzing (e.g. from `mobius_logs`, or any field containing Mobius error codes, call IDs, or tracking identifiers that map to known error semantics). - -2. **Consult the reference document** `references/mobius_error_ids.md` to look up each such ID. That document contains the authoritative mapping of Mobius error/call IDs to their meanings, causes, and remediation notes. - -3. **Include the interpretation in your analysis:** - - In **Error Detection and Root Cause Analysis**: For each Mobius error ID found, state what the ID means (from the reference), likely cause, and suggested fix. - - In **Root Cause Analysis** (if applicable): Reference the documentation so your explanation is consistent with the defined semantics of each ID. - -4. If an ID is not present in the reference document, say so and describe the ID and context so a human can triage or update the documentation. diff --git a/agents/analyze_agent_v2/skills/mobius_error_id_skill/references/mobius_error_ids.md b/agents/analyze_agent_v2/skills/mobius_error_id_skill/references/mobius_error_ids.md deleted file mode 100644 index 50e8dd8..0000000 --- a/agents/analyze_agent_v2/skills/mobius_error_id_skill/references/mobius_error_ids.md +++ /dev/null @@ -1,325 +0,0 @@ -# Mobius Error IDs and Call ID Reference - -Reference for Mobius HTTP response codes and `mobius-error` codes. Use this to interpret errors in Mobius logs (e.g. `fields.response_status`, mobius-error in response body). 
**For each error, use the detailed sections below to explain what the error means, user/call impact, likely root cause, and what to check next in your analysis.** - ---- - -## Registration errors (detailed) - -### 403 FORBIDDEN — mobius-error 101: Per-user device limit exceeded - -**What it means:** The client tried to register a device (browser/WebRTC client) with Mobius, but the system rejected it because the user has hit the maximum number of allowed device registrations, or this specific device is already considered registered, or a registration for this device is already in progress. - -**User/call impact:** The user cannot register this device for WebRTC calling. They may see a registration failure in the client; existing calls on other devices may work, but this device will not be able to place or receive calls until registration succeeds. - -**Root cause direction:** (1) User has genuinely reached the global per-user device limit — check entitlement and how many devices are registered. (2) Stale or duplicate registration: the device is already in Mobius/Redis cache, or a previous registration never completed and is still in progress. (3) Race: two registration requests for the same device; one may succeed and the other gets 101. - -**What to check in logs:** Look for multiple REGISTER attempts for the same `DEVICE_ID` or `USER_ID`; check Redis/local cache state if available. Correlate with unregistration and keepalive logs to see if an old registration was not cleaned up. In your analysis, state whether this looks like a limit issue vs. duplicate/stale registration. - -**Log pattern:** Log sample available for error code 101. - ---- - -### 403 FORBIDDEN — mobius-error 102: Device creation not enabled for user - -**What it means:** Mobius attempted to create or enable a device for this user by talking to CPAPI (Cisco Platform API). 
CPAPI either did not return a valid browser client ID or SIP address, or returned a 4xx response, so Mobius will not create the device. - -**User/call impact:** Registration fails. The user cannot use this device for WebRTC calling. This is typically an entitlement or provisioning issue, not a transient network failure. - -**Root cause direction:** (1) User not entitled for WebRTC/browser calling in CPAPI. (2) CPAPI returned invalid or missing browserClientId/sipAddress. (3) CPAPI returned 4xx (e.g. forbidden, not found) — the exact CPAPI response in upstream logs is key. - -**What to check in logs:** Find the CPAPI request/response preceding this error (same time window, same user/device). Check for CPAPI 4xx, timeouts, or empty/invalid browserClientId or sipAddress. In analysis, state whether the failure is at CPAPI (entitlement/provisioning) vs. Mobius logic. - -**Log pattern:** Log sample available for error code 102. - ---- - -### 403 FORBIDDEN — mobius-error 103: Device creation failed - -**What it means:** Mobius tried to complete device creation by sending a SIP REGISTER to the SSE (Signaling Service Edge). The SSE responded with a final error (e.g. 403), so device creation failed. - -**User/call impact:** Registration fails. The browser/client cannot be used for calling. The failure is downstream of Mobius (at the SIP/SSE layer). - -**Root cause direction:** SSE rejected the REGISTER — reasons can include policy, capacity, or invalid/duplicate registration data. The SSE 403 (or other final error) is the direct cause; Mobius is correctly surfacing it. - -**What to check in logs:** Correlate with SSE logs for the same time and REGISTER transaction (e.g. same sipCallId or correlation IDs). Identify the exact SIP response code and reason from SSE. In analysis, attribute the root cause to SSE and note the SIP response code and any reason phrase. - -**Log pattern:** None specifically; look for REGISTER and final response in SIP/SSE logs. 
- ---- - -### 503 SERVICE UNAVAILABLE (registration) - -**What it means:** Registration could not be completed because of a transient or backend failure: timeout to SSE or CPAPI, failure to store state in Redis or local map, outbound proxy (OBP) resolution failure, or an unhandled exception in the registration handler. - -**User/call impact:** Registration fails. The user may retry; some causes are transient (timeouts, temporary SSE/CPAPI unavailability), others indicate a misconfiguration or backend issue (Redis, OBP). - -**Root cause direction:** Use the log message to narrow down: - -- **"Registration failed due to connection timeout for device: {}"** — Mobius could not establish or maintain connection to SSE within the timeout. Check SSE health, network path, and load. -- **"No more retries left, failed to setup connection with all remote hosts"** — Mobius tried all known SSE nodes and could not connect. Points to SSE cluster or network issue. -- **"Failed to add device: {} in sorted set"** — Redis write failed. Check Redis connectivity and capacity. -- **"Device entry insertion failed for deviceId {} sipAoR {} outboundProxy {} userUUID {}"** — Local in-memory map insertion failed (e.g. duplicate key or resource). -- **"BaseHandler Exception: {}"** — Unhandled exception in registration handler; inspect stack trace for the actual failure. -- **"OBP resolution failed"** — Outbound proxy (SSE address) resolution failed (DNS or config). -- **"Unhandled SIP response code received"** — SSE sent a SIP response that Mobius does not handle explicitly; note the code in analysis. -- **"No more retries left, registration failed for device: {} with status: {} {}"** — SSE returned an error after retries; the status and message are the direct cause. -- **"Received Client Exception from Provisioning Client while querying Browser Client Id."** — CPAPI call (for browser client id) timed out or threw. Check CPAPI availability and latency. 
- -**What to check in logs:** Match the exact log message above to the failure; then correlate with SSE, CPAPI, or Redis logs in the same time window. In analysis, name the failing dependency (SSE, CPAPI, Redis, OBP) and the concrete log message. - -**Log samples:** "Registration: Error code: 503 SERVICE UNAVAILABLE due to error response from SSE" or "due to CPAPI timeout". - ---- - -### 401 UNAUTHORIZED (registration) - -**What it means:** The user’s token (used for the registration request) was rejected. Common Identity (CI) returned 401, so Mobius does not consider the user authenticated. - -**User/call impact:** Registration fails. The client must present a valid token (re-auth, token refresh, or re-login). - -**Root cause direction:** Expired or invalid token; CI outage or misconfiguration; wrong token or audience. Not usually a Mobius bug. - -**What to check in logs:** Confirm CI 401 in upstream calls; check token expiry and issuer/audience if available. In analysis, state that authentication failed and whether it is token lifecycle vs. CI/service issue. - ---- - -### 501 NOT IMPLEMENTED (registration) - -**What it means:** An uncaught or unhandled exception occurred during registration. The "501 NOT IMPLEMENTED" is often a generic surface for unexpected code paths (e.g. "Incorrect User Data"). - -**User/call impact:** Registration fails. May be a bug or unexpected input. - -**Root cause direction:** Look for "Incorrect User Data" or similar in logs; indicates bad or unexpected user/device data. Could be client bug, schema change, or missing validation. - -**What to check in logs:** Search for "Incorrect User Data" and any stack trace in the same request. In analysis, note whether this looks like bad input vs. server-side bug. - ---- - -## Unregistration errors (detailed) - -### 404 NOT FOUND (unregistration) - -**What it means:** The client asked to unregister a device, but Mobius has no registration record for that identifier (e.g. 
device already unregistered, or wrong ID). - -**User/call impact:** Unregistration request fails. From a user perspective this is often harmless (device was already unregistered); sometimes it indicates a client sending unregister for a non-existent registration. - -**Root cause direction:** (1) Idempotent: device was already unregistered. (2) Client sent wrong device/session ID. (3) Registration expired or was cleaned up elsewhere before unregister arrived. - -**What to check in logs:** "Registration not found for {}" — confirm the identifier in the message. In analysis, state whether this is expected (idempotent) or points to client/identifier mix-up. - ---- - -### 503 SERVICE UNAVAILABLE (unregistration) - -**What it means:** Unregistration could not be completed. Two main cases: (1) Mobius intentionally blocks unregister because the device still has active call(s) — unregister would be unsafe. (2) An exception occurred while handling the unregistration request. - -**User/call impact:** Unregistration fails. In case (1), this is correct behavior (protecting active calls). In case (2), the device may remain in a registered state until retry or cleanup. - -**Root cause direction:** (1) "Call/s exist for this device, can not continue with unregister request." — By design; do not treat as a bug. (2) "Caught exception {} while handling unregistration" — Inspect the exception; could be backend (e.g. Redis) or logic bug. - -**What to check in logs:** Match the message above; if calls exist, correlate with call logs for that device. In analysis, distinguish "blocked by design" vs. "exception during unregister". - ---- - -### 501 NOT IMPLEMENTED (unregistration) - -**What it means:** Unhandled exception during unregistration (e.g. "Incorrect User Data"). Same interpretation as 501 for registration. - -**User/call impact:** Unregistration may not complete; device state may be inconsistent. - -**What to check in logs:** Look for "Incorrect User Data" or exception details. 
In analysis, note bad input or server-side handling gap. - ---- - -## Call errors (detailed) - -### 403 FORBIDDEN — mobius-error 112: Device is not registered - -**What it means:** The client sent a call-related request (e.g. make call, answer, disconnect) using a device-id that Mobius does not have in its registration state. The device either never registered successfully or already unregistered. - -**User/call impact:** The call action fails. User may see "device not registered" or similar. They may need to re-register (refresh, reload, or re-login) before placing or answering calls. - -**Root cause direction:** (1) Registration failed earlier (check for 101/102/103/503 in registration flow). (2) Registration expired or was removed (keepalive miss, unregister). (3) Client using wrong device-id or session. (4) Race: unregister completed before call request was processed. - -**What to check in logs:** Confirm there is no successful registration for this device-id in the same time window; look for prior registration failures or unregister. "Call: Error code: 403 : FORBIDDEN, mobius-error: 112, due to Device not registered". In analysis, tie 112 to the registration state and any prior registration/unregistration events. - ---- - -### 403 FORBIDDEN — mobius-error 115: User Busy - -**What it means:** The device cannot accept this call (or call action) because it is already busy: either a call is in the process of being set up ("allocating" state) or the SSE returned SIP 486 Busy Here. - -**User/call impact:** Incoming call may not be presented, or a new outbound call may fail. The user or the other party may see "busy" or "user busy". - -**Root cause direction:** (1) Legitimate: user/device is on another call or call is being set up. (2) Stuck state: a previous call left the device in "allocating" and never cleared — look for incomplete call teardown. (3) SSE sent 486 (callee busy) — downstream routing or endpoint state. 
- -**What to check in logs:** Look for another call or "allocating" state for the same device; look for 486 in SIP logs. "Call: Error code: 403: FORBIDDEN, mobius-error: 115, due to User busy". In analysis, state whether this is expected (user busy) vs. possible stuck state. - ---- - -### 403 FORBIDDEN — mobius-error 118: Not Acceptable - -**What it means:** The call or session setup was rejected by the SSE with SIP 488 Not Acceptable. Typically relates to SDP/negotiation: codecs, media, or session parameters were not acceptable to the far end or the network. - -**User/call impact:** Call setup fails. User may see a generic call failure or "not acceptable" type message. - -**Root cause direction:** SDP/codec mismatch, unsupported media type, or policy rejection. Check SDP in SIP messages (offer/answer) and any codec or media restrictions in SSE/downstream. - -**What to check in logs:** Find the 488 response from SSE and the associated INVITE/offer. Compare SDP (m= lines, codecs) with what the client sent. In analysis, describe the negotiation failure (e.g. no common codec, or rejected media). - ---- - -### 403 FORBIDDEN — mobius-error 119: Call Rejected - -**What it means:** The call was explicitly rejected by the SSE with SIP 403 or 603 (or equivalent). The network or destination rejected the call, not Mobius. - -**User/call impact:** Call fails; user may see "call rejected" or "declined". - -**Root cause direction:** Policy (e.g. blocking), destination rejected (603 Decline), or SSE/backend returned 403/603. The root cause is downstream; Mobius is forwarding the rejection. - -**What to check in logs:** Find the 403/603 from SSE and the reason phrase. Correlate with WxCAS/SSE logs for routing and rejection reason. In analysis, state who rejected (SSE, destination, policy) and the SIP code and reason. - ---- - -### 403 FORBIDDEN — mobius-error 121: Mid Call Request Rejected - -**What it means:** A mid-call request (e.g. 
hold, resume, transfer, add media) was sent to CXAPI (Call Control API), and CXAPI returned a 4xx response. Mobius surfaces this as 121. - -**User/call impact:** The mid-call action (hold, transfer, etc.) fails. The call may remain in its previous state; user may see an error for that action. - -**Root cause direction:** CXAPI rejected the request — invalid state, invalid parameters, or policy. Check CXAPI logs and the specific 4xx code and body. - -**What to check in logs:** Correlate with CXAPI request/response for the same call and timestamp. Note the 4xx code and any error message. In analysis, attribute the failure to CXAPI and the reason (state, params, or policy). - ---- - -### 503 SERVICE UNAVAILABLE — mobius-error 117: Timeout error - -**What it means:** Offer-answer (SDP) negotiation did not complete within the expected time (ROAP_TIMEOUT). The client or the network did not complete the exchange in time. - -**User/call impact:** Call setup fails with a timeout. User may see "call failed" or "timeout". - -**Root cause direction:** (1) Network or client latency; (2) Client or far end not responding to offer/answer in time; (3) ROAP timeout value too short for the path. Check timing between offer and answer in logs. - -**What to check in logs:** Measure time between sending offer and receiving answer (or timeout). Look for delayed or lost SIP/HTTP messages. In analysis, state whether the timeout is client-side, network, or backend delay. - ---- - -### 503 SERVICE UNAVAILABLE — mobius-error 120: Not Available - -**What it means:** The call or request could not be fulfilled because the SSE rejected it, CXAPI returned 5xx/6xx or threw, or an unknown exception occurred in the call handler (e.g. null pointer). This is a catch-all for "service or logic unavailable". - -**User/call impact:** Call or call action fails. User may see a generic error or "not available". - -**Root cause direction:** (1) SSE rejection — check SSE logs for the same transaction. 
(2) CXAPI 5xx/6xx or exception — check CXAPI logs. (3) Null pointer or exception in Mobius call handler — look for stack trace; may be a bug or unexpected state. Example: "Call: Error code: 503 :SERVICE UNAVAILABLE, mobius-error: 120, due to null pointer exception while processing Connect request from client". - -**What to check in logs:** Match the exact "due to" message; correlate with SSE and CXAPI. In analysis, name the failing component and whether it looks like backend failure vs. Mobius bug (e.g. NPE). - ---- - -### 404 NOT FOUND — mobius-error 113: Call not found - -**What it means:** The client sent a request for an existing call (e.g. answer, disconnect, update) using a call-id that Mobius does not have in its call state. The call may have already ended, or the call-id is wrong or from another instance. - -**User/call impact:** The call action fails. User may see "call not found" or similar. The call may have been torn down already, or there is a client/state sync issue. - -**Root cause direction:** (1) Call already ended (BYE, timeout, or cleanup). (2) Client using stale or wrong call-id. (3) Request routed to a Mobius instance that does not have this call (instances may not share call state). (4) Race: teardown completed before the request was processed. - -**What to check in logs:** "Call: Error code: 404 NOT FOUND, mobius-error: 113, Call not found". Look for BYE or call teardown for this call-id before the 404; check if multiple Mobius instances are involved. In analysis, state whether the call was already gone vs. wrong ID vs. instance mismatch. - ---- - -### 500 INTERNAL SERVER ERROR — mobius-error 114: Error in processing call - -**What it means:** Something went wrong while Mobius was processing the call: SSE rejected a request, an exception occurred while processing the SSE response, or the client sent an event that is not valid in the current call state (e.g. answer when not ringing). 
- -**User/call impact:** Call or call action fails; user may see a generic server error. - -**Root cause direction:** (1) SSE rejection — see SSE logs. (2) Unknown exception processing SSE response — look for stack trace. (3) "Client event isn't supported in current call state" — client sent an out-of-order or invalid event (e.g. answer before 180/183, or disconnect in wrong state). Example: "Call: Error code: 500 SERVER ERROR, mobius-error: 114, due to client event isn't supported in current call state." - -**What to check in logs:** Match the "due to" message; correlate call state (ringing, connected, etc.) with the client event. In analysis, state whether the failure is client protocol/state machine vs. SSE/Mobius backend. - ---- - -### 400 BAD REQUEST (calls) - -**What it means:** The request body or parameters were invalid: parse error, schema mismatch, or missing required field. Mobius could not interpret or validate the request. - -**User/call impact:** Call or call action fails with a bad request. Often a client bug or version mismatch. - -**Root cause direction:** Malformed JSON, wrong schema, or client sending unexpected/old format. Check the request payload in logs. In analysis, note the invalid field or parse error if present. - ---- - -### 501 NOT IMPLEMENTED (calls) - -**What it means:** The endpoint or call flow is not implemented in Mobius. The client may be using a newer API or a flow that this version of Mobius does not support. - -**User/call impact:** Request fails with "not implemented". - -**Root cause direction:** API/version mismatch or feature not yet implemented. In analysis, note the endpoint or flow and suggest checking client and Mobius versions. - ---- - -## Other / Ingress and platform errors (detailed) - -### 429 TOO MANY REQUESTS - -**What it means:** Nginx (or the ingress layer) is rate-limiting because the number of requests exceeded the configured threshold. This is a DoS/abuse protection. 
- -**User/call impact:** Requests are rejected with 429; users may see errors or throttling. Can affect many users if a single client or script is noisy. - -**Root cause direction:** (1) Noisy client or script (e.g. retries, polling). (2) Traffic spike. (3) Misconfigured threshold. Check request rate per client/IP in logs. In analysis, state whether this is expected rate limiting and which client or IP is driving the load. - ---- - -### 503 SERVICE UNAVAILABLE (service not ready) - -**What it means:** The Mobius instance is not considered "ONLINE" — e.g. health check (ping) failed or the service has not finished starting. Load balancer or orchestrator may stop sending traffic to this instance. - -**User/call impact:** Requests to this instance fail with 503. Users may be routed to other instances; if all are down, calling fails. - -**Root cause direction:** Instance startup, dependency (e.g. Redis, SSE) failing health check, or overload. Check Mobius ping/health and dependency health. In analysis, state whether this is single-instance vs. broader outage. - ---- - -### 499 CLIENT CLOSED REQUEST - -**What it means:** The client closed the TCP/HTTP connection before the server sent the response. Nginx records this as 499. - -**User/call impact:** The request did not complete; the client may have navigated away, refreshed, or timed out on its side. - -**Root cause direction:** User action (close tab, navigate away), client timeout, or network drop. Usually not a server bug. In analysis, note that the client closed the connection and whether it correlates with timeouts or user actions. 
- ---- - -## Quick lookup: mobius-error code → meaning - -| mobius-error | Category | Short meaning | -|--------------|------------|------------------------------------------------------------| -| 101 | Registration | Per-user device limit exceeded | -| 102 | Registration | Device creation not enabled for user (CPAPI) | -| 103 | Registration | Device creation failed (SSE rejected REGISTER) | -| 112 | Calls | Device is not registered | -| 113 | Calls | Call not found | -| 114 | Calls | Error in processing call (SSE/call state/event) | -| 115 | Calls | User busy | -| 117 | Calls | Timeout (e.g. ROAP offer-answer) | -| 118 | Calls | Not acceptable (e.g. 488 from SSE, SDP/codec) | -| 119 | Calls | Call rejected (403/603 from SSE) | -| 120 | Calls | Not available (SSE/CXAPI rejection or exception) | -| 121 | Calls | Mid-call request rejected (CXAPI 4xx) | - ---- - -## Timers (keepalive and unregistration) - -When correlating registration/call failures or unexpected unregisters, use these values (Mobius Basic Training and 2 AM Guide): - -- **Registration keepalive:** Browser sends keepalive every **30 seconds**. After **5 missed** keepalives, Mobius triggers **unregistration**. So ~150 seconds of no keepalive → unregister. -- **Call keepalive:** Browser sends keepalive during a call; valid **within 15 minutes**. -- **SIP APP – Registration refresh:** Refreshes with SSE at **3/4 of the Expires** header from REGISTER. -- **SIP APP – Options:** OPTIONS ping to SSE every **35 seconds** to check connectivity. - -If you see unregistration (e.g. 404 or 503 with "Call/s exist for this device") shortly after gaps in logs, consider keepalive miss (network, client suspend, or load) as a cause. 
diff --git a/agents/analyze_agent_v2/skills/sip_flow_skill/SKILL.md b/agents/analyze_agent_v2/skills/sip_flow_skill/SKILL.md deleted file mode 100644 index 1cb8994..0000000 --- a/agents/analyze_agent_v2/skills/sip_flow_skill/SKILL.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -name: sip-flow-skill -description: Reference for SIP protocol flows, message sequences, error codes, and debugging patterns. Use when analyzing SIP signaling in SSE, MSE, WxCAS, or Mobius logs to understand expected message flows and diagnose failures. ---- - -# SIP Flow Reference - -When analyzing logs that contain SIP signaling (SSE/MSE logs from logstash-wxcalling, Mobius logs from logstash-wxm-app, or WxCAS logs), use this skill to: - -- **Understand expected SIP message sequences** for different call scenarios (basic call, call hold, transfer, forwarding, conference). -- **Interpret SIP response codes** and map them to root causes. -- **Identify SIP flow anomalies** such as missing ACKs, retransmissions, unexpected BYEs, or timeout-triggered responses. -- **Correlate SIP dialogs** across services using Call-ID, From/To tags, CSeq, and branch parameters. - -Consult the reference document: - -- **references/sip_flows.md** — SIP message sequences, response code reference, SDP negotiation, timers, and common failure patterns. - -Use this skill alongside the architecture-endpoints-skill to attribute SIP messages to the correct service and understand the signaling path. diff --git a/agents/analyze_agent_v2/skills/sip_flow_skill/references/sip_flows.md b/agents/analyze_agent_v2/skills/sip_flow_skill/references/sip_flows.md deleted file mode 100644 index 4c49647..0000000 --- a/agents/analyze_agent_v2/skills/sip_flow_skill/references/sip_flows.md +++ /dev/null @@ -1,250 +0,0 @@ -# SIP Flow Reference - -## 1. 
Basic Call Setup (INVITE Transaction) - -### Successful Call -``` -Caller (Mobius) SSE WxCAS Callee (Mobius) - |--- INVITE -------->| | | - |<-- 100 Trying -----| | | - | |--- INVITE --->| | - | |<-- 100 Trying-| | - | | |--- INVITE -------->| - | | |<-- 100 Trying -----| - | | |<-- 180 Ringing ----| - | |<-- 180 Ring---| | - |<-- 180 Ringing -----| | | - | | |<-- 200 OK ---------| - | |<-- 200 OK ----| | - |<-- 200 OK ----------| | | - |--- ACK ------------>| | | - | |--- ACK ------>| | - | | |--- ACK ----------->| - | | | | - |==================== RTP Media (via MSE) =================| - | | | | - |--- BYE ------------>| | | - | |--- BYE ------>| | - | | |--- BYE ----------->| - | | |<-- 200 OK ---------| - | |<-- 200 OK ----| | - |<-- 200 OK ----------| | | -``` - -### Early Media (183 Session Progress) -When the callee provides early media (ringback tone, IVR prompts): -``` -Caller SSE WxCAS Callee - |--- INVITE ---->| | | - |<-- 100 Trying -| | | - | |--- INVITE --->| | - | | |--- INVITE ---->| - | | |<-- 183 + SDP --| - | |<-- 183 + SDP -| | - |<-- 183 + SDP --| | | - |--- PRACK ----->| | | - | |--- PRACK ---->| | - | | |--- PRACK ----->| - | | |<-- 200 (PRACK)-| - |============ Early Media (RTP via MSE) ==========| - | | |<-- 200 OK -----| - |<-- 200 OK -----| | | - |--- ACK ------->| | | -``` - -## 2. Call Hold / Resume - -### Hold (re-INVITE with sendonly) -``` -Holder SSE WxCAS Held Party - |--- re-INVITE -->| | | - | (a=sendonly) |--- re-INVITE->| | - | | |--- re-INVITE ->| - | | |<-- 200 OK -----| - | | | (a=recvonly) | - | |<-- 200 OK ----| | - |<-- 200 OK ------| | | - |--- ACK -------->| | | -``` - -### Resume (re-INVITE with sendrecv) -Same flow as hold but with `a=sendrecv` in SDP. - -## 3. 
Call Transfer (REFER) - -### Blind Transfer -``` -A (Transferor) SSE WxCAS B (Transferee) C (Target) - |--- REFER ------>| | | | - | Refer-To: C | | | | - |<-- 202 Accepted-| | | | - | |--- INVITE (B to C) ----->| | - | | | |--- INVITE --->| - |<-- NOTIFY ------| | | | - | (100 Trying) | | |<-- 200 OK ---| - |<-- NOTIFY ------| | | | - | (200 OK) | | | | - |--- BYE -------->| (A hangs up) | | - |<-- 200 OK ------| | | | -``` - -### Attended Transfer -Same as blind but the transferor first establishes a call with the target (consultation call), then sends REFER with `Replaces` header. - -## 4. Call Forwarding - -### Forwarding (302 Moved Temporarily) -``` -Caller SSE WxCAS Forwarder Target - |--- INVITE ---->| | | | - | |--- INVITE --->| | | - | | |--- INVITE ---->| | - | | |<-- 302 --------| | - | | | Contact: target | - | | |--- ACK ------->| | - | | |--- INVITE ---->|-------------->| - | | |<-- 200 OK -----|---------------| - | |<-- 200 OK ----| | | - |<-- 200 OK -----| | | | -``` - -## 5. SIP Response Code Reference - -### 1xx Provisional -| Code | Meaning | Notes | -|------|---------|-------| -| 100 | Trying | Hop-by-hop. Suppresses INVITE retransmissions. | -| 180 | Ringing | End-to-end. Callee alerting. | -| 181 | Call Is Being Forwarded | Optional. Indicates forwarding. | -| 183 | Session Progress | Early media. Contains SDP for early RTP. Requires PRACK if 100rel. | - -### 2xx Success -| Code | Meaning | Notes | -|------|---------|-------| -| 200 | OK | Final success. For INVITE: must be ACKed. | -| 202 | Accepted | Used for REFER. Subscription implicitly created. | - -### 3xx Redirection -| Code | Meaning | Notes | -|------|---------|-------| -| 301 | Moved Permanently | Target has moved. Update address. | -| 302 | Moved Temporarily | Used for call forwarding. Contains new Contact. 
| - -### 4xx Client Errors -| Code | Meaning | Common Causes in Webex Calling | -|------|---------|-------------------------------| -| 400 | Bad Request | Malformed SIP message, invalid SDP. | -| 401 | Unauthorized | Authentication required. Check credentials. | -| 403 | Forbidden | User not authorized. Check entitlements in CPAPI. | -| 404 | Not Found | Destination not registered or unknown. Check WxCAS routing. | -| 407 | Proxy Auth Required | Proxy authentication needed. | -| 408 | Request Timeout | No response from destination within timer B (32s). Destination unreachable. | -| 415 | Unsupported Media | SDP codec mismatch. Check offered vs supported codecs. | -| 480 | Temporarily Unavailable | Destination offline or not reachable. Check registration status. | -| 481 | Call/Transaction Does Not Exist | BYE/ACK for unknown dialog. Possible state mismatch or race condition. | -| 486 | Busy Here | Callee busy. | -| 487 | Request Terminated | INVITE cancelled by caller (CANCEL sent before final response). | -| 488 | Not Acceptable Here | SDP negotiation failure. Offered codecs/media not acceptable. | -| 491 | Request Pending | Glare: simultaneous re-INVITEs. Retry after random delay. | - -### 5xx Server Errors -| Code | Meaning | Common Causes in Webex Calling | -|------|---------|-------------------------------| -| 500 | Server Internal Error | Service crash or unhandled exception. Check SSE/WxCAS logs. | -| 502 | Bad Gateway | Upstream service failure. SSE couldn't reach WxCAS or destination. | -| 503 | Service Unavailable | Overloaded or in maintenance. Check health endpoints. | -| 504 | Server Timeout | Upstream response timeout. Check inter-service connectivity. | - -### 6xx Global Errors -| Code | Meaning | Notes | -|------|---------|-------| -| 600 | Busy Everywhere | Callee busy on all devices. | -| 603 | Decline | Callee explicitly rejected the call. | -| 604 | Does Not Exist Anywhere | Number/URI not found globally. | - -## 6. 
SDP Negotiation (Offer/Answer) - -### Key SDP Fields to Check -- `v=` — Protocol version (always 0) -- `o=` — Session originator (username, session-id, version, address) -- `c=` — Connection info (IP address for media) -- `m=` — Media line: `m=audio RTP/SAVPF ` or `m=video ...` -- `a=rtpmap:` — Codec mapping (e.g., `a=rtpmap:111 opus/48000/2`) -- `a=sendrecv` / `a=sendonly` / `a=recvonly` / `a=inactive` — Media direction -- `a=ice-ufrag` / `a=ice-pwd` — ICE credentials -- `a=candidate:` — ICE candidates (host, srflx, relay) -- `a=fingerprint:` — DTLS fingerprint for SRTP key exchange -- `a=setup:` — DTLS role (`actpass`, `active`, `passive`) - -### Common SDP Issues -- **Codec mismatch**: Offer contains codecs not in answer → 488 or no media -- **Missing ICE candidates**: No relay candidates → can fail behind strict NAT/firewall -- **Port 0 in answer**: Media stream rejected by answerer -- **IP mismatch**: SDP `c=` address unreachable → one-way or no audio -- **Direction conflict**: Both sides `sendonly` → no bidirectional media - -## 7. SIP Timers (RFC 3261) - -| Timer | Default | Purpose | -|-------|---------|---------| -| T1 | 500ms | RTT estimate. Base for retransmission intervals. | -| T2 | 4s | Maximum retransmission interval for non-INVITE requests. | -| T4 | 5s | Maximum time a message remains in the network. | -| Timer A | Initially T1 | INVITE retransmission interval (doubles each retransmit). | -| Timer B | 64*T1 (32s) | INVITE transaction timeout. No response → 408. | -| Timer C | >3min | Proxy INVITE transaction timeout. | -| Timer D | >32s (UDP) | Wait time after INVITE client receives non-2xx. | -| Timer F | 64*T1 (32s) | Non-INVITE transaction timeout. | -| Timer H | 64*T1 (32s) | Wait time for ACK after sending non-2xx to INVITE. | - -### Debugging with Timers -- **No 100 Trying within T1**: Possible network issue or destination down. -- **INVITE retransmissions (Timer A doubling)**: 500ms, 1s, 2s, 4s... indicates no response from next hop. 
-- **Timer B expiry (32s)**: No final response to INVITE. Results in 408 Request Timeout. -- **Missing ACK after 200 OK (Timer H)**: Dialog state leak. Possible NAT/firewall blocking ACK. - -## 8. Common Failure Patterns - -### One-Way Audio -- **Symptoms**: One party can hear, other cannot. -- **Check**: SDP `c=` addresses, ICE connectivity, NAT traversal, `a=sendrecv` direction, firewall rules on RTP ports. -- **In logs**: Look for ICE failure events, OODLE/media quality alerts, OOOOOOO (no media flowing). - -### Call Drops After ~32 Seconds -- **Cause**: Timer B expiry — INVITE not answered. -- **Check**: Destination registration, SSE→WxCAS connectivity, WxCAS→destination routing. - -### ooooo (No Audio) / oOOOOOo (Intermittent) -- **Check**: MSE logs for RTP packet counters, ICE state, DTLS handshake completion. - -### Registration Failures -- **401/407 loops**: Authentication issues. Check credentials and nonce handling. -- **Keepalive failures**: 5 missed keepalives (30s interval) → unregistration. Check network stability. - -### Call Setup Failures -- **Location Service Error 404**: User not registered on WxCAS. Check REGISTER flow. -- **488 Not Acceptable Here**: SDP mismatch. Compare offered vs required codecs. -- **Location Service Error 480**: User temporarily unavailable. Check device status. - -### Ooooo (Ooh Pattern - Ooooo in SSE Logs) -- Periodic patterns of capital and lowercase letters in SSE logs represent media flow quality markers. -- All lowercase (`ooooo`) = no media detected. -- Capital letters = media packets detected in that interval. - -## 9. SIP Dialog Correlation Across Services - -To trace a single call across Mobius, SSE, and WxCAS logs: - -1. **Call-ID**: Same across all services for a given dialog leg. Search all log sources with the same Call-ID. -2. **From-tag / To-tag**: Combined with Call-ID, uniquely identifies a dialog. Use to distinguish forked calls. -3. **CSeq**: Sequence number per method. 
Helps order messages within a dialog. -4. **Via branch**: Transaction identifier. Same branch = same transaction across hops. -5. **Tracking ID**: Webex-specific. Correlates browser session to SIP dialog. Found in Mobius logs and X-headers in SIP. - -### Cross-Service Mapping -| Identifier | Mobius Logs | SSE/MSE Logs | WxCAS Logs | -|-----------|------------|-------------|-----------| -| Call-ID | `sipCallId` field | `Call-ID` header | `Call-ID` header | -| Tracking ID | `trackingId` field | X-Cisco-TrackingId header | X-Cisco-TrackingId header | -| Session ID | `sessionId` / `localSessionId` | Session-ID header | Session-ID header | -| Correlation ID | `correlationId` field | X-Cisco-CorrelationId | X-Cisco-CorrelationId | diff --git a/agents/chat_agent/agent.py b/agents/chat_agent/agent.py index 09370c2..f5d6993 100644 --- a/agents/chat_agent/agent.py +++ b/agents/chat_agent/agent.py @@ -3,10 +3,94 @@ from dotenv import load_dotenv from google.adk.agents import LlmAgent from google.adk.models.lite_llm import LiteLlm +from google.adk.tools import FunctionTool +from google.adk.tools.tool_context import ToolContext env_path = Path(__file__).parent.parent / ".env" load_dotenv(dotenv_path=env_path) +from analyze_agent_v2.agent import ( + architecture_skill_toolset, + sip_flow_skill_toolset, + mobius_skill_toolset as mobius_error_skill_toolset, +) +from search_agent_v2.agent import _log_cache + + +def _get_state_or_cache(tool_context: ToolContext, key: str) -> str: + """Read from tool_context.state first; fall back to the module-level log cache.""" + value = tool_context.state.get(key, "") + if value: + return value + session_id = tool_context._invocation_context.session.id + return _log_cache.get(session_id, {}).get(key, "") + + +def get_raw_logs(service: str, tool_context: ToolContext) -> dict: + """Retrieve raw logs for a specific service from the current analysis. + + Args: + service: One of "mobius", "sse_mse", "wxcas", "sdk", or "all". 
+ + Returns: + A dict with the requested logs, or an error if not available. + """ + key_map = { + "mobius": "mobius_logs", + "sse_mse": "sse_mse_logs", + "sse": "sse_mse_logs", + "mse": "sse_mse_logs", + "wxcas": "wxcas_logs", + "sdk": "sdk_logs", + } + + service_lower = service.lower().strip() + + if service_lower == "all": + return { + "mobius_logs": _get_state_or_cache(tool_context, "mobius_logs"), + "sse_mse_logs": _get_state_or_cache(tool_context, "sse_mse_logs"), + "wxcas_logs": _get_state_or_cache(tool_context, "wxcas_logs"), + "sdk_logs": _get_state_or_cache(tool_context, "sdk_logs"), + } + + state_key = key_map.get(service_lower) + if not state_key: + return { + "error": f"Unknown service '{service}'. Use one of: mobius, sse_mse, wxcas, sdk, all.", + } + + logs = _get_state_or_cache(tool_context, state_key) + if not logs: + return {"logs": "", "message": f"No {service} logs available in the current analysis."} + + return {"logs": logs} + + +def get_sequence_diagram(tool_context: ToolContext) -> dict: + """Retrieve the PlantUML sequence diagram for the current analysis. + + Returns: + A dict with the diagram code, or a message if not available. + """ + diagram = tool_context.state.get("sequence_diagram", "") + if not diagram: + return {"diagram": "", "message": "No sequence diagram available for the current analysis."} + return {"diagram": diagram} + + +def get_search_summary(tool_context: ToolContext) -> dict: + """Retrieve the search statistics for the current analysis. + + Returns: + A dict with log counts, BFS depth, environments, and IDs searched. 
+ """ + summary = _get_state_or_cache(tool_context, "search_summary") + if not summary: + return {"summary": "", "message": "No search summary available."} + return {"summary": summary} + + chat_agent = LlmAgent( model=LiteLlm( model="openai/gpt-4.1", @@ -17,6 +101,14 @@ description="Conversational assistant for the Webex Calling Log Analyzer.", name="chat_agent", output_key="chat_response", + tools=[ + FunctionTool(get_raw_logs), + FunctionTool(get_sequence_diagram), + FunctionTool(get_search_summary), + architecture_skill_toolset, + sip_flow_skill_toolset, + mobius_error_skill_toolset, + ], instruction="""You are a conversational assistant for the Webex Calling Log Analyzer. You help engineers explore and understand analysis results produced by the log-analysis pipeline. You are READ-ONLY — you never run searches, never @@ -26,16 +118,15 @@ AVAILABLE CONTEXT ================================================================ -These state variables are injected from prior pipeline agents. -They may be EMPTY if no search has been run yet. +Primary analysis (always in context): + {analyze_results} + +The following data is available ON-DEMAND via tools (not loaded +into context by default — call the tool only when needed): - Analysis (primary source of truth) : {analyze_results} - Search statistics : {search_summary} - Sequence diagram (PlantUML) : {sequence_diagram} - Raw Mobius logs : {mobius_logs} - Raw SSE/MSE logs : {sse_mse_logs} - Raw WxCAS logs : {wxcas_logs} - SDK/Client logs (uploaded) : {sdk_logs} + get_raw_logs(service) — raw Mobius, SSE/MSE, WxCAS, or SDK logs + get_sequence_diagram() — PlantUML sequence diagram + get_search_summary() — search statistics (log counts, BFS depth, IDs) ================================================================ RULE 0 — CONTEXT TRACKING (READ THIS FIRST) @@ -155,28 +246,53 @@ ── RAW LOG REQUESTS ("show logs", "give me the raw Mobius logs") ── -Clarify which service if not specified: - "Which logs? 
Mobius, SSE/MSE, or WxCAS?" -Return logs as stored — preserve JSON, sort by @timestamp ascending. +Call get_raw_logs(service) with the appropriate service name: + "mobius", "sse_mse", "wxcas", "sdk", or "all". +If the user doesn't specify which service, ask: + "Which logs? Mobius, SSE/MSE, WxCAS, or SDK?" +Return logs as received — preserve JSON, sort by @timestamp ascending. If user asks for ALL logs, warn: "This is a large output. Continue?" + then call get_raw_logs("all"). ── DIAGRAM REQUESTS ("show diagram", "give PlantUML") ── -Return {sequence_diagram} in a code block. Do NOT return it for -non-diagram questions. For modifications, generate updated PlantUML -keeping the same style. +Call get_sequence_diagram() and return the result in a code block. +Do NOT call this tool for non-diagram questions. +For modifications, generate updated PlantUML keeping the same style. ── SEARCH STATISTICS ("how many logs?", "what was searched?") ── -Use {search_summary} for log counts, BFS depth, environments, IDs. +Call get_search_summary() for log counts, BFS depth, environments, IDs. ── TIMING ("how long did the call take?", "setup time?") ── Extract timestamps from analysis. Calculate and present durations. -── TELECOM CONCEPTS ("what is ICE?", "what is SIP 480?") ── +── TELECOM CONCEPTS ("what is ICE?", "what is SIP 480?", "what does + mobius-error 115 mean?", "explain the role of SSE") ── -2–3 sentences max. Just enough to understand the analysis. +You have three reference skills you can consult for accurate answers: + + • architecture_endpoints_skill — service roles (Mobius, SSE, MSE, + WxCAS, CPAPI, Mercury, etc.), signaling/media paths, call types + and routing (WebRTC-to-PSTN, Contact Center, etc.), and topology. + Use when the user asks about what a service does, how traffic flows, + or how components connect. 
+ + • sip_flow_skill — SIP message sequences (INVITE, BYE, REFER, etc.), + SIP response code meanings (480, 488, 503, etc.), SDP negotiation, + SIP timers, and common failure patterns (one-way audio, 32s drops). + Use when the user asks about SIP codes, call setup flows, or + protocol-level behavior. + + • mobius_error_id_skill — Mobius-specific error codes (101–121), + their meanings, root causes, user impact, and what to check in logs. + Use when the user asks about a mobius-error code or a Mobius HTTP + error (403/503/etc.) in the context of registration or calls. + +Use these skills to give precise, reference-backed answers rather than +relying on general knowledge. Keep answers concise (2–5 sentences) +unless the user asks for more detail. ── NEW / UNKNOWN IDENTIFIER ── @@ -184,13 +300,6 @@ "This identifier does not appear in the current analysis. Please run a new search with that ID." -── COMPARISON REQUESTS ("compare the two searches") ── - -If conversation history contains results from multiple searches, -compare based on what you remember from the conversation. Note that -only the LATEST results are in state — earlier results may have been -overwritten. Be transparent about what you can and cannot compare. 
- ================================================================ WHAT YOU MUST NEVER DO ================================================================ diff --git a/agents/query_router/agent.py b/agents/query_router/agent.py index d464592..2af7349 100644 --- a/agents/query_router/agent.py +++ b/agents/query_router/agent.py @@ -432,6 +432,7 @@ async def _run_async_impl( logger.info("[query_analyzer] Running pipeline") async for event in pipeline.run_async(ctx): yield event + else: logger.info("[query_analyzer] Skipping pipeline — passing to chat_agent") return diff --git a/agents/search_agent_v2/agent.py b/agents/search_agent_v2/agent.py index 6a88a24..367bd60 100644 --- a/agents/search_agent_v2/agent.py +++ b/agents/search_agent_v2/agent.py @@ -19,6 +19,8 @@ import time import requests from collections import deque + +_log_cache: dict[str, dict[str, str]] = {} from pathlib import Path from typing import Any, AsyncGenerator, Optional from typing_extensions import override @@ -1370,6 +1372,14 @@ async def _run_async_impl( [hit.get("_source", {}) for hit in all_logs["wxcas"]], default=str ) + _log_cache[ctx.session.id] = { + "mobius_logs": ctx.session.state["mobius_logs"], + "sse_mse_logs": ctx.session.state["sse_mse_logs"], + "wxcas_logs": ctx.session.state["wxcas_logs"], + "all_logs": ctx.session.state["all_logs"], + "search_summary": ctx.session.state["search_summary"], + } + logger.info( f"[{self.name}] ══ Search complete ══\n" f" Mobius: {len(all_logs['mobius'])} logs\n" From 6a43a291230e2d67756bf12e89a86077d88a3247 Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Tue, 10 Mar 2026 23:59:19 +0530 Subject: [PATCH 2/9] fix : update skills name convention --- .../architecture-endpoints-skill/SKILL.md | 14 + .../references/architecture_and_endpoints.md | 53 +++ .../references/calling_flow.md | 11 + .../references/contact_center_flow.md | 24 ++ .../skills/mobius-error-id-skill/SKILL.md | 20 ++ .../references/mobius_error_ids.md | 325 ++++++++++++++++++ 
.../skills/sip-flow-skill/SKILL.md | 19 + .../sip-flow-skill/references/sip_flows.md | 250 ++++++++++++++ 8 files changed, 716 insertions(+) create mode 100644 agents/analyze_agent_v2/skills/architecture-endpoints-skill/SKILL.md create mode 100644 agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/architecture_and_endpoints.md create mode 100644 agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/calling_flow.md create mode 100644 agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/contact_center_flow.md create mode 100644 agents/analyze_agent_v2/skills/mobius-error-id-skill/SKILL.md create mode 100644 agents/analyze_agent_v2/skills/mobius-error-id-skill/references/mobius_error_ids.md create mode 100644 agents/analyze_agent_v2/skills/sip-flow-skill/SKILL.md create mode 100644 agents/analyze_agent_v2/skills/sip-flow-skill/references/sip_flows.md diff --git a/agents/analyze_agent_v2/skills/architecture-endpoints-skill/SKILL.md b/agents/analyze_agent_v2/skills/architecture-endpoints-skill/SKILL.md new file mode 100644 index 0000000..08d6af2 --- /dev/null +++ b/agents/analyze_agent_v2/skills/architecture-endpoints-skill/SKILL.md @@ -0,0 +1,14 @@ +--- +name: architecture-endpoints-skill +description: Reference for Webex Calling and Contact Center service architecture, endpoint roles, signaling paths, and media paths. Use when analyzing logs to understand which service does what and how traffic flows. +--- + +# Architecture and Endpoints + +When analyzing Mobius, SSE, MSE, or WxCAS logs, use this skill to look up: + +- **Service roles**: What each endpoint (Mobius, SSE, MSE, WxCAS, CPAPI, CXAPI, U2C, WDM, Mercury, Kamailio, RTMS, RAS) does and how they interact. +- **Signaling and media paths**: End-to-end flow for WebRTC Calling vs. Contact Center (browser → Mobius → SSE → …). +- **Call types and routing**: WebRTC-to-WebRTC, WebRTC-to-PSTN, WebRTC-to-Desk Phone, and Contact Center flows. 
+ +Consult the reference document **references/architecture_and_endpoints.md** for the full descriptions. Use it to attribute log lines to the correct service and to explain signaling/media paths in your analysis. diff --git a/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/architecture_and_endpoints.md b/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/architecture_and_endpoints.md new file mode 100644 index 0000000..86a87c7 --- /dev/null +++ b/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/architecture_and_endpoints.md @@ -0,0 +1,53 @@ +# Architecture and Endpoints Reference + +## Endpoints — service roles and descriptions + +- **Webex SDK/Client** (Web or native app making the request): Chrome extension or any third-party web application that consumes the Webex Calling SDK. + +- **Mobius**: Microservice that interworks between WebRTC and SIP to enable Webex Calling users to register and make calls using a web browser. It translates browser-originated signaling (HTTP/WSS) into SIP for backend communication. + - **Mobius Multi-Instance Architecture**: Multiple Mobius servers are deployed across different geographic regions (e.g., US, EU, APAC). When a user initiates a call from their browser, their geolocation (based on IP) is used to route them to the nearest Mobius instance using a GeoDNS or load balancer. + - Mobius talks to the following components: + - **CPAPI** (Cisco Platform API): User entitlement and application metadata. + - **CXAPI** (Webex Calling Call Control API): Stateless micro-service that implements the messaging logic behind the Webex Calling Call Control API. When incoming requests are received, it validates that the customer/user the request is made on behalf of belongs to Webex Calling and has the appropriate scope and roles. It then converts the request to the appropriate OCI requests and routes it to the OCIRouter to route to the correct WxC deployment. 
+ +- **U2C (User to Cluster)**: Microservice that helps services find other service instances across multiple Data Centers. It takes a user's email or UUID and optional service name, and returns the catalog containing the service URLs. + +- **WDM (Webex squared Device Manager)**: Microservice responsible for registering a device and proxying feature toggles and settings for the user to bootstrap the Webex clients. + - If WDM shows many 502 responses, possible failing dependencies: Common Identity CI, Mercury API, U2C, Feature Service, Cassandra, Redis. + - If WDM is generating errors, either Locus will produce 502 responses or the clients will show an error. + +- **SSE (Signaling Service Edge)**: Edge component for SIP signaling. It communicates with two endpoints — Mobius and the application server WxCAS. + +- **MSE (Media Service Engine)**: Edge component for media relay that handles RTP for WebRTC clients. + +- **Webex Calling Application Server (WxCAS)**: Core control application in Webex Calling responsible for enabling communication between source SSE and destination SSE. + +- **Mercury**: Webex's real-time messaging and signaling service that establishes WebSocket connections and exchanges information in the form of events. Mobius uses Mercury to send events to the SDK. The SDK establishes a Mercury connection (WebSocket) to receive communication from Mobius. + +--- + +## WebRTC Calling — End-to-End Architecture + +**Signaling Path**: Browser → Mobius (HTTP/WSS→SIP translation) → SSE (SIP edge) → WxCAS (Application Server) → Destination + +**Media Path**: Browser ↔ MSE (DTLS-SRTP) ↔ Destination + +**Call Types & Routing:** + +- **WebRTC to WebRTC**: WxCAS resolves destination browser → Mobius notifies Browser 2 → Both browsers establish DTLS-SRTP with their local MSE. +- **WebRTC to PSTN**: WxCAS resolves PSTN destination → SSE signals toward Local Gateway → Browser↔MSE1 (DTLS-SRTP), MSE1↔MSE2 (RTP), MSE2→LGW (RTP→PSTN).
+- **WebRTC to Desk Phone**: WxCAS resolves desk phone → SSE coordinates with MSE → Browser↔MSE1 (DTLS-SRTP), MSE1↔MSE2 (RTP), MSE2→Desk Phone. + +--- + +## Contact Center — End-to-End Architecture + +**Signaling Path**: Browser → Mobius → SSE → Kamailio (SIP proxy) → Destination + +**Media Path**: Browser ↔ MSE ↔ Destination + +**Additional Contact Center Components:** + +- **Kamailio**: SIP proxy for Contact Center — handles SIP REGISTER, stores registration details on RTMS Application Server, routes calls to the appropriate destination. +- **RTMS**: Real-time microservice enabling persistent WebSocket connections between clients and backend services. +- **RAS** (Registration, Activation, and provisioning Service): Stores SIP REGISTER Contact and Path headers with expiry, maintains metrics for WebRTC active sessions and calls to WebRTC phones. diff --git a/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/calling_flow.md b/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/calling_flow.md new file mode 100644 index 0000000..ddb731e --- /dev/null +++ b/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/calling_flow.md @@ -0,0 +1,11 @@ +# WebRTC Calling — End-to-End Architecture + +**Signaling Path**: Browser → Mobius (HTTP/WSS→SIP translation) → SSE (SIP edge) → WxCAS (Application Server) → Destination + +**Media Path**: Browser ↔ MSE (DTLS-SRTP) ↔ Destination + +**Call Types & Routing:** + +- **WebRTC to WebRTC**: WxCAS resolves destination browser → Mobius notifies Browser 2 → Both browsers establish DTLS-SRTP with their local MSE. +- **WebRTC to PSTN**: WxCAS resolves PSTN destination → SSE signals toward Local Gateway → Browser↔MSE1 (DTLS-SRTP), MSE1↔MSE2 (RTP), MSE2→LGW (RTP→PSTN). +- **WebRTC to Desk Phone**: WxCAS resolves desk phone → SSE coordinates with MSE → Browser↔MSE1 (DTLS-SRTP), MSE1↔MSE2 (RTP), MSE2→Desk Phone. 
diff --git a/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/contact_center_flow.md b/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/contact_center_flow.md new file mode 100644 index 0000000..9f5f82c --- /dev/null +++ b/agents/analyze_agent_v2/skills/architecture-endpoints-skill/references/contact_center_flow.md @@ -0,0 +1,24 @@ +# Contact Center — End-to-End Architecture + +**Signaling Path**: Browser → Mobius → SSE → Kamailio (SIP proxy) → Destination + +**Media Path**: Browser ↔ MSE ↔ Destination + +**Additional Contact Center Components:** + +- **Kamailio**: SIP proxy for Contact Center — handles SIP REGISTER, stores registration details on RTMS Application Server, routes calls to the appropriate destination. +- **RTMS**: Real-time microservice enabling persistent WebSocket connections between clients and backend services. +- **RAS** (Registration, Activation, and Provisioning Service): Stores SIP REGISTER Contact and Path headers with expiry, maintains metrics for WebRTC active sessions and calls to WebRTC phones. + +**Health ping:** Mobius exposes `/api/v1/ping` (and internal variants). Response should be 200 OK with body `{"message":"Healthy"}`. Per-region endpoints exist for INT and PROD (e.g. US, CA, EU, UAE, UK, AU Sydney/Melbourne, South Africa, Saudi Arabia, Singapore). Backup clusters exist in some regions (e.g. UK, AU, South Africa, Saudi). Use the Mobius Runbook for the full health-check endpoint table and cluster list. + +**Timers in Mobius:** + +- **Registration keepalive (browser):** Browser sends a keepalive periodically to keep the registration active. Sent every **30 seconds**; after **5 missed** keepalives, Mobius triggers unregistration. + +- **Call keepalive:** Browser sends a keepalive during a call to keep it active; valid **within 15 minutes**. +- **SIP APP – Registration refresh:** Depends on the Expires header from REGISTER. Mobius refreshes registration with SSE at **3/4 of the Expires value** before expiry.
+- **SIP APP – Options scheduler:** Mobius sends OPTIONS ping to SSE to check connectivity; interval **35 seconds**. + +**Kafka (call failover):** When Mobius cannot find registration in local cache but finds it in global DB (e.g. after pod/region failover), the new pod communicates with the old pod via Kafka for owner change. Topic: `MOBIUS_REG_HANDOVER_TOPIC = "RegistrationHandover"`, group: `KAFKA_GROUP_ID = "registration_handover"`. + +**Inter-regional failover (updated):** In regions with a local backup (e.g. AU: SYD + MEL), if both primary and backup Mobius clusters are down (e.g. both SYD and MEL cannot reach SSE), Client/SDK will **not** failover to US. All WebRTC calls (WxC, WxCC, Guest Calling) in that region can be non-operational. If only the primary cluster is down (SSEs up), Client/SDK fails over to the backup in the same region (e.g. SYD → MEL). Singapore (SIN) and Canada (CA) have a single Mobius cluster each; they continue to use backup regions (e.g. SYD, US-East) when needed. diff --git a/agents/analyze_agent_v2/skills/mobius-error-id-skill/SKILL.md b/agents/analyze_agent_v2/skills/mobius-error-id-skill/SKILL.md new file mode 100644 index 0000000..8a5d048 --- /dev/null +++ b/agents/analyze_agent_v2/skills/mobius-error-id-skill/SKILL.md @@ -0,0 +1,20 @@ +--- +name: mobius-error-id-skill +description: Interpret Mobius error and call IDs from logs using the reference documentation. +--- + +# Mobius Error ID Lookup + +When analyzing Mobius logs, use this skill to interpret error IDs and call-related identifiers. + +## Instructions + +1. **Identify Mobius error IDs and call IDs** in the logs you are analyzing (e.g. from `mobius_logs`, or any field containing Mobius error codes, call IDs, or tracking identifiers that map to known error semantics). + +2. **Consult the reference document** `references/mobius_error_ids.md` to look up each such ID. 
That document contains the authoritative mapping of Mobius error/call IDs to their meanings, causes, and remediation notes. + +3. **Include the interpretation in your analysis:** + - In **Error Detection and Root Cause Analysis**: For each Mobius error ID found, state what the ID means (from the reference), likely cause, and suggested fix. + - In **Root Cause Analysis** (if applicable): Reference the documentation so your explanation is consistent with the defined semantics of each ID. + +4. If an ID is not present in the reference document, say so and describe the ID and context so a human can triage or update the documentation. diff --git a/agents/analyze_agent_v2/skills/mobius-error-id-skill/references/mobius_error_ids.md b/agents/analyze_agent_v2/skills/mobius-error-id-skill/references/mobius_error_ids.md new file mode 100644 index 0000000..9f3b508 --- /dev/null +++ b/agents/analyze_agent_v2/skills/mobius-error-id-skill/references/mobius_error_ids.md @@ -0,0 +1,325 @@ +# Mobius Error IDs and Call ID Reference + +Reference for Mobius HTTP response codes and `mobius-error` codes. Use this to interpret errors in Mobius logs (e.g. `fields.response_status`, mobius-error in response body). **For each error, use the detailed sections below to explain what the error means, user/call impact, likely root cause, and what to check next in your analysis.** + +--- + +## Registration errors (detailed) + +### 403 FORBIDDEN — mobius-error 101: Per-user device limit exceeded + +**What it means:** The client tried to register a device (browser/WebRTC client) with Mobius, but the system rejected it because the user has hit the maximum number of allowed device registrations, or this specific device is already considered registered, or a registration for this device is already in progress. + +**User/call impact:** The user cannot register this device for WebRTC calling. 
They may see a registration failure in the client; existing calls on other devices may work, but this device will not be able to place or receive calls until registration succeeds. + +**Root cause direction:** (1) User has genuinely reached the global per-user device limit — check entitlement and how many devices are registered. (2) Stale or duplicate registration: the device is already in Mobius/Redis cache, or a previous registration never completed and is still in progress. (3) Race: two registration requests for the same device; one may succeed and the other gets 101. + +**What to check in logs:** Look for multiple REGISTER attempts for the same `DEVICE_ID` or `USER_ID`; check Redis/local cache state if available. Correlate with unregistration and keepalive logs to see if an old registration was not cleaned up. In your analysis, state whether this looks like a limit issue vs. duplicate/stale registration. + +**Log pattern:** Log sample available for error code 101. + +--- + +### 403 FORBIDDEN — mobius-error 102: Device creation not enabled for user + +**What it means:** Mobius attempted to create or enable a device for this user by talking to CPAPI (Cisco Platform API). CPAPI either did not return a valid browser client ID or SIP address, or returned a 4xx response, so Mobius will not create the device. + +**User/call impact:** Registration fails. The user cannot use this device for WebRTC calling. This is typically an entitlement or provisioning issue, not a transient network failure. + +**Root cause direction:** (1) User not entitled for WebRTC/browser calling in CPAPI. (2) CPAPI returned invalid or missing browserClientId/sipAddress. (3) CPAPI returned 4xx (e.g. forbidden, not found) — the exact CPAPI response in upstream logs is key. + +**What to check in logs:** Find the CPAPI request/response preceding this error (same time window, same user/device). Check for CPAPI 4xx, timeouts, or empty/invalid browserClientId or sipAddress. 
In analysis, state whether the failure is at CPAPI (entitlement/provisioning) vs. Mobius logic. + +**Log pattern:** Log sample available for error code 102. + +--- + +### 403 FORBIDDEN — mobius-error 103: Device creation failed + +**What it means:** Mobius tried to complete device creation by sending a SIP REGISTER to the SSE (Signaling Service Edge). The SSE responded with a final error (e.g. 403), so device creation failed. + +**User/call impact:** Registration fails. The browser/client cannot be used for calling. The failure is downstream of Mobius (at the SIP/SSE layer). + +**Root cause direction:** SSE rejected the REGISTER — reasons can include policy, capacity, or invalid/duplicate registration data. The SSE 403 (or other final error) is the direct cause; Mobius is correctly surfacing it. + +**What to check in logs:** Correlate with SSE logs for the same time and REGISTER transaction (e.g. same sipCallId or correlation IDs). Identify the exact SIP response code and reason from SSE. In analysis, attribute the root cause to SSE and note the SIP response code and any reason phrase. + +**Log pattern:** None specifically; look for REGISTER and final response in SIP/SSE logs. + +--- + +### 503 SERVICE UNAVAILABLE (registration) + +**What it means:** Registration could not be completed because of a transient or backend failure: timeout to SSE or CPAPI, failure to store state in Redis or local map, outbound proxy (OBP) resolution failure, or an unhandled exception in the registration handler. + +**User/call impact:** Registration fails. The user may retry; some causes are transient (timeouts, temporary SSE/CPAPI unavailability), others indicate a misconfiguration or backend issue (Redis, OBP). + +**Root cause direction:** Use the log message to narrow down: + +- **"Registration failed due to connection timeout for device: {}"** — Mobius could not establish or maintain connection to SSE within the timeout. Check SSE health, network path, and load. 
+- **"No more retries left, failed to setup connection with all remote hosts"** — Mobius tried all known SSE nodes and could not connect. Points to SSE cluster or network issue. +- **"Failed to add device: {} in sorted set"** — Redis write failed. Check Redis connectivity and capacity. +- **"Device entry insertion failed for deviceId {} sipAoR {} outboundProxy {} userUUID {}"** — Local in-memory map insertion failed (e.g. duplicate key or resource). +- **"BaseHandler Exception: {}"** — Unhandled exception in registration handler; inspect stack trace for the actual failure. +- **"OBP resolution failed"** — Outbound proxy (SSE address) resolution failed (DNS or config). +- **"Unhandled SIP response code received"** — SSE sent a SIP response that Mobius does not handle explicitly; note the code in analysis. +- **"No more retries left, registration failed for device: {} with status: {} {}"** — SSE returned an error after retries; the status and message are the direct cause. +- **"Received Client Exception from Provisioning Client while querying Browser Client Id."** — CPAPI call (for browser client id) timed out or threw. Check CPAPI availability and latency. + +**What to check in logs:** Match the exact log message above to the failure; then correlate with SSE, CPAPI, or Redis logs in the same time window. In analysis, name the failing dependency (SSE, CPAPI, Redis, OBP) and the concrete log message. + +**Log samples:** "Registration: Error code: 503 SERVICE UNAVAILABLE due to error response from SSE" or "due to CPAPI timeout". + +--- + +### 401 UNAUTHORIZED (registration) + +**What it means:** The user’s token (used for the registration request) was rejected. Common Identity (CI) returned 401, so Mobius does not consider the user authenticated. + +**User/call impact:** Registration fails. The client must present a valid token (re-auth, token refresh, or re-login). 
+ +**Root cause direction:** Expired or invalid token; CI outage or misconfiguration; wrong token or audience. Not usually a Mobius bug. + +**What to check in logs:** Confirm CI 401 in upstream calls; check token expiry and issuer/audience if available. In analysis, state that authentication failed and whether it is token lifecycle vs. CI/service issue. + +--- + +### 501 NOT IMPLEMENTED (registration) + +**What it means:** An uncaught or unhandled exception occurred during registration. The "501 NOT IMPLEMENTED" is often a generic surface for unexpected code paths (e.g. "Incorrect User Data"). + +**User/call impact:** Registration fails. May be a bug or unexpected input. + +**Root cause direction:** Look for "Incorrect User Data" or similar in logs; indicates bad or unexpected user/device data. Could be client bug, schema change, or missing validation. + +**What to check in logs:** Search for "Incorrect User Data" and any stack trace in the same request. In analysis, note whether this looks like bad input vs. server-side bug. + +--- + +## Unregistration errors (detailed) + +### 404 NOT FOUND (unregistration) + +**What it means:** The client asked to unregister a device, but Mobius has no registration record for that identifier (e.g. device already unregistered, or wrong ID). + +**User/call impact:** Unregistration request fails. From a user perspective this is often harmless (device was already unregistered); sometimes it indicates a client sending unregister for a non-existent registration. + +**Root cause direction:** (1) Idempotent: device was already unregistered. (2) Client sent wrong device/session ID. (3) Registration expired or was cleaned up elsewhere before unregister arrived. + +**What to check in logs:** "Registration not found for {}" — confirm the identifier in the message. In analysis, state whether this is expected (idempotent) or points to client/identifier mix-up. 
+ +--- + +### 503 SERVICE UNAVAILABLE (unregistration) + +**What it means:** Unregistration could not be completed. Two main cases: (1) Mobius intentionally blocks unregister because the device still has active call(s) — unregister would be unsafe. (2) An exception occurred while handling the unregistration request. + +**User/call impact:** Unregistration fails. In case (1), this is correct behavior (protecting active calls). In case (2), the device may remain in a registered state until retry or cleanup. + +**Root cause direction:** (1) "Call/s exist for this device, can not continue with unregister request." — By design; do not treat as a bug. (2) "Caught exception {} while handling unregistration" — Inspect the exception; could be backend (e.g. Redis) or logic bug. + +**What to check in logs:** Match the message above; if calls exist, correlate with call logs for that device. In analysis, distinguish "blocked by design" vs. "exception during unregister". + +--- + +### 501 NOT IMPLEMENTED (unregistration) + +**What it means:** Unhandled exception during unregistration (e.g. "Incorrect User Data"). Same interpretation as 501 for registration. + +**User/call impact:** Unregistration may not complete; device state may be inconsistent. + +**What to check in logs:** Look for "Incorrect User Data" or exception details. In analysis, note bad input or server-side handling gap. + +--- + +## Call errors (detailed) + +### 403 FORBIDDEN — mobius-error 112: Device is not registered + +**What it means:** The client sent a call-related request (e.g. make call, answer, disconnect) using a device-id that Mobius does not have in its registration state. The device either never registered successfully or already unregistered. + +**User/call impact:** The call action fails. User may see "device not registered" or similar. They may need to re-register (refresh, reload, or re-login) before placing or answering calls. 
+ +**Root cause direction:** (1) Registration failed earlier (check for 101/102/103/503 in registration flow). (2) Registration expired or was removed (keepalive miss, unregister). (3) Client using wrong device-id or session. (4) Race: unregister completed before call request was processed. + +**What to check in logs:** Confirm there is no successful registration for this device-id in the same time window; look for prior registration failures or unregister. "Call: Error code: 403 : FORBIDDEN, mobius-error: 112, due to Device not registered". In analysis, tie 112 to the registration state and any prior registration/unregistration events. + +--- + +### 403 FORBIDDEN — mobius-error 115: User Busy + +**What it means:** The device cannot accept this call (or call action) because it is already busy: either a call is in the process of being set up ("allocating" state) or the SSE returned SIP 486 Busy Here. + +**User/call impact:** Incoming call may not be presented, or a new outbound call may fail. The user or the other party may see "busy" or "user busy". + +**Root cause direction:** (1) Legitimate: user/device is on another call or call is being set up. (2) Stuck state: a previous call left the device in "allocating" and never cleared — look for incomplete call teardown. (3) SSE sent 486 (callee busy) — downstream routing or endpoint state. + +**What to check in logs:** Look for another call or "allocating" state for the same device; look for 486 in SIP logs. "Call: Error code: 403: FORBIDDEN, mobius-error: 115, due to User busy". In analysis, state whether this is expected (user busy) vs. possible stuck state. + +--- + +### 403 FORBIDDEN — mobius-error 118: Not Acceptable + +**What it means:** The call or session setup was rejected by the SSE with SIP 488 Not Acceptable. Typically relates to SDP/negotiation: codecs, media, or session parameters were not acceptable to the far end or the network. + +**User/call impact:** Call setup fails. 
User may see a generic call failure or "not acceptable" type message. + +**Root cause direction:** SDP/codec mismatch, unsupported media type, or policy rejection. Check SDP in SIP messages (offer/answer) and any codec or media restrictions in SSE/downstream. + +**What to check in logs:** Find the 488 response from SSE and the associated INVITE/offer. Compare SDP (m= lines, codecs) with what the client sent. In analysis, describe the negotiation failure (e.g. no common codec, or rejected media). + +--- + +### 403 FORBIDDEN — mobius-error 119: Call Rejected + +**What it means:** The call was explicitly rejected by the SSE with SIP 403 or 603 (or equivalent). The network or destination rejected the call, not Mobius. + +**User/call impact:** Call fails; user may see "call rejected" or "declined". + +**Root cause direction:** Policy (e.g. blocking), destination rejected (603 Decline), or SSE/backend returned 403/603. The root cause is downstream; Mobius is forwarding the rejection. + +**What to check in logs:** Find the 403/603 from SSE and the reason phrase. Correlate with WxCAS/SSE logs for routing and rejection reason. In analysis, state who rejected (SSE, destination, policy) and the SIP code and reason. + +--- + +### 403 FORBIDDEN — mobius-error 121: Mid Call Request Rejected + +**What it means:** A mid-call request (e.g. hold, resume, transfer, add media) was sent to CXAPI (Call Control API), and CXAPI returned a 4xx response. Mobius surfaces this as 121. + +**User/call impact:** The mid-call action (hold, transfer, etc.) fails. The call may remain in its previous state; user may see an error for that action. + +**Root cause direction:** CXAPI rejected the request — invalid state, invalid parameters, or policy. Check CXAPI logs and the specific 4xx code and body. + +**What to check in logs:** Correlate with CXAPI request/response for the same call and timestamp. Note the 4xx code and any error message. 
In analysis, attribute the failure to CXAPI and the reason (state, params, or policy). + +--- + +### 503 SERVICE UNAVAILABLE — mobius-error 117: Timeout error + +**What it means:** Offer-answer (SDP) negotiation did not complete within the expected time (ROAP_TIMEOUT). The client or the network did not complete the exchange in time. + +**User/call impact:** Call setup fails with a timeout. User may see "call failed" or "timeout". + +**Root cause direction:** (1) Network or client latency; (2) Client or far end not responding to offer/answer in time; (3) ROAP timeout value too short for the path. Check timing between offer and answer in logs. + +**What to check in logs:** Measure time between sending offer and receiving answer (or timeout). Look for delayed or lost SIP/HTTP messages. In analysis, state whether the timeout is client-side, network, or backend delay. + +--- + +### 503 SERVICE UNAVAILABLE — mobius-error 120: Not Available + +**What it means:** The call or request could not be fulfilled because the SSE rejected it, CXAPI returned 5xx/6xx or threw, or an unknown exception occurred in the call handler (e.g. null pointer). This is a catch-all for "service or logic unavailable". + +**User/call impact:** Call or call action fails. User may see a generic error or "not available". + +**Root cause direction:** (1) SSE rejection — check SSE logs for the same transaction. (2) CXAPI 5xx/6xx or exception — check CXAPI logs. (3) Null pointer or exception in Mobius call handler — look for stack trace; may be a bug or unexpected state. Example: "Call: Error code: 503 :SERVICE UNAVAILABLE, mobius-error: 120, due to null pointer exception while processing Connect request from client". + +**What to check in logs:** Match the exact "due to" message; correlate with SSE and CXAPI. In analysis, name the failing component and whether it looks like backend failure vs. Mobius bug (e.g. NPE). 
+ +--- + +### 404 NOT FOUND — mobius-error 113: Call not found + +**What it means:** The client sent a request for an existing call (e.g. answer, disconnect, update) using a call-id that Mobius does not have in its call state. The call may have already ended, or the call-id is wrong or from another instance. + +**User/call impact:** The call action fails. User may see "call not found" or similar. The call may have been torn down already, or there is a client/state sync issue. + +**Root cause direction:** (1) Call already ended (BYE, timeout, or cleanup). (2) Client using stale or wrong call-id. (3) Request routed to a Mobius instance that does not have this call (instances may not share call state). (4) Race: teardown completed before the request was processed. + +**What to check in logs:** "Call: Error code: 404 NOT FOUND, mobius-error: 113, Call not found". Look for BYE or call teardown for this call-id before the 404; check if multiple Mobius instances are involved. In analysis, state whether the call was already gone vs. wrong ID vs. instance mismatch. + +--- + +### 500 INTERNAL SERVER ERROR — mobius-error 114: Error in processing call + +**What it means:** Something went wrong while Mobius was processing the call: SSE rejected a request, an exception occurred while processing the SSE response, or the client sent an event that is not valid in the current call state (e.g. answer when not ringing). + +**User/call impact:** Call or call action fails; user may see a generic server error. + +**Root cause direction:** (1) SSE rejection — see SSE logs. (2) Unknown exception processing SSE response — look for stack trace. (3) "Client event isn't supported in current call state" — client sent an out-of-order or invalid event (e.g. answer before 180/183, or disconnect in wrong state). Example: "Call: Error code: 500 SERVER ERROR, mobius-error: 114, due to client event isn't supported in current call state." 
+ +**What to check in logs:** Match the "due to" message; correlate call state (ringing, connected, etc.) with the client event. In analysis, state whether the failure is client protocol/state machine vs. SSE/Mobius backend. + +--- + +### 400 BAD REQUEST (calls) + +**What it means:** The request body or parameters were invalid: parse error, schema mismatch, or missing required field. Mobius could not interpret or validate the request. + +**User/call impact:** Call or call action fails with a bad request. Often a client bug or version mismatch. + +**Root cause direction:** Malformed JSON, wrong schema, or client sending unexpected/old format. Check the request payload in logs. In analysis, note the invalid field or parse error if present. + +--- + +### 501 NOT IMPLEMENTED (calls) + +**What it means:** The endpoint or call flow is not implemented in Mobius. The client may be using a newer API or a flow that this version of Mobius does not support. + +**User/call impact:** Request fails with "not implemented". + +**Root cause direction:** API/version mismatch or feature not yet implemented. In analysis, note the endpoint or flow and suggest checking client and Mobius versions. + +--- + +## Other / Ingress and platform errors (detailed) + +### 429 TOO MANY REQUESTS + +**What it means:** Nginx (or the ingress layer) is rate-limiting because the number of requests exceeded the configured threshold. This is a DoS/abuse protection. + +**User/call impact:** Requests are rejected with 429; users may see errors or throttling. Can affect many users if a single client or script is noisy. + +**Root cause direction:** (1) Noisy client or script (e.g. retries, polling). (2) Traffic spike. (3) Misconfigured threshold. Check request rate per client/IP in logs. In analysis, state whether this is expected rate limiting and which client or IP is driving the load. 
+ +--- + +### 503 SERVICE UNAVAILABLE (service not ready) + +**What it means:** The Mobius instance is not considered "ONLINE" — e.g. health check (ping) failed or the service has not finished starting. Load balancer or orchestrator may stop sending traffic to this instance. + +**User/call impact:** Requests to this instance fail with 503. Users may be routed to other instances; if all are down, calling fails. + +**Root cause direction:** Instance startup, dependency (e.g. Redis, SSE) failing health check, or overload. Check Mobius ping/health and dependency health. In analysis, state whether this is single-instance vs. broader outage. + +--- + +### 499 CLIENT CLOSED REQUEST + +**What it means:** The client closed the TCP/HTTP connection before the server sent the response. Nginx records this as 499. + +**User/call impact:** The request did not complete; the client may have navigated away, refreshed, or timed out on its side. + +**Root cause direction:** User action (close tab, navigate away), client timeout, or network drop. Usually not a server bug. In analysis, note that the client closed the connection and whether it correlates with timeouts or user actions. + +--- + +## Quick lookup: mobius-error code → meaning + +| mobius-error | Category | Short meaning | +|--------------|------------|------------------------------------------------------------| +| 101 | Registration | Per-user device limit exceeded | +| 102 | Registration | Device creation not enabled for user (CPAPI) | +| 103 | Registration | Device creation failed (SSE rejected REGISTER) | +| 112 | Calls | Device is not registered | +| 113 | Calls | Call not found | +| 114 | Calls | Error in processing call (SSE/call state/event) | +| 115 | Calls | User busy | +| 117 | Calls | Timeout (e.g. ROAP offer-answer) | +| 118 | Calls | Not acceptable (e.g. 
488 from SSE, SDP/codec) | +| 119 | Calls | Call rejected (403/603 from SSE) | +| 120 | Calls | Not available (SSE/CXAPI rejection or exception) | +| 121 | Calls | Mid-call request rejected (CXAPI 4xx) | + +--- + +## Useful Mobius log filters (Kibana / OpenSearch) + +- `tags: mobius` — all Mobius-related logs +- `fields.eventType`: `mobiusapp` (application), `sipmsg` (SIP), `httpmsg` (HTTP) +- `fields.id`: `mobiusreg`, `mobiuscall`, `mobiusconn`, `mobiusdns`, `mobiuscallkeepalive`, `mobiusregkeepalive` +- `fields.localAlias`: `mobius-reg-app`, `mobius-sip-app`, `mobius-call-app` +- `fields.response_status` — HTTP response to client +- `fields.mobiusCallId`, `fields.sipCallId`, `fields.DEVICE_ID`, `fields.USER_ID`, `fields.WEBEX_TRACKINGID` + +Index pattern: `wxm-app:logs*` (EU may use a separate index). diff --git a/agents/analyze_agent_v2/skills/sip-flow-skill/SKILL.md b/agents/analyze_agent_v2/skills/sip-flow-skill/SKILL.md new file mode 100644 index 0000000..1cb8994 --- /dev/null +++ b/agents/analyze_agent_v2/skills/sip-flow-skill/SKILL.md @@ -0,0 +1,19 @@ +--- +name: sip-flow-skill +description: Reference for SIP protocol flows, message sequences, error codes, and debugging patterns. Use when analyzing SIP signaling in SSE, MSE, WxCAS, or Mobius logs to understand expected message flows and diagnose failures. +--- + +# SIP Flow Reference + +When analyzing logs that contain SIP signaling (SSE/MSE logs from logstash-wxcalling, Mobius logs from logstash-wxm-app, or WxCAS logs), use this skill to: + +- **Understand expected SIP message sequences** for different call scenarios (basic call, call hold, transfer, forwarding, conference). +- **Interpret SIP response codes** and map them to root causes. +- **Identify SIP flow anomalies** such as missing ACKs, retransmissions, unexpected BYEs, or timeout-triggered responses. +- **Correlate SIP dialogs** across services using Call-ID, From/To tags, CSeq, and branch parameters. 
+ +Consult the reference document: + +- **references/sip_flows.md** — SIP message sequences, response code reference, SDP negotiation, timers, and common failure patterns. + +Use this skill alongside the architecture-endpoints-skill to attribute SIP messages to the correct service and understand the signaling path. diff --git a/agents/analyze_agent_v2/skills/sip-flow-skill/references/sip_flows.md b/agents/analyze_agent_v2/skills/sip-flow-skill/references/sip_flows.md new file mode 100644 index 0000000..4c49647 --- /dev/null +++ b/agents/analyze_agent_v2/skills/sip-flow-skill/references/sip_flows.md @@ -0,0 +1,250 @@ +# SIP Flow Reference + +## 1. Basic Call Setup (INVITE Transaction) + +### Successful Call +``` +Caller (Mobius) SSE WxCAS Callee (Mobius) + |--- INVITE -------->| | | + |<-- 100 Trying -----| | | + | |--- INVITE --->| | + | |<-- 100 Trying-| | + | | |--- INVITE -------->| + | | |<-- 100 Trying -----| + | | |<-- 180 Ringing ----| + | |<-- 180 Ring---| | + |<-- 180 Ringing -----| | | + | | |<-- 200 OK ---------| + | |<-- 200 OK ----| | + |<-- 200 OK ----------| | | + |--- ACK ------------>| | | + | |--- ACK ------>| | + | | |--- ACK ----------->| + | | | | + |==================== RTP Media (via MSE) =================| + | | | | + |--- BYE ------------>| | | + | |--- BYE ------>| | + | | |--- BYE ----------->| + | | |<-- 200 OK ---------| + | |<-- 200 OK ----| | + |<-- 200 OK ----------| | | +``` + +### Early Media (183 Session Progress) +When the callee provides early media (ringback tone, IVR prompts): +``` +Caller SSE WxCAS Callee + |--- INVITE ---->| | | + |<-- 100 Trying -| | | + | |--- INVITE --->| | + | | |--- INVITE ---->| + | | |<-- 183 + SDP --| + | |<-- 183 + SDP -| | + |<-- 183 + SDP --| | | + |--- PRACK ----->| | | + | |--- PRACK ---->| | + | | |--- PRACK ----->| + | | |<-- 200 (PRACK)-| + |============ Early Media (RTP via MSE) ==========| + | | |<-- 200 OK -----| + |<-- 200 OK -----| | | + |--- ACK ------->| | | +``` + +## 2. 
Call Hold / Resume + +### Hold (re-INVITE with sendonly) +``` +Holder SSE WxCAS Held Party + |--- re-INVITE -->| | | + | (a=sendonly) |--- re-INVITE->| | + | | |--- re-INVITE ->| + | | |<-- 200 OK -----| + | | | (a=recvonly) | + | |<-- 200 OK ----| | + |<-- 200 OK ------| | | + |--- ACK -------->| | | +``` + +### Resume (re-INVITE with sendrecv) +Same flow as hold but with `a=sendrecv` in SDP. + +## 3. Call Transfer (REFER) + +### Blind Transfer +``` +A (Transferor) SSE WxCAS B (Transferee) C (Target) + |--- REFER ------>| | | | + | Refer-To: C | | | | + |<-- 202 Accepted-| | | | + | |--- INVITE (B to C) ----->| | + | | | |--- INVITE --->| + |<-- NOTIFY ------| | | | + | (100 Trying) | | |<-- 200 OK ---| + |<-- NOTIFY ------| | | | + | (200 OK) | | | | + |--- BYE -------->| (A hangs up) | | + |<-- 200 OK ------| | | | +``` + +### Attended Transfer +Same as blind but the transferor first establishes a call with the target (consultation call), then sends REFER with `Replaces` header. + +## 4. Call Forwarding + +### Forwarding (302 Moved Temporarily) +``` +Caller SSE WxCAS Forwarder Target + |--- INVITE ---->| | | | + | |--- INVITE --->| | | + | | |--- INVITE ---->| | + | | |<-- 302 --------| | + | | | Contact: target | + | | |--- ACK ------->| | + | | |--- INVITE ---->|-------------->| + | | |<-- 200 OK -----|---------------| + | |<-- 200 OK ----| | | + |<-- 200 OK -----| | | | +``` + +## 5. SIP Response Code Reference + +### 1xx Provisional +| Code | Meaning | Notes | +|------|---------|-------| +| 100 | Trying | Hop-by-hop. Suppresses INVITE retransmissions. | +| 180 | Ringing | End-to-end. Callee alerting. | +| 181 | Call Is Being Forwarded | Optional. Indicates forwarding. | +| 183 | Session Progress | Early media. Contains SDP for early RTP. Requires PRACK if 100rel. | + +### 2xx Success +| Code | Meaning | Notes | +|------|---------|-------| +| 200 | OK | Final success. For INVITE: must be ACKed. | +| 202 | Accepted | Used for REFER. 
Subscription implicitly created. | + +### 3xx Redirection +| Code | Meaning | Notes | +|------|---------|-------| +| 301 | Moved Permanently | Target has moved. Update address. | +| 302 | Moved Temporarily | Used for call forwarding. Contains new Contact. | + +### 4xx Client Errors +| Code | Meaning | Common Causes in Webex Calling | +|------|---------|-------------------------------| +| 400 | Bad Request | Malformed SIP message, invalid SDP. | +| 401 | Unauthorized | Authentication required. Check credentials. | +| 403 | Forbidden | User not authorized. Check entitlements in CPAPI. | +| 404 | Not Found | Destination not registered or unknown. Check WxCAS routing. | +| 407 | Proxy Auth Required | Proxy authentication needed. | +| 408 | Request Timeout | No response from destination within timer B (32s). Destination unreachable. | +| 415 | Unsupported Media | SDP codec mismatch. Check offered vs supported codecs. | +| 480 | Temporarily Unavailable | Destination offline or not reachable. Check registration status. | +| 481 | Call/Transaction Does Not Exist | BYE/ACK for unknown dialog. Possible state mismatch or race condition. | +| 486 | Busy Here | Callee busy. | +| 487 | Request Terminated | INVITE cancelled by caller (CANCEL sent before final response). | +| 488 | Not Acceptable Here | SDP negotiation failure. Offered codecs/media not acceptable. | +| 491 | Request Pending | Glare: simultaneous re-INVITEs. Retry after random delay. | + +### 5xx Server Errors +| Code | Meaning | Common Causes in Webex Calling | +|------|---------|-------------------------------| +| 500 | Server Internal Error | Service crash or unhandled exception. Check SSE/WxCAS logs. | +| 502 | Bad Gateway | Upstream service failure. SSE couldn't reach WxCAS or destination. | +| 503 | Service Unavailable | Overloaded or in maintenance. Check health endpoints. | +| 504 | Server Timeout | Upstream response timeout. Check inter-service connectivity. 
| + +### 6xx Global Errors +| Code | Meaning | Notes | +|------|---------|-------| +| 600 | Busy Everywhere | Callee busy on all devices. | +| 603 | Decline | Callee explicitly rejected the call. | +| 604 | Does Not Exist Anywhere | Number/URI not found globally. | + +## 6. SDP Negotiation (Offer/Answer) + +### Key SDP Fields to Check +- `v=` — Protocol version (always 0) +- `o=` — Session originator (username, session-id, version, address) +- `c=` — Connection info (IP address for media) +- `m=` — Media line: `m=audio RTP/SAVPF ` or `m=video ...` +- `a=rtpmap:` — Codec mapping (e.g., `a=rtpmap:111 opus/48000/2`) +- `a=sendrecv` / `a=sendonly` / `a=recvonly` / `a=inactive` — Media direction +- `a=ice-ufrag` / `a=ice-pwd` — ICE credentials +- `a=candidate:` — ICE candidates (host, srflx, relay) +- `a=fingerprint:` — DTLS fingerprint for SRTP key exchange +- `a=setup:` — DTLS role (`actpass`, `active`, `passive`) + +### Common SDP Issues +- **Codec mismatch**: Offer contains codecs not in answer → 488 or no media +- **Missing ICE candidates**: No relay candidates → can fail behind strict NAT/firewall +- **Port 0 in answer**: Media stream rejected by answerer +- **IP mismatch**: SDP `c=` address unreachable → one-way or no audio +- **Direction conflict**: Both sides `sendonly` → no bidirectional media + +## 7. SIP Timers (RFC 3261) + +| Timer | Default | Purpose | +|-------|---------|---------| +| T1 | 500ms | RTT estimate. Base for retransmission intervals. | +| T2 | 4s | Maximum retransmission interval for non-INVITE requests. | +| T4 | 5s | Maximum time a message remains in the network. | +| Timer A | Initially T1 | INVITE retransmission interval (doubles each retransmit). | +| Timer B | 64*T1 (32s) | INVITE transaction timeout. No response → 408. | +| Timer C | >3min | Proxy INVITE transaction timeout. | +| Timer D | >32s (UDP) | Wait time after INVITE client receives non-2xx. | +| Timer F | 64*T1 (32s) | Non-INVITE transaction timeout. 
|
+| Timer H | 64*T1 (32s) | Wait time for ACK after sending non-2xx to INVITE. |
+
+### Debugging with Timers
+- **No 100 Trying within T1**: Possible network issue or destination down.
+- **INVITE retransmissions (Timer A doubling)**: 500ms, 1s, 2s, 4s... indicates no response from next hop.
+- **Timer B expiry (32s)**: No final response to INVITE. Results in 408 Request Timeout.
+- **Missing ACK after 200 OK (Timer H)**: Dialog state leak. Possible NAT/firewall blocking ACK.
+
+## 8. Common Failure Patterns
+
+### One-Way Audio
+- **Symptoms**: One party can hear, other cannot.
+- **Check**: SDP `c=` addresses, ICE connectivity, NAT traversal, `a=sendrecv` direction, firewall rules on RTP ports.
+- **In logs**: Look for ICE failure events, OODLE/media quality alerts, `ooooooo` (no media flowing).
+
+### Call Drops After ~32 Seconds
+- **Cause**: Timer B expiry — INVITE not answered.
+- **Check**: Destination registration, SSE→WxCAS connectivity, WxCAS→destination routing.
+
+### ooooo (No Audio) / oOOOOOo (Intermittent)
+- **Check**: MSE logs for RTP packet counters, ICE state, DTLS handshake completion.
+
+### Registration Failures
+- **401/407 loops**: Authentication issues. Check credentials and nonce handling.
+- **Keepalive failures**: 5 missed keepalives (30s interval) → unregistration. Check network stability.
+
+### Call Setup Failures
+- **Location Service Error 404**: User not registered on WxCAS. Check REGISTER flow.
+- **488 Not Acceptable Here**: SDP mismatch. Compare offered vs required codecs.
+- **Location Service Error 480**: User temporarily unavailable. Check device status.
+
+### o/O Patterns (Media Flow Markers in SSE Logs)
+- Periodic patterns of capital and lowercase letters in SSE logs represent media flow quality markers.
+- All lowercase (`ooooo`) = no media detected.
+- Capital letters = media packets detected in that interval.
+
+## 9. SIP Dialog Correlation Across Services
+
+To trace a single call across Mobius, SSE, and WxCAS logs:
+
+1. 
**Call-ID**: Same across all services for a given dialog leg. Search all log sources with the same Call-ID. +2. **From-tag / To-tag**: Combined with Call-ID, uniquely identifies a dialog. Use to distinguish forked calls. +3. **CSeq**: Sequence number per method. Helps order messages within a dialog. +4. **Via branch**: Transaction identifier. Same branch = same transaction across hops. +5. **Tracking ID**: Webex-specific. Correlates browser session to SIP dialog. Found in Mobius logs and X-headers in SIP. + +### Cross-Service Mapping +| Identifier | Mobius Logs | SSE/MSE Logs | WxCAS Logs | +|-----------|------------|-------------|-----------| +| Call-ID | `sipCallId` field | `Call-ID` header | `Call-ID` header | +| Tracking ID | `trackingId` field | X-Cisco-TrackingId header | X-Cisco-TrackingId header | +| Session ID | `sessionId` / `localSessionId` | Session-ID header | Session-ID header | +| Correlation ID | `correlationId` field | X-Cisco-CorrelationId | X-Cisco-CorrelationId | From 917438df616953f085f4358e12115410a33eaf3f Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Thu, 12 Mar 2026 06:14:35 +0530 Subject: [PATCH 3/9] feat: add incremental analysis --- .gitignore | 3 +- agents/analyze_agent_v2/incremental.py | 888 +++++++++++++++++++++++++ agents/query_analyzer/__init__.py | 1 + agents/query_analyzer/agent.py | 370 +++++++++++ agents/query_router/agent.py | 12 +- agents/root_agent_v2/agent.py | 14 +- agents/search_agent_v2/agent.py | 55 +- incremental_map-reduce_analysis.md | 268 ++++++++ 8 files changed, 1594 insertions(+), 17 deletions(-) create mode 100644 agents/analyze_agent_v2/incremental.py create mode 100644 agents/query_analyzer/__init__.py create mode 100644 agents/query_analyzer/agent.py create mode 100644 incremental_map-reduce_analysis.md diff --git a/.gitignore b/.gitignore index cc15720..8813c10 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__ .adk/ -.env \ No newline at end of file +.env +.venv/ \ No newline at end of 
file diff --git a/agents/analyze_agent_v2/incremental.py b/agents/analyze_agent_v2/incremental.py new file mode 100644 index 0000000..7b3d7a3 --- /dev/null +++ b/agents/analyze_agent_v2/incremental.py @@ -0,0 +1,888 @@ +""" +Incremental Map-Reduce Analysis — processes log batches as they arrive from search. + +Exports a clean function interface consumed by search_agent_v2: + - new_rolling_analysis() → empty rolling state + - map_batch() → MAP: one batch + compact memory → structured JSON + - reduce() → REDUCE: merge map output into rolling state + - compress_analysis_summary() → shrink rolling summary when it exceeds token cap + - format_to_markdown() → convert final rolling state to markdown report + - run_analysis_consumer() → asyncio.Queue consumer loop (producer-consumer pattern) + - analyze_upload_only() → single-pass analysis for SDK-only uploads +""" + +import asyncio +import json +import logging +import os +from typing import Any + +import litellm +from dotenv import load_dotenv +from pathlib import Path + +env_path = Path(__file__).parent.parent / ".env" +load_dotenv(dotenv_path=env_path) + +logger = logging.getLogger(__name__) + +# Re-use TokenBudget from search_agent_v2 (imported by callers, passed in as arg). +# We only reference the type for documentation; no import needed at module level. 
+ +# ═══════════════════════════════════════════════════════════════════════════════ +# Skill References (loaded on-demand via tool calls) +# ═══════════════════════════════════════════════════════════════════════════════ + +_SKILLS_DIR = Path(__file__).parent / "skills" + +_SKILL_FILE_MAP = { + "lookup_mobius_error_codes": _SKILLS_DIR / "mobius-error-id-skill" / "references" / "mobius_error_ids.md", + "lookup_architecture": _SKILLS_DIR / "architecture-endpoints-skill" / "references" / "architecture_and_endpoints.md", + "lookup_sip_flows": _SKILLS_DIR / "sip-flow-skill" / "references" / "sip_flows.md", + "lookup_calling_flow": _SKILLS_DIR / "architecture-endpoints-skill" / "references" / "calling_flow.md", + "lookup_contact_center_flow": _SKILLS_DIR / "architecture-endpoints-skill" / "references" / "contact_center_flow.md", +} + +_SKILL_CACHE: dict[str, str] = {} + + +def _load_skill_reference(name: str) -> str: + """Load a skill reference file, with caching.""" + if name in _SKILL_CACHE: + return _SKILL_CACHE[name] + path = _SKILL_FILE_MAP.get(name) + if not path or not path.exists(): + return f"Reference '{name}' not found." + content = path.read_text(encoding="utf-8") + _SKILL_CACHE[name] = content + logger.info(f"[_load_skill_reference] Loaded {name}: {len(content)} chars") + return content + + +_TOOL_DEFINITIONS = [ + { + "type": "function", + "function": { + "name": "lookup_mobius_error_codes", + "description": ( + "Look up Mobius HTTP error codes and mobius-error codes " + "(e.g., 101, 102, 103, 403, 503). Returns detailed reference " + "with root cause direction, user impact, and what to check in logs. " + "Call this when you see mobius-error codes or unexpected HTTP status " + "codes from Mobius in the log batch." 
+ ), + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + }, + { + "type": "function", + "function": { + "name": "lookup_architecture", + "description": ( + "Look up Webex Calling / Contact Center architecture: service roles " + "(Mobius, SSE, MSE, WxCAS, CPAPI, Mercury, WDM, U2C), signaling and " + "media paths, call types, multi-instance deployment, timers, failover. " + "Call this when you need to understand how services connect or what a " + "specific component does." + ), + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + }, + { + "type": "function", + "function": { + "name": "lookup_sip_flows", + "description": ( + "Look up SIP message flow references: call setup (INVITE transaction), " + "early media (183), hold/resume (re-INVITE), call transfer (REFER), " + "registration (REGISTER), SIP response codes, SDP negotiation, timers, " + "and common failure patterns. Call this when you see SIP messages in logs " + "and need to verify the expected flow or diagnose a SIP failure." + ), + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + }, + { + "type": "function", + "function": { + "name": "lookup_calling_flow", + "description": ( + "Look up WebRTC Calling end-to-end flow: signaling path, media path, " + "call types (WebRTC-to-WebRTC, WebRTC-to-PSTN, WebRTC-to-DeskPhone). " + "Call this when analyzing a standard calling flow." + ), + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + }, + { + "type": "function", + "function": { + "name": "lookup_contact_center_flow", + "description": ( + "Look up Contact Center architecture: Kamailio SIP proxy, RTMS, RAS, " + "health ping endpoints, Mobius timers, Kafka failover, inter-regional " + "failover. Call this when logs indicate a Contact Center flow." 
+ ), + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + }, +] + + +def _handle_tool_calls(tool_calls: list) -> list[dict]: + """Execute tool calls and return tool result messages.""" + results = [] + for tc in tool_calls: + name = tc.function.name + content = _load_skill_reference(name) + results.append({ + "role": "tool", + "tool_call_id": tc.id, + "content": content, + }) + logger.info(f"[_handle_tool_calls] Executed {name} -> {len(content)} chars") + return results + +# ═══════════════════════════════════════════════════════════════════════════════ +# Constants +# ═══════════════════════════════════════════════════════════════════════════════ + +CHARS_PER_TOKEN_ESTIMATE = 4 +ROLLING_SUMMARY_TOKEN_CAP = 4_000 +TIMELINE_MAX_EVENTS = 50 + +_IDENTIFIER_KEYS = [ + "session_ids", + "call_ids", + "tracking_ids", + "user_ids", + "device_ids", + "trace_ids", + "sip_call_ids", + "sse_call_ids", +] + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Data Structures +# ═══════════════════════════════════════════════════════════════════════════════ + + +def new_rolling_analysis() -> dict: + """Factory: returns an empty rolling_analysis structure.""" + return { + "identifiers": {k: [] for k in _IDENTIFIER_KEYS}, + "timeline": [], + "errors": [], + "state_machine": [], + "cross_service_correlations": [], + "summary": "", + "evidence_count": 0, + "batch_count": 0, + } + + +def _estimate_tokens(text: str) -> int: + """Estimate token count from character length.""" + return len(text) // CHARS_PER_TOKEN_ESTIMATE + + +def _get_llm_config() -> tuple[str, str]: + """Return (api_key, api_base) for LLM calls.""" + api_key = ( + os.environ.get("OPENAI_API_KEY") + or os.environ.get("AZURE_OPENAI_API_KEY") + or "pending-oauth" + ) + api_base = os.environ["AZURE_OPENAI_ENDPOINT"] + return api_key, api_base + + +def _parse_json_from_llm(raw: Any) -> dict: + """Extract a JSON object from LLM output, handling markdown code 
blocks.""" + import re + + if isinstance(raw, dict): + return raw + raw = str(raw) + try: + return json.loads(raw) + except (json.JSONDecodeError, TypeError): + pass + m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL) + if m: + try: + return json.loads(m.group(1)) + except (json.JSONDecodeError, TypeError): + pass + m = re.search(r"\{.*\}", raw, re.DOTALL) + if m: + try: + return json.loads(m.group(0)) + except (json.JSONDecodeError, TypeError): + pass + logger.warning("[_parse_json_from_llm] Could not extract JSON, returning empty dict") + return {} + + +# ═══════════════════════════════════════════════════════════════════════════════ +# MAP Step +# ═══════════════════════════════════════════════════════════════════════════════ + +_MAP_INSTRUCTION = """\ +You are an analysis agent with deep expertise in HTTP, WebRTC, \ +SIP protocols and their interactions. You will receive a BATCH of microservice log entries \ +(condensed JSON) and a PRIOR ANALYSIS SUMMARY from earlier batches. + +These logs come from a Webex Calling / Contact Center platform. You have access to \ +reference tools — use them when you need detailed knowledge: + +- **lookup_mobius_error_codes**: Call when you see `mobius-error` codes or unexpected HTTP \ +status codes from Mobius. Returns code-level root cause and debugging guidance. +- **lookup_architecture**: Call when you need to understand service roles (Mobius, SSE, MSE, \ +WxCAS, CPAPI, Mercury, WDM, U2C), signaling/media paths, or how services interconnect. +- **lookup_sip_flows**: Call when analyzing SIP messages (INVITE, BYE, REGISTER, re-INVITE, \ +REFER, etc.) and you need the expected sequence, SDP details, or failure patterns. +- **lookup_calling_flow**: Call when analyzing an end-to-end calling flow (WebRTC-to-WebRTC, \ +WebRTC-to-PSTN, WebRTC-to-DeskPhone). +- **lookup_contact_center_flow**: Call when logs indicate a Contact Center scenario (Kamailio, \ +RTMS, RAS, health pings, Kafka failover). 
+ +## Output Schema + +Analyze THIS batch and produce a structured JSON object. \ +Output ONLY valid JSON — no markdown fences, no preamble. + +{ + "new_identifiers": { + "session_ids": [""], + "call_ids": [""], + "sip_call_ids": [""], + "sse_call_ids": [""], + "tracking_ids": [""], + "user_ids": [""], + "device_ids": [""], + "trace_ids": [""] + }, + "events": [ + { + "timestamp": "", + "type": "HTTP|SIP|media|routing|registration|websocket|error", + "source": "", + "destination": "", + "detail": "" + } + ], + "errors": [ + { + "timestamp": "", + "code": "", + "service": "", + "message": "", + "suspected_cause": "" + } + ], + "state_updates": [ + { + "timestamp": "", + "transition": "", + "from_state": "", + "to_state": "" + } + ], + "evidence_refs": [ + { + "doc_id": "", + "index": "", + "timestamp": "", + "category": "mobius|sse_mse|wxcas", + "relevance": "" + } + ], + "delta_summary": "<2-4 sentence summary of what THIS batch reveals that is NEW compared to the prior summary>" +} + +## Analysis Guidance + +Be THOROUGH and EXHAUSTIVE — every log entry matters for debugging. + +- **HTTP**: capture every request/response with timestamp, source→destination, method, \ +full path, status code, relevant IDs. Flag non-2xx responses. Note latency if visible. +- **SIP**: capture INVITE, 100 Trying, 180 Ringing, 183 Session Progress, 200 OK, ACK, \ +BYE, CANCEL, UPDATE, re-INVITE, PRACK, REFER with Call-ID and CSeq. Extract SDP details \ +(codec, media type, ICE candidates) when visible. Identify retransmissions and timeouts. \ +Use **lookup_sip_flows** if you need to verify the expected sequence. +- **Errors**: every non-2xx HTTP, every 4xx/5xx/6xx SIP, every mobius-error code, every \ +logged error/warning/exception. Use **lookup_mobius_error_codes** for Mobius-specific codes. +- **State transitions**: call state changes (idle→calling→connected→disconnected), \ +registration state (unregistered→registered→expired), SIP dialog state, media negotiation. 
+- **Cross-service correlation**: the SAME call appears in Mobius (HTTP side), SSE (SIP side), \ +and WxCAS (routing side) with shared IDs. Note when you see the same transaction across services. \ +Identify gaps. Use **lookup_architecture** if you need to understand the expected path. +- **Timing**: note delays >2s between expected sequential events. Calculate setup time \ +(INVITE to 200 OK). Flag timeouts. +- **Evidence**: mark log entries critical for debugging (errors, state changes, first/last events, \ +SIP milestones). +- **delta_summary**: focus on what is NEW in this batch vs the prior summary — avoid repeating. + +If no items exist for a category, use an empty list []. +""" + +_MAP_USER_TEMPLATE = """\ +## Prior Analysis Summary +{compact_memory} + +## Log Batch (analyze this) +{batch_json} +""" + + +async def map_batch( + condensed_hits: list[dict], + compact_memory: str, + budget: "TokenBudget", +) -> dict: + """MAP step: analyze one batch of log entries via LLM. + + Args: + condensed_hits: list of condensed log entries (from extract_id_fields_for_llm) + compact_memory: the rolling_analysis["summary"] from prior batches (few KB) + budget: TokenBudget instance for tracking/limiting token usage + + Returns: + MapOutput dict matching the schema in _MAP_INSTRUCTION, or empty dict on failure. 
+ """ + api_key, api_base = _get_llm_config() + batch_json = json.dumps(condensed_hits, default=str) + + user_content = _MAP_USER_TEMPLATE.format( + compact_memory=compact_memory or "(No prior analysis — this is the first batch)", + batch_json=batch_json, + ) + + full_prompt = _MAP_INSTRUCTION + user_content + est_tokens = _estimate_tokens(full_prompt) + + if budget and not budget.can_afford(full_prompt): + allowed_chars = ( + budget.remaining_stage() * CHARS_PER_TOKEN_ESTIMATE + - len(_MAP_INSTRUCTION) + - len(_MAP_USER_TEMPLATE) + - len(compact_memory or "") + - 200 + ) + if allowed_chars < 500: + logger.warning("[map_batch] Budget too tight, skipping batch") + return {} + batch_json = batch_json[:allowed_chars] + user_content = _MAP_USER_TEMPLATE.format( + compact_memory=compact_memory or "(No prior analysis — this is the first batch)", + batch_json=batch_json, + ) + logger.info(f"[map_batch] Trimmed batch for budget: {len(batch_json)} chars") + + MAX_TOOL_ROUNDS = 3 + + messages = [ + {"role": "system", "content": _MAP_INSTRUCTION}, + {"role": "user", "content": user_content}, + ] + + try: + for _round in range(MAX_TOOL_ROUNDS + 1): + response = await litellm.acompletion( + model="openai/gpt-4.1", + api_key=api_key, + api_base=api_base, + extra_headers={"x-cisco-app": "microservice-log-analyzer"}, + messages=messages, + tools=_TOOL_DEFINITIONS, + tool_choice="auto", + temperature=0, + ) + if budget: + budget.record_usage(est_tokens) + + choice = response.choices[0] + + if choice.finish_reason == "tool_calls" or ( + choice.message.tool_calls and not choice.message.content + ): + tool_calls = choice.message.tool_calls + logger.info( + f"[map_batch] Round {_round}: LLM requested " + f"{len(tool_calls)} skill(s): " + f"{[tc.function.name for tc in tool_calls]}" + ) + messages.append(choice.message) + messages.extend(_handle_tool_calls(tool_calls)) + continue + + raw = choice.message.content or "{}" + result = _parse_json_from_llm(raw) + + logger.info( + 
f"[map_batch] Extracted (after {_round} tool round(s)): " + f"events={len(result.get('events', []))}, " + f"errors={len(result.get('errors', []))}, " + f"state_updates={len(result.get('state_updates', []))}, " + f"evidence_refs={len(result.get('evidence_refs', []))}" + ) + return result + + logger.warning("[map_batch] Exhausted tool rounds, returning last response") + return _parse_json_from_llm(response.choices[0].message.content or "{}") + + except Exception as e: + logger.error(f"[map_batch] LLM call failed: {e}") + return {} + + +# ═══════════════════════════════════════════════════════════════════════════════ +# REDUCE Step +# ═══════════════════════════════════════════════════════════════════════════════ + + +def reduce( + rolling: dict, + map_output: dict, + evidence_index: list[dict], +) -> tuple[dict, list[dict]]: + """REDUCE step: merge one map_batch output into the rolling analysis. + + Pure Python — no LLM calls. Deduplicates identifiers, appends events/errors/ + state_updates, moves evidence_refs to the separate evidence_index, and appends + the delta_summary to the rolling summary. 
+ + Args: + rolling: the current rolling_analysis dict (mutated in place and returned) + map_output: the structured dict returned by map_batch() + evidence_index: the accumulated evidence list (mutated in place and returned) + + Returns: + (updated rolling_analysis, updated evidence_index) + """ + if not map_output: + return rolling, evidence_index + + rolling["batch_count"] += 1 + + # ── Merge identifiers (deduplicated) ── + new_ids = map_output.get("new_identifiers", {}) + for key in _IDENTIFIER_KEYS: + existing = set(rolling["identifiers"].get(key, [])) + for val in new_ids.get(key, []): + val = str(val).strip() + if val and val not in existing: + existing.add(val) + rolling["identifiers"].setdefault(key, []).append(val) + + # ── Append timeline events (capped at TIMELINE_MAX_EVENTS) ── + new_events = map_output.get("events", []) + rolling["timeline"].extend(new_events) + if len(rolling["timeline"]) > TIMELINE_MAX_EVENTS: + rolling["timeline"] = _prune_timeline(rolling["timeline"]) + + # ── Append errors (never pruned) ── + new_errors = map_output.get("errors", []) + rolling["errors"].extend(new_errors) + + # ── Append state machine transitions ── + new_states = map_output.get("state_updates", []) + rolling["state_machine"].extend(new_states) + + # ── Move evidence_refs to separate index ── + new_evidence = map_output.get("evidence_refs", []) + evidence_index.extend(new_evidence) + rolling["evidence_count"] = len(evidence_index) + + # ── Append delta_summary to rolling summary ── + delta = map_output.get("delta_summary", "") + if delta: + if rolling["summary"]: + rolling["summary"] = f"{rolling['summary']}\n\n[Batch {rolling['batch_count']}] {delta}" + else: + rolling["summary"] = f"[Batch {rolling['batch_count']}] {delta}" + + logger.info( + f"[reduce] Batch {rolling['batch_count']}: " + f"+{len(new_events)} events, +{len(new_errors)} errors, " + f"+{len(new_states)} state_updates, +{len(new_evidence)} evidence_refs, " + 
f"summary={_estimate_tokens(rolling['summary'])} tokens" + ) + + return rolling, evidence_index + + +def _prune_timeline(timeline: list[dict]) -> list[dict]: + """Keep timeline within TIMELINE_MAX_EVENTS by removing low-value entries. + + Preserves: errors, first/last events, SIP milestones, state changes. + Removes: routine success HTTP requests, redundant info entries. + """ + if len(timeline) <= TIMELINE_MAX_EVENTS: + return timeline + + high_priority_types = {"SIP", "error", "routing", "media", "registration"} + + high = [] + low = [] + for event in timeline: + etype = event.get("type", "") + detail = event.get("detail", "") + is_error = "error" in etype.lower() or "error" in detail.lower() + is_high = etype in high_priority_types or is_error + if is_high: + high.append(event) + else: + low.append(event) + + remaining_slots = TIMELINE_MAX_EVENTS - len(high) + if remaining_slots > 0: + kept_low = low[:remaining_slots] + else: + kept_low = [] + high = high[:TIMELINE_MAX_EVENTS] + + result = high + kept_low + result.sort(key=lambda e: e.get("timestamp", "")) + + logger.info( + f"[_prune_timeline] Pruned {len(timeline)} -> {len(result)} events " + f"({len(high)} high-priority, {len(kept_low)} low-priority kept)" + ) + return result + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Compression +# ═══════════════════════════════════════════════════════════════════════════════ + +_ANALYSIS_COMPRESS_INSTRUCTION = """\ +You are a log analysis compressor. The rolling analysis summary below has grown \ +too large and must be compressed to approximately HALF its current length. + +MUST preserve: +1. ALL errors — timestamps, codes, services, suspected causes (never drop these) +2. ALL correlation-critical IDs (session IDs, call IDs, tracking IDs linking services) +3. Key timeline milestones (first event, last event, SIP state transitions, error events) +4. Cross-service correlation evidence +5. 
Any unresolved questions or anomalies + +MAY abbreviate or remove: +- Redundant success confirmations +- Verbose details of normal/expected HTTP 200 responses +- Duplicate information across batch summaries +- Routine registration or keep-alive events + +Output the compressed summary directly, no preamble or explanation.\ +""" + + +async def compress_analysis_summary( + rolling: dict, + budget: "TokenBudget", +) -> dict: + """Compress rolling_analysis['summary'] when it exceeds ROLLING_SUMMARY_TOKEN_CAP. + + Calls the LLM to produce a shorter version that preserves errors, IDs, and + key milestones. Mutates and returns the rolling dict. + """ + summary = rolling.get("summary", "") + current_tokens = _estimate_tokens(summary) + + if current_tokens <= ROLLING_SUMMARY_TOKEN_CAP: + return rolling + + logger.info( + f"[compress_analysis_summary] Summary at {current_tokens} tokens " + f"(cap={ROLLING_SUMMARY_TOKEN_CAP}), compressing..." + ) + + api_key, api_base = _get_llm_config() + + try: + response = await litellm.acompletion( + model="openai/gpt-4.1", + api_key=api_key, + api_base=api_base, + extra_headers={"x-cisco-app": "microservice-log-analyzer"}, + messages=[ + {"role": "system", "content": _ANALYSIS_COMPRESS_INSTRUCTION}, + {"role": "user", "content": summary}, + ], + temperature=0, + ) + + compressed = response.choices[0].message.content or summary + old_tokens = current_tokens + new_tokens = _estimate_tokens(compressed) + + if budget: + budget.record_usage( + _estimate_tokens(summary) + _estimate_tokens(_ANALYSIS_COMPRESS_INSTRUCTION) + ) + + rolling["summary"] = compressed + logger.info( + f"[compress_analysis_summary] Compressed: {old_tokens} -> {new_tokens} tokens " + f"(saved ~{old_tokens - new_tokens})" + ) + + except Exception as e: + logger.error(f"[compress_analysis_summary] Compression failed: {e}") + + return rolling + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Format to Markdown +# 
═══════════════════════════════════════════════════════════════════════════════ + + +def format_to_markdown( + rolling: dict, + evidence_index: list[dict], + search_summary: str = "", +) -> str: + """Convert the final rolling_analysis + evidence_index into a markdown report. + + The output mirrors the section structure expected by downstream agents + (sequence_diagram, chat_agent). + """ + sections = [] + + # ── Root Cause Analysis (errors) ── + sections.append("---\n### Root Cause Analysis") + errors = rolling.get("errors", []) + if not errors: + sections.append( + "No errors or issues detected. The flow appears to have completed normally." + ) + else: + for err in errors: + ts = err.get("timestamp", "unknown") + code = err.get("code", "N/A") + svc = err.get("service", "unknown") + msg = err.get("message", "") + cause = err.get("suspected_cause", "") + sections.append( + f"**[{ts}]**: {code}\n" + f" **Service**: {svc}\n" + f" **Description**: {msg}\n" + f" **Suspected Root Cause**: {cause}" + ) + + # ── Extracted Identifiers ── + sections.append("\n---\n### Extracted Identifiers") + ids = rolling.get("identifiers", {}) + label_map = { + "session_ids": "Session ID", + "call_ids": "Call ID (Mobius)", + "sip_call_ids": "Call ID (SIP)", + "sse_call_ids": "Call ID (SSE)", + "tracking_ids": "Tracking ID", + "user_ids": "User ID", + "device_ids": "Device ID", + "trace_ids": "Trace ID", + } + for key, label in label_map.items(): + vals = ids.get(key, []) + if vals: + sections.append(f"- **{label}**: {', '.join(vals)}") + else: + sections.append(f"- **{label}**: (not found)") + + # ── Search Scope ── + if search_summary: + sections.append("\n---\n### Search Scope") + sections.append(search_summary) + + # ── Cross-Service Correlation ── + sections.append("\n---\n### Cross-Service Correlation") + corrs = rolling.get("cross_service_correlations", []) + if corrs: + for c in corrs: + sections.append(f"- {c}") + else: + summary_text = rolling.get("summary", "") + if "cross" in 
summary_text.lower() or "correlat" in summary_text.lower(): + sections.append("(See analysis summary below for cross-service details)") + else: + sections.append("No explicit cross-service correlations captured.") + + # ── Timing Analysis ── + sections.append("\n---\n### Timing Analysis") + timeline = rolling.get("timeline", []) + if timeline: + first = timeline[0].get("timestamp", "") + last = timeline[-1].get("timestamp", "") + sections.append(f"- **First event**: {first}") + sections.append(f"- **Last event**: {last}") + sections.append(f"- **Events captured**: {len(timeline)}") + + sip_events = [e for e in timeline if e.get("type") == "SIP"] + if sip_events: + sections.append(f"- **SIP messages**: {len(sip_events)}") + else: + sections.append("No timeline events captured.") + + # ── Final Outcome (analysis summary) ── + sections.append("\n---\n### Final Outcome") + summary = rolling.get("summary", "") + if summary: + sections.append(summary) + else: + sections.append("Analysis produced no summary.") + + # ── Timeline (condensed) ── + if timeline: + sections.append("\n---\n### Communication Flow") + for event in timeline: + ts = event.get("timestamp", "?") + etype = event.get("type", "") + src = event.get("source", "?") + dst = event.get("destination", "?") + detail = event.get("detail", "") + sections.append(f"**[{ts}]** {src} -> {dst}: {etype} {detail}") + + # ── Evidence References ── + if evidence_index: + sections.append(f"\n---\n### Evidence Index ({len(evidence_index)} references)") + for i, ref in enumerate(evidence_index[:20], 1): + doc_id = ref.get("doc_id", "?") + idx = ref.get("index", "?") + ts = ref.get("timestamp", "?") + cat = ref.get("category", "?") + rel = ref.get("relevance", "") + sections.append(f"{i}. `{doc_id}` ({idx}, {cat}) [{ts}] — {rel}") + if len(evidence_index) > 20: + sections.append(f" ... 
and {len(evidence_index) - 20} more references") + + # ── Stats ── + sections.append(f"\n---\n*Analysis: {rolling.get('batch_count', 0)} batches processed, " + f"{rolling.get('evidence_count', 0)} evidence references collected.*") + + return "\n".join(sections) + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Producer-Consumer: analysis consumer loop +# ═══════════════════════════════════════════════════════════════════════════════ + +SENTINEL = None # pushed by the producer to signal "no more batches" + + +async def run_analysis_consumer( + queue: "asyncio.Queue[list[dict] | None]", + budget: "TokenBudget", + search_summary: str = "", +) -> tuple[str, dict, list[dict]]: + """Consume condensed hit batches from an asyncio.Queue and run MAP-REDUCE. + + The search producer pushes list[dict] items (condensed hits per page) onto + the queue, then pushes SENTINEL (None) when done. This consumer processes + them one-at-a-time with map_batch -> reduce, compressing the summary when + it exceeds the token cap. + + Args: + queue: asyncio.Queue fed by the search producer; items are + list[dict] (condensed hits) or None (sentinel). + budget: TokenBudget instance shared with the caller. + search_summary: optional search_summary string for the final markdown. 
+ + Returns: + (markdown_report, rolling_analysis, evidence_index) + """ + rolling = new_rolling_analysis() + evidence_index: list[dict] = [] + + batch_num = 0 + while True: + item = await queue.get() + if item is SENTINEL: + queue.task_done() + logger.info("[analysis_consumer] Received sentinel, finishing analysis") + break + + batch_num += 1 + condensed_hits = item + logger.info( + f"[analysis_consumer] Processing batch {batch_num} " + f"({len(condensed_hits)} entries)" + ) + + compact_memory = rolling["summary"] + + map_output = await map_batch(condensed_hits, compact_memory, budget) + if map_output: + rolling, evidence_index = reduce(rolling, map_output, evidence_index) + + summary_tokens = _estimate_tokens(rolling.get("summary", "")) + if summary_tokens > ROLLING_SUMMARY_TOKEN_CAP: + rolling = await compress_analysis_summary(rolling, budget) + + queue.task_done() + + markdown = format_to_markdown(rolling, evidence_index, search_summary) + logger.info( + f"[analysis_consumer] Done — {batch_num} batches, " + f"{len(evidence_index)} evidence refs, " + f"{_estimate_tokens(markdown)} tokens in report" + ) + return markdown, rolling, evidence_index + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Upload-only (SDK logs pasted directly, no search) +# ═══════════════════════════════════════════════════════════════════════════════ + +_UPLOAD_BATCH_SIZE = 200 + + +async def analyze_upload_only( + sdk_logs: str, + budget: "TokenBudget | None" = None, +) -> tuple[str, dict, list[dict]]: + """Analyze SDK logs that were uploaded directly (no OpenSearch search). + + Splits the raw log text into line-based batches and runs the same + map -> reduce -> compress pipeline. + + Args: + sdk_logs: raw log text pasted or uploaded by the user. + budget: optional TokenBudget for controlling LLM spend. 
+ + Returns: + (markdown_report, rolling_analysis, evidence_index) + """ + if not sdk_logs or not sdk_logs.strip(): + return "(No SDK logs provided)", new_rolling_analysis(), [] + + lines = sdk_logs.strip().splitlines() + logger.info(f"[analyze_upload_only] Processing {len(lines)} lines of SDK logs") + + rolling = new_rolling_analysis() + evidence_index: list[dict] = [] + + for start in range(0, len(lines), _UPLOAD_BATCH_SIZE): + batch_lines = lines[start : start + _UPLOAD_BATCH_SIZE] + condensed = [{"raw_line": line, "line_num": start + i + 1} + for i, line in enumerate(batch_lines)] + + compact_memory = rolling["summary"] + map_output = await map_batch(condensed, compact_memory, budget) + + if map_output: + rolling, evidence_index = reduce(rolling, map_output, evidence_index) + + summary_tokens = _estimate_tokens(rolling.get("summary", "")) + if summary_tokens > ROLLING_SUMMARY_TOKEN_CAP: + rolling = await compress_analysis_summary(rolling, budget) + + markdown = format_to_markdown(rolling, evidence_index, search_summary="(SDK log upload)") + logger.info( + f"[analyze_upload_only] Done — {rolling['batch_count']} batches, " + f"{len(evidence_index)} evidence refs" + ) + return markdown, rolling, evidence_index diff --git a/agents/query_analyzer/__init__.py b/agents/query_analyzer/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ b/agents/query_analyzer/__init__.py @@ -0,0 +1 @@ +from . 
import agent diff --git a/agents/query_analyzer/agent.py b/agents/query_analyzer/agent.py new file mode 100644 index 0000000..1200662 --- /dev/null +++ b/agents/query_analyzer/agent.py @@ -0,0 +1,370 @@ +import os +from pathlib import Path +from dotenv import load_dotenv + +from google.adk.agents import BaseAgent, LlmAgent +from google.adk.agents.invocation_context import InvocationContext +from google.adk.events import Event +from google.adk.models.lite_llm import LiteLlm + +env_path = Path(__file__).parent.parent / ".env" +load_dotenv(dotenv_path=env_path) + +from root_agent_v2.agent import root_agent as pipeline +from chat_agent.agent import chat_agent + +query_analyzer = LlmAgent( + model=LiteLlm( + + ), + name='query-analyzer', + description='', + instruction=""" +You are the Query Analyzer agent. + +Your role is to analyze each user query and delegate the request to the correct agent. + +You MUST NOT answer user questions. +You MUST NOT analyze logs. +You MUST NOT generate explanations. + +You ONLY decide delegation. + +Available sub-agents: + +1) pipeline + Responsible for: + - Searching OpenSearch logs + - Extracting identifiers + - Building call flows + - Finding sequence failures + - Error investigation + +2) chat_agent + Responsible for: + - Follow-up conversations + - Explanations + - Clarifications + - Asking user for missing information + - Answering general questions + +-------------------------------------------------- + +SYSTEM PURPOSE + +The pipeline retrieves information about: + +- Error IDs +- Tracking IDs +- Session IDs +- Call flows +- Sequence failures +- Service errors + +from OpenSearch logs. + +The chat_agent uses already available information in the session state +to communicate with the user. + +-------------------------------------------------- + +CORE PRINCIPLE + +QueryAnalyzer is a lightweight router. 
+ +It should: + +1) Inspect the query +2) Check session state +3) Delegate + +It should NOT: + +- Think deeply +- Analyze logs +- Interpret results + +The QueryAnalyzer should primarily check session state and delegate. + +-------------------------------------------------- + +SESSION STATE + +You have access to: + +Latest Search Results: +{latest_search_results} + +Latest Analyze Results: +{analyze_results} + +These contain all information already retrieved from logs. + +If the required information already exists in state: + +→ Delegate to chat_agent + +Do NOT run pipeline again. + +-------------------------------------------------- + +QUERY CLASSIFICATION + +Queries fall into the following categories: + +-------------------------------------------------- + +1) INFO QUERY → PIPELINE + +These queries request log information. + +Examples: + +- webex-js-sdk_abc123 +- errorId_12345 +- trackingId_xxx +- sessionId_xxx +- callId_xxx + +Examples: + +"Investigate this error" + +"Analyze this call" + +"Find sequence failures" + +"Search logs for this tracking ID" + +If NEW information must be fetched: + +→ Delegate to pipeline + +-------------------------------------------------- + +2) FOLLOW-UP QUERY → CHAT AGENT + +These queries refer to previous results. + +Examples: + +"Explain more" + +"Why did it fail" + +"What happened" + +"Show root cause" + +"Which service failed" + +"Explain the ROAP issue" + +These queries use EXISTING information. + +→ Delegate to chat_agent + +Always. + +-------------------------------------------------- + +3) NO QUERY PARAMETERS → CHAT AGENT + +If the query contains NO identifiers such as: + +- error id +- tracking id +- session id +- call id +- meeting id +- device id + +Then pipeline cannot run. + +Examples: + +"What is ROAP?" + +"Explain SIP flow" + +"What does 403 mean?" 
+
+"Call failed"
+
+"Something broke"
+
+In these cases:
+
+→ Delegate to chat_agent
+
+The chat agent will ask the user for:
+
+- Tracking ID
+- Error ID
+- Session ID
+- Environment
+
+--------------------------------------------------
+
+4) STATE-DEPENDENT QUERY → CHECK STATE FIRST
+
+These queries might require logs OR might be follow-ups.
+
+Examples:
+
+"Show the flow"
+
+"Show failures"
+
+"What errors occurred?"
+
+"Analyze this session"
+
+Decision process:
+
+Step 1:
+
+Check:
+
+{latest_search_results}
+
+{analyze_results}
+
+Step 2:
+
+If relevant information exists:
+
+→ Delegate to chat_agent
+
+Step 3:
+
+If information does NOT exist:
+
+→ Delegate to pipeline
+
+--------------------------------------------------
+
+5) MIXED QUERIES
+
+Mixed queries contain both identifiers and explanation requests.
+
+Examples:
+
+"Check this error and explain"
+
+"Analyze trackingId_123 and summarize"
+
+Decision process:
+
+Step 1:
+
+Check state.
+
+Step 2:
+
+If identifier already exists in state:
+
+→ chat_agent
+
+Step 3:
+
+If identifier does NOT exist in state:
+
+→ pipeline
+
+--------------------------------------------------
+
+6) MISSING INFORMATION
+
+If the query does not contain enough information for log search:
+
+Examples:
+
+"Call failed"
+
+"Error occurred"
+
+"Logs missing"
+
+Do NOT run pipeline.
+
+→ Delegate to chat_agent
+
+Chat agent will request more details from user.
+
+--------------------------------------------------
+
+STATE RULES
+
+Rule 1:
+
+If state is empty:
+
+latest_search_results = empty
+analyze_results = empty
+
+AND query contains identifiers:
+
+→ pipeline
+
+If state is empty AND the query contains no identifiers:
+
+→ chat_agent (to ask the user for details)
+
+--------------------------------------------------
+
+Rule 2:
+
+If identifier exists in state:
+
+→ chat_agent
+
+--------------------------------------------------
+
+Rule 3:
+
+If identifier does not exist in state:
+
+→ pipeline
+
+--------------------------------------------------
+
+Rule 4:
+
+If unsure:
+
+Check state first.
+ +If state contains relevant information: + +→ chat_agent + +Otherwise: + +→ pipeline + +-------------------------------------------------- + +FINAL RULES + +You MUST ONLY delegate. + +Never answer user questions. + +Never explain routing. + +Never analyze logs. + +Never ask user questions. + +Only delegate. + +Valid targets: + +pipeline +chat_agent +""" + , + sub_agents=[pipeline,chat_agent] +) + diff --git a/agents/query_router/agent.py b/agents/query_router/agent.py index 2af7349..3c623b9 100644 --- a/agents/query_router/agent.py +++ b/agents/query_router/agent.py @@ -15,7 +15,7 @@ load_dotenv(dotenv_path=env_path) from root_agent_v2.agent import root_agent as pipeline -from analyze_agent_v2.agent import analyze_agent +from analyze_agent_v2.incremental import analyze_upload_only from visualAgent.agent import sequence_diagram_agent logger = logging.getLogger(__name__) @@ -415,10 +415,12 @@ async def _run_async_impl( # Store SDK logs if uploaded if upload_only: - ctx.session.state["sdk_logs"] = search_params.get("sdk_logs", "") - logger.info("[query_analyzer] Upload-only mode — skipping search, running analyze + visualize") - async for event in analyze_agent.run_async(ctx): - yield event + sdk_logs = search_params.get("sdk_logs", "") + ctx.session.state["sdk_logs"] = sdk_logs + logger.info("[query_analyzer] Upload-only mode — running incremental analysis + visualize") + markdown, _rolling, _evidence = await analyze_upload_only(sdk_logs) + ctx.session.state["analyze_results"] = markdown + ctx.session.state["analysis_evidence"] = json.dumps(_evidence, default=str) async for event in sequence_diagram_agent.run_async(ctx): yield event else: diff --git a/agents/root_agent_v2/agent.py b/agents/root_agent_v2/agent.py index d628764..d0bb2dd 100644 --- a/agents/root_agent_v2/agent.py +++ b/agents/root_agent_v2/agent.py @@ -1,7 +1,8 @@ """ -Root Agent v2 — Sequential pipeline using search_agent_v2 and analyze_agent_v2. 
+Root Agent v2 — Sequential pipeline: search (with inline incremental analysis) +then sequence diagram generation. -Pipeline: search_agent_v2 → analyze_agent_v2 → sequence_diagram_agent +Pipeline: search_agent_v2 (includes map-reduce analysis) → sequence_diagram_agent Run standalone: adk web agents/root_agent_v2 """ @@ -17,16 +18,11 @@ env_path = Path(__file__).parent.parent / ".env" load_dotenv(dotenv_path=env_path) -# ═══════════════════════════════════════════════════════════════════════════════ -# OAuth Token Initialization — DISABLED (token provided via Webex OAuth login) -# ═══════════════════════════════════════════════════════════════════════════════ - # ═══════════════════════════════════════════════════════════════════════════════ # Import sub-agents # ═══════════════════════════════════════════════════════════════════════════════ from search_agent_v2.agent import search_agent -from analyze_agent_v2.agent import analyze_agent from visualAgent.agent import sequence_diagram_agent logging.info("✓ root_agent_v2: All sub-agents imported successfully") @@ -37,10 +33,10 @@ root_agent = SequentialAgent( name="MicroserviceLogAnalyzerV2", - sub_agents=[search_agent, analyze_agent, sequence_diagram_agent], + sub_agents=[search_agent, sequence_diagram_agent], description=( "Executes a full log analysis pipeline: " - "exhaustive BFS search → analysis (calling/contact-center routing) → " + "exhaustive BFS search with inline incremental analysis → " "PlantUML sequence diagram generation." 
), ) diff --git a/agents/search_agent_v2/agent.py b/agents/search_agent_v2/agent.py index e0c2c63..508f0a1 100644 --- a/agents/search_agent_v2/agent.py +++ b/agents/search_agent_v2/agent.py @@ -33,6 +33,12 @@ from google.adk.agents.invocation_context import InvocationContext from google.adk.events import Event from google.adk.models.lite_llm import LiteLlm + +from analyze_agent_v2.incremental import ( + run_analysis_consumer, + format_to_markdown, + SENTINEL, +) from opensearchpy import OpenSearch, RequestsHttpConnection # ═══════════════════════════════════════════════════════════════════════════════ @@ -1381,6 +1387,7 @@ async def _process_hits_progressive( rolling_summary: str, budget: TokenBudget, id_extractor_instruction: str, + analysis_queue: "asyncio.Queue | None" = None, ) -> tuple[str, dict, int]: """ Process a page of hits: deduplicate, extract IDs, summarize, update rolling summary. @@ -1415,6 +1422,13 @@ async def _process_hits_progressive( f"{len(condensed)} entries for LLM" ) + if analysis_queue is not None: + await analysis_queue.put(condensed) + logger.info( + f"[_process_hits_progressive] Pushed {len(condensed)} entries " + f"to analysis queue" + ) + # Run ID extraction + summarization in parallel, respecting budget extracted, batch_summary = await asyncio.gather( _extract_ids_from_batch(condensed, id_extractor_instruction, budget), @@ -1547,7 +1561,7 @@ async def _run_async_impl( ) # ══════════════════════════════════════════════════════════════════════ - # Step 2: Initialize BFS + Token Budget + # Step 2: Initialize BFS + Token Budget + Analysis Queue # ══════════════════════════════════════════════════════════════════════ budget = TokenBudget() all_seen_ids: set[str] = set() @@ -1562,6 +1576,12 @@ async def _run_async_impl( TIME_PADDING_HOURS = 2 derived_time_range: tuple[str, str] | None = None + analysis_queue: asyncio.Queue = asyncio.Queue() + analysis_task = asyncio.create_task( + run_analysis_consumer(queue=analysis_queue, budget=budget) + 
) + logger.info(f"[{self.name}] Analysis consumer task started in background") + for ident in identifiers: id_val = ident["value"] id_type = ident.get("type", "unknown") @@ -1685,6 +1705,7 @@ async def _run_async_impl( rolling_summary=rolling_summary, budget=budget, id_extractor_instruction=self.id_extractor.instruction, + analysis_queue=analysis_queue, ) depth_new_hits += new_count @@ -1729,10 +1750,11 @@ async def _run_async_impl( f"in {_search_elapsed:.2f}s" ) + _log_counts = {k: len(v) for k, v in all_logs.items()} logger.info( f"[{self.name}] Depth {current_depth} search phase done: " f"depth_new_hits={depth_new_hits}, derived_time_range={derived_time_range}, " - f"all_logs counts={{ {k}: {len(v)} for k, v in all_logs.items() }}, " + f"all_logs counts={_log_counts}, " f"seen_hit_ids={len(seen_hit_ids)}, budget_stage={budget.remaining_stage()}, " f"budget_run={budget.remaining_run()}" ) @@ -1804,6 +1826,24 @@ async def _run_async_impl( budget.end_stage() + # ══════════════════════════════════════════════════════════════════════ + # Step 3.5: Signal analysis consumer to finish and await results + # ══════════════════════════════════════════════════════════════════════ + await analysis_queue.put(SENTINEL) + logger.info(f"[{self.name}] Sent sentinel to analysis consumer, awaiting results...") + try: + analysis_markdown, analysis_rolling, analysis_evidence = await analysis_task + logger.info( + f"[{self.name}] Analysis consumer finished: " + f"{analysis_rolling.get('batch_count', 0)} batches, " + f"{len(analysis_evidence)} evidence refs" + ) + except Exception as e: + logger.error(f"[{self.name}] Analysis consumer failed: {e}") + analysis_markdown = "" + analysis_rolling = {} + analysis_evidence = [] + # ══════════════════════════════════════════════════════════════════════ # Step 4: Store final results in session state # ══════════════════════════════════════════════════════════════════════ @@ -1852,6 +1892,17 @@ async def _run_async_impl( 
ctx.session.state["chunk_summaries"] = json.dumps(chunk_summaries, default=str) ctx.session.state["chunk_analysis_summary"] = rolling_summary + # Re-format analysis markdown now that search_summary is available + if analysis_rolling: + analysis_markdown = format_to_markdown( + analysis_rolling, + analysis_evidence, + search_summary=ctx.session.state.get("search_summary", ""), + ) + ctx.session.state["analyze_results"] = analysis_markdown + ctx.session.state["analysis_evidence"] = json.dumps(analysis_evidence, default=str) + _log_cache[ctx.session.id]["analyze_results"] = analysis_markdown + logger.info( f"[{self.name}] == Search complete ==\n" f" Mobius: {len(all_logs['mobius'])} logs\n" diff --git a/incremental_map-reduce_analysis.md b/incremental_map-reduce_analysis.md new file mode 100644 index 0000000..bef52bf --- /dev/null +++ b/incremental_map-reduce_analysis.md @@ -0,0 +1,268 @@ +--- +name: Incremental Map-Reduce Analysis +overview: Add incremental map-reduce analysis alongside the existing search pipeline using a producer-consumer pattern. Search and analysis run in parallel via asyncio.Queue. All analysis logic lives in a new analyze_agent_v2/incremental.py. Existing search_agent_v2 code is kept untouched. 
+todos: + - id: analysis-interface + content: "Create the incremental analysis API in analyze_agent_v2/incremental.py: map_batch(), reduce(), compress_analysis_summary(), format_to_markdown(), new_rolling_analysis(), and analyze_upload_only()" + status: pending + - id: integrate-search + content: "Add producer-consumer pipeline to search_agent_v2: import incremental API, add analysis_consumer coroutine, wrap existing BFS search+dedup+extract in search_producer, connect via asyncio.Queue, run both with asyncio.gather, format and store analyze_results in Step 4" + status: pending + - id: update-pipeline + content: Remove analyze_agent from root_agent_v2 SequentialAgent; update query_router upload-only flow to call analyze_upload_only() + status: pending + - id: verify-downstream + content: Verify chat_agent, sequence_diagram_agent, and _log_cache still work with the new analyze_results format + status: pending +isProject: false +--- + +# Incremental Map-Reduce Analysis (Decoupled, Pipelined) + +## Principle: Do NOT remove existing search agent code + +All existing functions in `search_agent_v2/agent.py` stay untouched: + +- `_summarize_hits()` -- kept as-is +- `_SUMMARIZER_INSTRUCTION` -- kept as-is +- `_COMPRESS_INSTRUCTION` -- kept as-is +- `_extract_ids_from_batch()` -- kept as-is +- `_process_hits_progressive()` -- kept as-is (search producer calls it unchanged) +- `TokenBudget` -- kept as-is, reused by new analysis code +- BFS loop structure -- wrapped in a producer coroutine, not rewritten + +New analysis logic is **added alongside** as a parallel consumer, not substituted in. 
+ +## Current Architecture + +```mermaid +flowchart LR + Search["search_agent_v2\n(progressive pages)"] -->|"writes ALL raw logs"| State[(Session State)] + State -->|"reads ALL raw logs"| Analyze["analyze_agent_v2\n(monolithic LLM call)"] + Analyze -->|"analyze_results"| SeqDiag["sequence_diagram_agent"] +``` + + + +Problem: The analyze agent receives all raw logs at once (100k+ tokens), hitting context limits and producing shallow analysis on large log sets. + +## Target Architecture (Producer-Consumer Pipeline) + +```mermaid +flowchart LR + subgraph Producer ["Search Producer (existing code, untouched)"] + Fetch["Fetch page"] --> Dedup["Dedup + condense"] + Dedup --> Gather["asyncio.gather"] + Gather --> IDExtract["_extract_ids_from_batch()"] + Gather --> Summarize["_summarize_hits()"] + Summarize --> Push["Push condensed hits to Queue"] + end + + Push --> Queue["asyncio.Queue\n(maxsize=4)"] + + subgraph Consumer ["Analysis Consumer (NEW, runs in parallel)"] + Pull["Pull from Queue"] --> MapFn["map_batch()"] + MapFn --> ReduceFn["reduce()"] + ReduceFn --> Compress{"summary > 4k?"} + Compress -->|yes| CompressFn["compress_analysis_summary()"] + Compress -->|no| Pull + CompressFn --> Pull + end + + Consumer -->|"final rolling_analysis"| FormatFn["format_to_markdown()"] + FormatFn -->|"markdown"| FinalState["Session State: analyze_results"] +``` + + + +**Timeline showing parallelism:** + +``` +Search producer: [fetch p1] [dedup+IDs p1] [fetch p2] [dedup+IDs p2] [fetch p3] [dedup+IDs p3] → done + ↓ push ↓ push ↓ push +Queue: [p1] [p2] [p3] + ↓ pull ↓ pull ↓ pull +Analysis consumer: [map p1] [reduce] [map p2] [reduce] [map p3] [reduce] → done +``` + +Search keeps fetching pages without waiting for analysis. Analysis processes pages as they arrive from the queue. 
+
+**Separation of concerns:**
+
+- `search_agent_v2` owns: OpenSearch fetch, pagination, dedup, BFS frontier, ID extraction, text summarization (all existing, untouched)
+- `analyze_agent_v2/incremental.py` owns: structured analysis map instruction, reduce merge logic, analysis compression, markdown formatting, output schema (all new)
+- The only coupling is: (1) the function interface imported by search_agent_v2, (2) the `asyncio.Queue` connecting producer and consumer
+
+## The Interface (analyze_agent_v2/incremental.py)
+
+```python
+def new_rolling_analysis() -> dict:
+    """Factory: returns an empty rolling_analysis structure."""
+
+async def map_batch(
+    condensed_hits: list[dict],
+    compact_memory: str,
+    budget: TokenBudget,
+) -> dict:
+    """MAP step: analyze one batch of log entries via LLM.
+    Input: condensed hits + prior compact memory (few KB).
+    Output: MapOutput dict (structured JSON)."""
+
+def reduce(
+    rolling: dict,
+    map_output: dict,
+    evidence_index: list[dict],
+) -> tuple[dict, list[dict]]:
+    """REDUCE step: merge map output into rolling analysis (pure Python).
+    Returns (updated rolling_analysis, updated evidence_index)."""
+
+async def compress_analysis_summary(
+    rolling: dict,
+    budget: TokenBudget,
+) -> dict:
+    """Compress rolling_analysis.summary when it exceeds 4k tokens."""
+
+def format_to_markdown(
+    rolling: dict,
+    evidence_index: list[dict],
+    search_summary: str = "",
+) -> str:
+    """Convert final rolling_analysis into the markdown output structure."""
+
+async def analyze_upload_only(
+    sdk_logs: str,
+    budget: TokenBudget | None = None,
+) -> tuple[str, dict, list[dict]]:
+    """Single-pass analysis for upload-only flow (no BFS pagination).
+    Returns (markdown_report, rolling_analysis, evidence_index)."""
+
+async def run_analysis_consumer(
+    queue: asyncio.Queue,
+    budget: TokenBudget,
+) -> tuple[dict, list[dict]]:
+    """Consumer coroutine: pulls condensed batches from queue,
+    runs map_batch + reduce in a loop until sentinel (None) is received.
+ Returns (final rolling_analysis, evidence_index).""" +``` + +## Map Output JSON Schema + +Each batch produces this structured output: + +```json +{ + "new_identifiers": { + "session_ids": [], "call_ids": [], "tracking_ids": [], + "user_ids": [], "device_ids": [] + }, + "events": [ + {"timestamp": "...", "type": "HTTP|SIP|media|routing", + "source": "...", "destination": "...", "detail": "..."} + ], + "errors": [ + {"timestamp": "...", "code": "...", "service": "...", + "message": "...", "suspected_cause": "..."} + ], + "state_updates": [ + {"timestamp": "...", "transition": "...", "from_state": "...", "to_state": "..."} + ], + "evidence_refs": [ + {"doc_id": "...", "index": "...", "timestamp": "...", + "category": "mobius|sse_mse|wxcas", "relevance": "..."} + ], + "delta_summary": "Short text summarizing what this batch revealed" +} +``` + +## Rolling Analysis Structure + +The reducer merges each map output into this object: + +```json +{ + "identifiers": { "session_ids": [], "call_ids": [], ... }, + "timeline": [ + {"timestamp": "...", "type": "...", "detail": "..."} + ], + "errors": [ + {"timestamp": "...", "code": "...", "service": "...", + "message": "...", "suspected_cause": "..."} + ], + "state_machine": [ + {"timestamp": "...", "transition": "...", "from_state": "...", "to_state": "..."} + ], + "cross_service_correlations": [ ... ], + "summary": "Running narrative <=4k tokens", + "evidence_count": 42, + "batch_count": 5 +} +``` + +Rules: + +- `summary` capped at ~4k tokens; when exceeded, compress via `compress_analysis_summary()` LLM call +- `identifiers` are deduplicated sets +- `timeline` keeps only milestone events (prune low-value entries when list exceeds ~50) +- `errors` are always preserved (never compressed away) +- `evidence_refs` stored separately in `evidence_index` list (unbounded in storage, only `evidence_count` in the rolling object) + +## Files to Change + +### 1. NEW: `agents/analyze_agent_v2/incremental.py` + +The core new file. 
Contains ALL incremental analysis logic: + +- `_MAP_INSTRUCTION` -- LLM system prompt for the map step. Incorporates domain knowledge from the existing `_ANALYSIS_POINTS` in agent.py (HTTP, SIP, media, timing, errors, cross-service correlation). Receives one batch + compact memory, outputs the structured JSON schema above. +- `_ANALYSIS_COMPRESS_INSTRUCTION` -- LLM prompt for compressing the rolling analysis summary (separate from the search agent's existing `_COMPRESS_INSTRUCTION` which stays in search_agent_v2) +- `new_rolling_analysis()` -- factory that returns an empty structure. +- `map_batch()` -- async, calls LLM via litellm, parses JSON output. Budget-aware (accepts `TokenBudget`). +- `reduce()` -- pure Python merge. Dedup identifiers, append events/errors/state_updates, move evidence_refs to evidence_index, append delta_summary to rolling summary. +- `compress_analysis_summary()` -- async, calls LLM when summary exceeds ~4k tokens. +- `format_to_markdown()` -- converts rolling_analysis dict into the markdown structure matching the current `_OUTPUT_STRUCTURE` sections (Root Cause, Identifiers, Search Scope, Timing, Final Outcome, HTTP/SIP flows). +- `run_analysis_consumer()` -- the consumer coroutine. Pulls batches from the queue, runs map + reduce loop, handles compression. Returns final `(rolling_analysis, evidence_index)`. +- `analyze_upload_only()` -- single-pass LLM analysis for SDK-only uploads (no BFS pagination needed). + +### 2. MODIFY: `agents/search_agent_v2/agent.py` (additive only) + +No existing code removed. Only additions: + +- **Add import** at top: `from analyze_agent_v2.incremental import run_analysis_consumer, new_rolling_analysis, format_to_markdown as format_analysis_to_markdown` +- **Add `analysis_queue`**: create `asyncio.Queue(maxsize=4)` at the start of `_run_async_impl`, before the BFS loop. 
+- **Add queue push in `_process_hits_progressive()`**: after the existing `asyncio.gather(_extract_ids_from_batch, _summarize_hits)` completes, push the `condensed` hits to the analysis queue. This is a single `await analysis_queue.put(...)` line added after the gather. The existing function body is untouched otherwise. +- **Wrap BFS loop as producer**: the existing BFS loop becomes the body of a `search_producer` inner async function. After the loop ends, push `None` sentinel to signal the consumer to stop. +- **Run producer + consumer in parallel**: `asyncio.gather(search_producer(), run_analysis_consumer(analysis_queue, budget))` replaces the direct BFS loop call. +- **Extend Step 4**: after gather completes, retrieve `(rolling_analysis, evidence_index)` from the consumer result. Call `format_analysis_to_markdown(rolling_analysis, evidence_index, ...)` and store as `analyze_results`. Store `evidence_index` as JSON in session state. All existing Step 4 writes (`chunk_summaries`, `chunk_analysis_summary`, raw logs, etc.) remain unchanged. + +### 3. NO CHANGE: `agents/analyze_agent_v2/agent.py` + +Kept entirely as-is. The monolithic `calling_agent`, `contact_center_agent`, coordinator `analyze_agent`, all instruction strings, skill toolsets -- all remain. The incremental analysis in `incremental.py` is a parallel path, not a replacement. + +### 4. MODIFY: `agents/root_agent_v2/agent.py` + +- Remove `analyze_agent` from the `SequentialAgent`: change `[search_agent, analyze_agent, sequence_diagram_agent]` to `[search_agent, sequence_diagram_agent]` since analysis now happens incrementally during search. +- The monolithic analyze_agent code stays in `agent.py` but is no longer wired into the pipeline. + +### 5. MODIFY: `agents/query_router/agent.py` + +- Upload-only path: replace `analyze_agent.run_async(ctx)` with a call to `analyze_upload_only(sdk_logs, budget)` from `incremental.py`, store result in `ctx.session.state["analyze_results"]` + +### 6. 
NO CHANGE: `agents/chat_agent/agent.py`
+
+- Reads `{analyze_results}` -- unchanged (now written by search_agent_v2 via format_to_markdown)
+- `_log_cache` import and fallback -- unchanged
+- Skill toolset imports from analyze_agent_v2 -- unchanged
+
+## Key Design Decisions
+
+- **Producer-consumer parallelism**: Search (producer) and analysis (consumer) run concurrently via `asyncio.gather`. Connected by `asyncio.Queue(maxsize=4)` with backpressure.
+- **Additive, not destructive**: All existing search agent functions (`_summarize_hits`, `_SUMMARIZER_INSTRUCTION`, `_COMPRESS_INSTRUCTION`, `TokenBudget`, `_process_hits_progressive`, etc.) remain untouched. Only additions: queue creation, one `queue.put()` line after gather, producer/consumer wiring.
+- **Search never blocked by analysis**: search continues fetching + extracting IDs while analysis processes prior pages in the background.
+- **Analysis still sequential internally**: reduce must happen in order (page 1 before page 2), but this is handled naturally by the queue's FIFO ordering.
+- **Clean interface boundary**: search_agent_v2 imports `run_analysis_consumer` + `format_to_markdown` from `incremental.py`. No analysis logic in search code.
+- **Compact memory**: map step receives only `rolling_analysis["summary"]` (few KB) as prior context.
+- **Evidence index**: stored separately (unbounded in storage). Only `evidence_count` in the rolling object.
+- **4k token cap on analysis summary**: enforced by the consumer calling `compress_analysis_summary()` after each `reduce()` when the summary exceeds the cap (`reduce()` itself stays a pure, non-LLM merge).
+- **Output structure**: `format_to_markdown()` produces the same markdown sections as `_OUTPUT_STRUCTURE` for downstream compatibility.
+- **analyze_agent_v2/agent.py kept intact**: monolithic agent code remains available but is unwired from the pipeline. Can be re-wired if needed.
+- **Backpressure**: `maxsize=4` on the queue means if analysis falls 4+ pages behind, search will `await` on `queue.put()` until a slot opens. Prevents unbounded memory growth. 
+ From aca6572ac935315cd989e5142bc3fd06efe5ad4b Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Thu, 12 Mar 2026 07:18:10 +0530 Subject: [PATCH 4/9] fix: update incremental analysis --- agents/analyze_agent_v2/agent.py | 261 ++++++++++--------------- agents/analyze_agent_v2/incremental.py | 194 ++++++++++++------ agents/chat_agent/agent.py | 17 +- agents/visualAgent/agent.py | 17 +- 4 files changed, 271 insertions(+), 218 deletions(-) diff --git a/agents/analyze_agent_v2/agent.py b/agents/analyze_agent_v2/agent.py index 8b1ceb0..c76a9b5 100644 --- a/agents/analyze_agent_v2/agent.py +++ b/agents/analyze_agent_v2/agent.py @@ -1,15 +1,16 @@ """ -Analyze Agent v2 — Analysis agent designed for search_agent_v2 output. +Analyze Agent v2 — Batch-mode analysis agents for the incremental map-reduce pipeline. -Consumes the state keys set by ExhaustiveSearchAgent: - - mobius_logs (JSON string: list of _source dicts) - - sse_mse_logs (JSON string: list of _source dicts) - - wxcas_logs (JSON string: list of _source dicts) - - search_summary (JSON string: {total_mobius_logs, total_sse_mse_logs, - total_wxcas_logs, max_depth_reached, total_ids_searched, - search_history}) +Invoked programmatically via ADK Runner from incremental.py's run_analysis_consumer(). +Each invocation receives ONE batch of condensed log entries (via user message) plus a +prior compact memory summary, and outputs structured JSON for the reduce() step. -Routes to calling_agent or contact_center_agent based on serviceIndicator. +Keeps the original calling_agent / contact_center_agent split with full instructions, +skills, and cross-service correlation guidance. Only the output format changed from +markdown to structured JSON, and log sources come from the batch message instead of +session state variables. + +Skill toolsets (mobius, architecture, sip_flow) are also exported for use by chat_agent. 
""" import os @@ -43,23 +44,22 @@ def _make_model() -> LiteLlm: # ═══════════════════════════════════════════════════════════════════════════════ _SEARCH_CONTEXT_PREAMBLE = """ -**Search Context (from exhaustive BFS search):** -The logs below were collected by an exhaustive graph-traversal search agent that: +**Batch Analysis Context (from exhaustive BFS search):** +You will receive ONE BATCH of condensed log entries from a Webex Calling / Contact Center \ +platform, along with a PRIOR ANALYSIS SUMMARY from earlier batches (compact memory). + +These logs were collected by an exhaustive graph-traversal search agent that: - Started from user-provided identifiers and searched OpenSearch indexes - Extracted ALL related IDs (session IDs, call IDs, tracking IDs, etc.) - Recursively searched for those IDs across multiple indexes and services - Ran searches in parallel for speed -Search summary: {search_summary} - -This means you may have logs spanning MULTIPLE call legs, forwarded sessions, +This means the batch may contain logs spanning MULTIPLE call legs, forwarded sessions, retries, or related interactions that a single-ID search would have missed. -Use the search_summary to understand the scope: how many IDs were searched, -what depth the BFS reached, and what indexes were queried. -**IMPORTANT: You must analyze EVERY log entry. Do NOT skip or summarize groups of logs. -Read each log line, extract its meaning, and incorporate it into the analysis. -If there are hundreds of logs, produce a correspondingly detailed analysis.** +**IMPORTANT: You must analyze EVERY log entry in this batch. Do NOT skip or summarize \ +groups of logs. Read each log line, extract its meaning, and incorporate it into the analysis. 
+If there is a prior analysis summary, build upon it — focus on what is NEW in this batch.** """ _ANALYSIS_POINTS = """ @@ -116,103 +116,70 @@ def _make_model() -> LiteLlm: - Track the same transaction across service boundaries """ -_OUTPUT_STRUCTURE = """ -**Output Detail Level:** {detailed_analysis} -If detailed_analysis is false, empty, or not set: -- COMPLETELY OMIT the "HTTP Communication Flow (Detailed)" section — do not print its header, do not print a placeholder, do not mention it at all -- COMPLETELY OMIT the "SIP Communication Flow (Detailed)" section — do not print its header, do not print a placeholder, do not mention it at all -- Your output must end after the "Final Outcome" section. Nothing after it. - -If detailed_analysis is true: -- Include ALL sections below, including the detailed HTTP and SIP Communication Flow - -**Output structure (follow this EXACTLY):** - ---- -### ❗ Root Cause Analysis -(Place this section at the VERY TOP of your analysis—first section. If no errors/issues were found, state "No errors or issues detected" and briefly confirm the flow succeeded.) - -For EACH issue found: -→ **[Timestamp]**: ErrorType (ErrorCode) -→ **Service**: Which service generated the error -→ **Context**: What was happening when this error occurred -→ **Description**: Detailed explanation of what went wrong -→ **Potential Root Causes**: List all possible causes, ranked by likelihood -→ **Suggested Fix**: Clear, actionable steps to resolve -→ **Impact**: How did this error affect the call/session? -→ **Notes**: Documentation references, escalation contacts, related issues - ---- -### 🔍 Extracted Identifiers -List ALL unique identifiers found across all log sources: -- **User ID**: -- **Device ID**: -- **Tracking ID**: (print baseTrackingID_* to represent multiple suffixes. 
Don't print all suffixes) -- **Call ID** (Mobius): -- **Call ID** (SSE/SIP): -- **Session ID (local)**: -- **Session ID (remote)**: -- **Meeting ID** (if any): -- **Trace ID** (if any): - ---- -### 📊 Search Scope -- **IDs searched**: (from search_summary.total_ids_searched) -- **Indexes queried**: (list unique indexes from search_summary.search_history) -- **Total logs analyzed**: Mobius: X, SSE/MSE: Y, WxCAS: Z - ---- -### 🔗 Cross-Service Correlation -Map how the same transaction flows across services: -- Tracking ID X in Mobius → corresponds to Call-ID Y in SSE → routed via WxCAS as Z -- Note any missing correlations or gaps in the flow - ---- -### ⏱️ Timing Analysis -- **Call setup time**: (INVITE to 200 OK) -- **Total duration**: (first log to last log, or INVITE to BYE) -- **Notable delays**: List any gaps > 2 seconds between expected sequential events -- **Retries/Retransmissions**: Count and note if any - ---- -### ✅ Final Outcome -Provide a comprehensive summary of the entire flow: -- What type of call was this? (WebRTC-to-WebRTC, WebRTC-to-PSTN, etc.) -- Did the call succeed or fail? -- Complete signaling path taken -- Media path established (or not) -- Any degradation or issues even if the call succeeded - ---- -### 📡 HTTP Communication Flow (Detailed) -Place this section at the BOTTOM of your analysis, after Root Cause Analysis and all summaries. -List ALL HTTP requests and responses in strict chronological order. -Each entry should be ONE concise line with the format: - -→ **[Timestamp]** Source → Destination: METHOD /path - StatusCode (Brief description) - -Example: -→ **[2026-02-13T10:00:00Z]** Client → Mobius: POST /v1/calling/web/devices/.../call - 200 OK (Call initiation) -→ **[2026-02-13T10:00:01Z]** Mobius → CPAPI: GET /features - 200 OK (Feature retrieval) - -**Do NOT skip any HTTP interactions.** If there are 50 requests, list all 50. 
- ---- -### 📞 SIP Communication Flow (Detailed) -List ALL SIP messages in strict chronological order, after the HTTP Communication Flow. -Keep Mobius, SSE, MSE, and WxCAS as separate participants. - -→ **[Timestamp]** Source → Destination: SIP Method/Response - Call-ID: xxx - Description -→ **[Timestamp]** Mobius → SSE: SIP INVITE - Call-ID: SSE0520... - Initial call setup -→ **[Timestamp]** SSE → Mobius: 100 Trying - Call-ID: SSE0520... - Call being processed -→ **[Timestamp]** SSE → WxCAS: INVITE - Call-ID: SSE0520... - Routing to app server -→ **[Timestamp]** WxCAS → SSE: 200 OK - Call-ID: SSE0520... - Call accepted -→ **[Timestamp]** SSE → Mobius: 200 OK - Call-ID: SSE0520... - Final response - -Include SDP summary when available (codec, media type, ICE candidates count). -**Do NOT skip any SIP messages.** Reconstruct the COMPLETE dialog. - ---- +_JSON_OUTPUT_SCHEMA = """\ +## Output Format + +Output ONLY valid JSON — no markdown fences, no preamble, no explanation outside the JSON. +Your analysis from the sections above must be captured in the structured fields below. + +{ + "new_identifiers": { + "session_ids": [""], + "call_ids": [""], + "sip_call_ids": [""], + "sse_call_ids": [""], + "tracking_ids": [""], + "user_ids": [""], + "device_ids": [""], + "trace_ids": [""] + }, + "events": [ + { + "timestamp": "", + "type": "HTTP|SIP|media|routing|registration|websocket|error", + "source": "", + "destination": "", + "detail": "" + } + ], + "errors": [ + { + "timestamp": "", + "code": "", + "service": "", + "message": "", + "suspected_cause": "", + "context": "", + "suggested_fix": "", + "impact": "" + } + ], + "state_updates": [ + { + "timestamp": "", + "transition": "", + "from_state": "", + "to_state": "" + } + ], + "evidence_refs": [ + { + "doc_id": "", + "index": "", + "timestamp": "", + "category": "mobius|sse_mse|wxcas", + "relevance": "" + } + ], + "delta_summary": "<2-4 sentence summary of what THIS batch reveals that is NEW compared to the prior summary. 
Include: call type if identifiable, key milestones, errors found, cross-service correlations, timing anomalies.>" +} + +**Capture EVERY HTTP request/response and EVERY SIP message as individual events.** +Do NOT skip any. If there are 50 HTTP requests, produce 50 event entries. +If there are 20 SIP messages, produce 20 event entries. +Include SDP summaries (codec, media type, ICE candidates count) in SIP event details when available. + +If no items exist for a category, use an empty list []. """ @@ -243,7 +210,6 @@ def _make_model() -> LiteLlm: calling_agent = LlmAgent( model=_make_model(), name="calling_agent", - output_key="analyze_results", tools=[mobius_skill_toolset, architecture_skill_toolset, sip_flow_skill_toolset], instruction=f"""You are a senior VoIP/WebRTC debugging expert with deep expertise in HTTP, WebRTC, SIP, SDP, RTP, SRTP, DTLS, ICE, TCP, UDP, TLS, and related protocols. You produce EXHAUSTIVE, production-grade debug analyses that leave no log entry unexamined. @@ -251,12 +217,13 @@ def _make_model() -> LiteLlm: Use the **architecture_endpoints_skill** for service roles, signaling/media paths, and WebRTC Calling architecture (see references/architecture_and_endpoints.md — endpoints and WebRTC Calling sections). Use the **sip_flow_skill** for SIP message sequences, response code meanings, SDP negotiation details, SIP timers, and common failure patterns (see references/sip_flows.md). +Use the **mobius_error_id_skill** when you encounter mobius-error codes or unexpected HTTP status codes from Mobius. -**Log Sources — Analyze ALL of them thoroughly:** -1. **Mobius logs** from {{{{mobius_logs}}}} (logstash-wxm-app indexes) — HTTP/WebSocket signaling, SIP translation, device registration -2. **SSE/MSE logs** from {{{{sse_mse_logs}}}} (logstash-wxcalling indexes) — SIP edge signaling, media relay -3. **WxCAS logs** from {{{{wxcas_logs}}}} (logstash-wxcalling indexes) — Call routing, destination resolution, application server logic -4. 
**SDK/Client logs** from {{{{sdk_logs}}}} (uploaded by user) — Client-side SDK perspective (browser/app WebRTC logs) +**Log Sources in this batch — recognize them by their tags/index patterns:** +1. **Mobius logs** (logstash-wxm-app indexes, tags: mobius) — HTTP/WebSocket signaling, SIP translation, device registration +2. **SSE/MSE logs** (logstash-wxcalling indexes, tags: sse, mse) — SIP edge signaling, media relay +3. **WxCAS logs** (logstash-wxcalling indexes, tags: wxcas) — Call routing, destination resolution, application server logic +4. **SDK/Client logs** (uploaded by user) — Client-side SDK perspective (browser/app WebRTC logs) When SDK/Client logs are present, these provide the browser/app perspective. Correlate with server-side logs when both are available. @@ -276,7 +243,7 @@ def _make_model() -> LiteLlm: {_ANALYSIS_POINTS} -{_OUTPUT_STRUCTURE} +{_JSON_OUTPUT_SCHEMA} """, ) @@ -288,7 +255,6 @@ def _make_model() -> LiteLlm: contact_center_agent = LlmAgent( model=_make_model(), name="contact_center_agent", - output_key="analyze_results", tools=[architecture_skill_toolset, sip_flow_skill_toolset], instruction=f"""You are a senior VoIP/Contact Center debugging expert with deep expertise in HTTP, WebRTC, SIP, SDP, RTP, SRTP, DTLS, ICE, TCP, UDP, TLS, and related protocols. You produce EXHAUSTIVE, production-grade debug analyses that leave no log entry unexamined. @@ -297,17 +263,20 @@ def _make_model() -> LiteLlm: Use the **architecture_endpoints_skill** for service roles and Contact Center architecture (see references/architecture_and_endpoints.md — endpoints and Contact Center sections). Use the **sip_flow_skill** for SIP message sequences, response code meanings, SDP negotiation details, SIP timers, and common failure patterns (see references/sip_flows.md). -**Log Sources — Analyze ALL of them thoroughly:** -1. **Mobius logs** from {{{{mobius_logs}}}} (logstash-wxm-app indexes) — HTTP/WebSocket signaling, SIP translation -2. 
**SSE/MSE logs** from {{{{sse_mse_logs}}}} (logstash-wxcalling indexes) — SIP edge signaling, media relay -3. **WxCAS logs** from {{{{wxcas_logs}}}} (logstash-wxcalling indexes) — Call routing logic -4. **SDK/Client logs** from {{{{sdk_logs}}}} (uploaded by user) — Client-side SDK perspective +**Log Sources in this batch — recognize them by their tags/index patterns:** +1. **Mobius logs** (logstash-wxm-app indexes, tags: mobius) — HTTP/WebSocket signaling, SIP translation +2. **SSE/MSE logs** (logstash-wxcalling indexes, tags: sse, mse) — SIP edge signaling, media relay +3. **WxCAS logs** (logstash-wxcalling indexes, tags: wxcas) — Call routing logic +4. **SDK/Client logs** (uploaded by user) — Client-side SDK perspective When SDK/Client logs are present, these provide the browser/app perspective. Correlate with server-side logs when both are available. **Cross-source correlation is CRITICAL:** - The SAME call appears in multiple log sources with different perspectives - Correlate using shared IDs: Call-ID, Session ID, Tracking ID +- Mobius logs show the browser↔server HTTP side +- SSE logs show the SIP signaling side of the same events +- WxCAS logs show routing decisions - Present a UNIFIED view that stitches together all perspectives Because the search was exhaustive (BFS), you may see logs from MULTIPLE call legs, @@ -315,7 +284,7 @@ def _make_model() -> LiteLlm: {_ANALYSIS_POINTS} -{_OUTPUT_STRUCTURE} +{_JSON_OUTPUT_SCHEMA} """, ) @@ -324,35 +293,21 @@ def _make_model() -> LiteLlm: # Coordinator: Routes to calling or contact center based on serviceIndicator # ═══════════════════════════════════════════════════════════════════════════════ - -def _ensure_state_defaults(callback_context) -> None: - """Guarantee optional state keys exist so {var} references don't KeyError.""" - callback_context.state.setdefault("sdk_logs", "") - - -analyze_agent = LlmAgent( - name="analyze_agent_v2", - output_key="analyze_results", +batch_analysis_agent = LlmAgent( + 
name="batch_analysis_agent", model=_make_model(), - before_agent_callback=_ensure_state_defaults, instruction=""" -Context: You are analyzing logs from a WebRTC Calling or contact center flow, -which involves talking to different endpoints using protocols like HTTP, SIP, -WebRTC, SDP, RTP, TLS. - -You have access to the full search results from an exhaustive BFS search: -- Search summary: {search_summary} -- Mobius logs: {mobius_logs} -- SSE/MSE logs: {sse_mse_logs} -- WxCAS logs: {wxcas_logs} -- SDK/Client logs: {sdk_logs} - -Use `serviceIndicator` from logs to classify the session: +You are a log analysis router. You will receive a batch of condensed log entries +from a Webex Calling / Contact Center platform. + +Look at the log entries for `serviceIndicator` fields to classify the session: - `calling`, `guestCalling` → WebRTC Calling Flow, transfer to `calling_agent` - `contactCenter` → Contact Center Flow, transfer to `contact_center_agent` If no `serviceIndicator` is found, default to `calling_agent`. + +Transfer the FULL user message (batch data) to the selected agent. """, - description="Routes analysis to Calling or ContactCenter agent based on serviceIndicator in logs.", + description="Routes batch analysis to Calling or ContactCenter agent based on serviceIndicator in logs.", sub_agents=[calling_agent, contact_center_agent], ) diff --git a/agents/analyze_agent_v2/incremental.py b/agents/analyze_agent_v2/incremental.py index 7b3d7a3..a5eafe9 100644 --- a/agents/analyze_agent_v2/incremental.py +++ b/agents/analyze_agent_v2/incremental.py @@ -650,120 +650,190 @@ def format_to_markdown( The output mirrors the section structure expected by downstream agents (sequence_diagram, chat_agent). 
""" - sections = [] - - # ── Root Cause Analysis (errors) ── - sections.append("---\n### Root Cause Analysis") + lines: list[str] = [] errors = rolling.get("errors", []) + timeline = rolling.get("timeline", []) + ids = rolling.get("identifiers", {}) + summary = rolling.get("summary", "") + + # ── Root Cause Analysis ── + lines.append("---") + lines.append("### Root Cause Analysis\n") if not errors: - sections.append( - "No errors or issues detected. The flow appears to have completed normally." + lines.append( + "No errors or issues detected. The flow appears to have completed normally.\n" ) else: - for err in errors: + for i, err in enumerate(errors, 1): ts = err.get("timestamp", "unknown") code = err.get("code", "N/A") svc = err.get("service", "unknown") msg = err.get("message", "") cause = err.get("suspected_cause", "") - sections.append( - f"**[{ts}]**: {code}\n" - f" **Service**: {svc}\n" - f" **Description**: {msg}\n" - f" **Suspected Root Cause**: {cause}" - ) + ctx = err.get("context", "") + fix = err.get("suggested_fix", "") + impact = err.get("impact", "") + + lines.append(f"**{i}. 
[{ts}]** — `{code}`\n") + lines.append("| Field | Detail |") + lines.append("|-------|--------|") + lines.append(f"| **Service** | {svc} |") + lines.append(f"| **Description** | {msg} |") + if ctx: + lines.append(f"| **Context** | {ctx} |") + lines.append(f"| **Root Cause** | {cause} |") + if fix: + lines.append(f"| **Suggested Fix** | {fix} |") + if impact: + lines.append(f"| **Impact** | {impact} |") + lines.append("") # ── Extracted Identifiers ── - sections.append("\n---\n### Extracted Identifiers") - ids = rolling.get("identifiers", {}) + lines.append("---") + lines.append("### Extracted Identifiers\n") label_map = { - "session_ids": "Session ID", + "tracking_ids": "Tracking ID", "call_ids": "Call ID (Mobius)", "sip_call_ids": "Call ID (SIP)", "sse_call_ids": "Call ID (SSE)", - "tracking_ids": "Tracking ID", + "session_ids": "Session ID", "user_ids": "User ID", "device_ids": "Device ID", "trace_ids": "Trace ID", } + has_any_id = False for key, label in label_map.items(): vals = ids.get(key, []) if vals: - sections.append(f"- **{label}**: {', '.join(vals)}") - else: - sections.append(f"- **{label}**: (not found)") + has_any_id = True + lines.append(f"- **{label}**: `{'`, `'.join(vals)}`") + if not has_any_id: + lines.append("No identifiers extracted.") + lines.append("") # ── Search Scope ── if search_summary: - sections.append("\n---\n### Search Scope") - sections.append(search_summary) + lines.append("---") + lines.append("### Search Scope\n") + lines.append(search_summary) + lines.append("") # ── Cross-Service Correlation ── - sections.append("\n---\n### Cross-Service Correlation") + lines.append("---") + lines.append("### Cross-Service Correlation\n") corrs = rolling.get("cross_service_correlations", []) if corrs: for c in corrs: - sections.append(f"- {c}") + lines.append(f"- {c}") else: - summary_text = rolling.get("summary", "") - if "cross" in summary_text.lower() or "correlat" in summary_text.lower(): - sections.append("(See analysis summary below for 
cross-service details)") + if "cross" in summary.lower() or "correlat" in summary.lower(): + lines.append("(See Final Outcome below for cross-service details)") else: - sections.append("No explicit cross-service correlations captured.") + lines.append("No explicit cross-service correlations captured.") + lines.append("") # ── Timing Analysis ── - sections.append("\n---\n### Timing Analysis") - timeline = rolling.get("timeline", []) + lines.append("---") + lines.append("### Timing Analysis\n") if timeline: - first = timeline[0].get("timestamp", "") - last = timeline[-1].get("timestamp", "") - sections.append(f"- **First event**: {first}") - sections.append(f"- **Last event**: {last}") - sections.append(f"- **Events captured**: {len(timeline)}") - - sip_events = [e for e in timeline if e.get("type") == "SIP"] - if sip_events: - sections.append(f"- **SIP messages**: {len(sip_events)}") + first_ts = timeline[0].get("timestamp", "") + last_ts = timeline[-1].get("timestamp", "") + sip_evts = [e for e in timeline if e.get("type") == "SIP"] + http_evts = [e for e in timeline if e.get("type") == "HTTP"] + error_evts = [e for e in timeline if e.get("type") == "error"] + + lines.append("| Metric | Value |") + lines.append("|--------|-------|") + lines.append(f"| **First event** | {first_ts} |") + lines.append(f"| **Last event** | {last_ts} |") + lines.append(f"| **Total events** | {len(timeline)} |") + if http_evts: + lines.append(f"| **HTTP requests** | {len(http_evts)} |") + if sip_evts: + lines.append(f"| **SIP messages** | {len(sip_evts)} |") + if error_evts: + lines.append(f"| **Error events** | {len(error_evts)} |") else: - sections.append("No timeline events captured.") + lines.append("No timeline events captured.") + lines.append("") - # ── Final Outcome (analysis summary) ── - sections.append("\n---\n### Final Outcome") - summary = rolling.get("summary", "") + # ── Final Outcome ── + lines.append("---") + lines.append("### Final Outcome\n") if summary: - 
sections.append(summary) + lines.append(summary) else: - sections.append("Analysis produced no summary.") + lines.append("Analysis produced no summary.") + lines.append("") - # ── Timeline (condensed) ── + # ── Communication Flow (split by protocol) ── if timeline: - sections.append("\n---\n### Communication Flow") - for event in timeline: - ts = event.get("timestamp", "?") - etype = event.get("type", "") - src = event.get("source", "?") - dst = event.get("destination", "?") - detail = event.get("detail", "") - sections.append(f"**[{ts}]** {src} -> {dst}: {etype} {detail}") + http_evts = [e for e in timeline if e.get("type") == "HTTP"] + sip_evts = [e for e in timeline if e.get("type") == "SIP"] + other_evts = [e for e in timeline if e.get("type") not in ("HTTP", "SIP")] + + if http_evts: + lines.append("---") + lines.append(f"### HTTP Communication Flow ({len(http_evts)} requests)\n") + for ev in http_evts: + ts = ev.get("timestamp", "?") + src = ev.get("source", "?") + dst = ev.get("destination", "?") + detail = ev.get("detail", "") + lines.append(f"- **[{ts}]** {src} \u2192 {dst}: {detail}") + lines.append("") + + if sip_evts: + lines.append("---") + lines.append(f"### SIP Communication Flow ({len(sip_evts)} messages)\n") + for ev in sip_evts: + ts = ev.get("timestamp", "?") + src = ev.get("source", "?") + dst = ev.get("destination", "?") + detail = ev.get("detail", "") + lines.append(f"- **[{ts}]** {src} \u2192 {dst}: {detail}") + lines.append("") + + if other_evts: + lines.append("---") + lines.append(f"### Other Events ({len(other_evts)})\n") + for ev in other_evts: + ts = ev.get("timestamp", "?") + etype = ev.get("type", "") + src = ev.get("source", "?") + dst = ev.get("destination", "?") + detail = ev.get("detail", "") + lines.append(f"- **[{ts}]** `{etype}` {src} \u2192 {dst}: {detail}") + lines.append("") # ── Evidence References ── if evidence_index: - sections.append(f"\n---\n### Evidence Index ({len(evidence_index)} references)") - for i, ref in 
enumerate(evidence_index[:20], 1): + lines.append("---") + lines.append(f"### Evidence Index ({len(evidence_index)} references)\n") + display_refs = evidence_index[:25] + lines.append("| # | Doc ID | Index | Category | Timestamp | Relevance |") + lines.append("|---|--------|-------|----------|-----------|-----------|") + for i, ref in enumerate(display_refs, 1): doc_id = ref.get("doc_id", "?") idx = ref.get("index", "?") ts = ref.get("timestamp", "?") cat = ref.get("category", "?") rel = ref.get("relevance", "") - sections.append(f"{i}. `{doc_id}` ({idx}, {cat}) [{ts}] — {rel}") - if len(evidence_index) > 20: - sections.append(f" ... and {len(evidence_index) - 20} more references") + lines.append(f"| {i} | `{doc_id}` | {idx} | {cat} | {ts} | {rel} |") + if len(evidence_index) > 25: + lines.append(f"\n*... and {len(evidence_index) - 25} more references*") + lines.append("") # ── Stats ── - sections.append(f"\n---\n*Analysis: {rolling.get('batch_count', 0)} batches processed, " - f"{rolling.get('evidence_count', 0)} evidence references collected.*") + lines.append("---") + lines.append( + f"*Analysis: {rolling.get('batch_count', 0)} batches processed, " + f"{len(errors)} errors found, " + f"{len(timeline)} events captured, " + f"{rolling.get('evidence_count', 0)} evidence references collected.*" + ) - return "\n".join(sections) + return "\n".join(lines) # ═══════════════════════════════════════════════════════════════════════════════ diff --git a/agents/chat_agent/agent.py b/agents/chat_agent/agent.py index f5d6993..7169523 100644 --- a/agents/chat_agent/agent.py +++ b/agents/chat_agent/agent.py @@ -251,8 +251,21 @@ def get_search_summary(tool_context: ToolContext) -> dict: If the user doesn't specify which service, ask: "Which logs? Mobius, SSE/MSE, WxCAS, or SDK?" Return logs as received — preserve JSON, sort by @timestamp ascending. -If user asks for ALL logs, warn: "This is a large output. Continue?" - then call get_raw_logs("all"). 
+ +**Chunking large log output:** +When the logs returned by get_raw_logs are large (roughly more than +50 log entries or the output would exceed ~4000 characters), you MUST +split the output into sequential chunks instead of dumping everything +at once. Follow this pattern: + + 1. Tell the user the total count and that you will send in parts: + "Found **142 Mobius log entries**. Sending in chunks…" + 2. Send the first chunk (roughly 30–50 entries) in a JSON code block. + 3. End each chunk with: "**[Chunk 1/N]** — Reply 'next' or 'continue' + for the next batch, or 'stop' to end." + 4. On each follow-up, send the next chunk until all logs are delivered. + 5. If the user asks for ALL services at once, send one service at a + time (e.g. Mobius first, then SSE/MSE, etc.) with clear headers. ── DIAGRAM REQUESTS ("show diagram", "give PlantUML") ── diff --git a/agents/visualAgent/agent.py b/agents/visualAgent/agent.py index b79debb..ae52e5e 100644 --- a/agents/visualAgent/agent.py +++ b/agents/visualAgent/agent.py @@ -76,12 +76,21 @@ ✓ Example: participant "Webex SDK/Client" as Client #E3F2FD ✓ Example: participant "Mobius" as Mobius #BBDEFB +**CRITICAL: ALL entities used in arrow messages MUST be declared as participants FIRST.** +If the analysis mentions a device, user, or entity by a long ID (e.g. 
d8ac9405-e6c7-30e9-...), +declare it as a participant with a short alias: +✓ CORRECT: participant "Device d8ac9405" as Device #E8EAF6 + Then use: Mobius -> Device: Start Client Event +✗ WRONG: Mobius -> Device d8ac9405-e6c7-30e9-b60f-1613fe6f2986: Start Client Event + (spaces in name and undeclared participant cause syntax errors) + ✗ FORBIDDEN (causes diagram type misdetection): - Using actor keyword: actor "Name" as Alias - Using entity, boundary, control, database, collections keywords - Using stereotypes: participant "Name" as Alias <> - Empty angle brackets: participant "Name" as Alias <> - Missing color assignment: participant "Name" as Alias + - Using undeclared names in arrow messages **RULE 2a: Color Assignment (MANDATORY)** ALL participants MUST have direct color assignment: @@ -104,8 +113,9 @@ - No spaces: Client->Mobius (WRONG) - Double arrows: Client ->> Mobius (wrong notation) - Bidirectional: Client <-> Mobius (not supported) - - Multiple arrows on same line + - Multiple/chained arrows on same line: A -> B -[#00AA00]-> C (WRONG - use two separate lines) - Wrong spacing: Client- >Mobius or Client -> Mobius (uneven spaces) + - Undeclared participants: EVERY name in an arrow MUST be a declared participant alias - **CRITICAL:** Splitting message across lines (see below) **CRITICAL LINE BREAK RULE FOR ARROWS:** @@ -198,6 +208,8 @@ - Missing endlegend - Not wrapping long values - Unescaped special characters breaking table + - Using backslash \ at end of a source line for line continuation (NOT valid PlantUML) + - Splitting a legend table row across multiple source lines **RULE 6: Color Codes (STRICT HEX FORMAT)** @@ -446,6 +458,9 @@ 2. Splitting legend table rows across lines 3. Breaking IDs/URLs with line breaks 4. Using actual line breaks instead of \n for display wrapping +5. Using undeclared participant names in arrows — declare ALL entities first +6. Chaining multiple arrows on one line (WRONG: A -> B -[#color]-> C — split into two lines) +7. 
Using backslash \ at end of source line for continuation (NOT valid PlantUML — use \n inside strings) Return the PlantUML diagram now.''' ) From e1e1042588a4ca1653f00fddfce4c8f4fcc089b2 Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Thu, 12 Mar 2026 18:55:19 +0530 Subject: [PATCH 5/9] fix: update analysis --- agents/analyze_agent_v2/incremental.py | 350 +++++-------------------- agents/chat_agent/agent.py | 133 ++++++---- agents/search_agent_v2/agent.py | 2 +- 3 files changed, 155 insertions(+), 330 deletions(-) diff --git a/agents/analyze_agent_v2/incremental.py b/agents/analyze_agent_v2/incremental.py index a5eafe9..2f6b02d 100644 --- a/agents/analyze_agent_v2/incremental.py +++ b/agents/analyze_agent_v2/incremental.py @@ -1,9 +1,13 @@ """ Incremental Map-Reduce Analysis — processes log batches as they arrive from search. +MAP step invokes batch_analysis_agent (from agent.py) via ADK Runner, giving each +batch the full power of calling_agent / contact_center_agent with skills and routing. +A fresh session is created per batch to prevent memory growth. 
+ Exports a clean function interface consumed by search_agent_v2: - new_rolling_analysis() → empty rolling state - - map_batch() → MAP: one batch + compact memory → structured JSON + - map_batch() → MAP: one batch via ADK Runner → structured JSON - reduce() → REDUCE: merge map output into rolling state - compress_analysis_summary() → shrink rolling summary when it exceeds token cap - format_to_markdown() → convert final rolling state to markdown report @@ -15,119 +19,30 @@ import json import logging import os +import uuid from typing import Any import litellm from dotenv import load_dotenv from pathlib import Path +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService +from google.genai import types + env_path = Path(__file__).parent.parent / ".env" load_dotenv(dotenv_path=env_path) logger = logging.getLogger(__name__) -# Re-use TokenBudget from search_agent_v2 (imported by callers, passed in as arg). -# We only reference the type for documentation; no import needed at module level. 
+from analyze_agent_v2.agent import batch_analysis_agent # ═══════════════════════════════════════════════════════════════════════════════ -# Skill References (loaded on-demand via tool calls) +# ADK Runner setup (reused across all map_batch calls) # ═══════════════════════════════════════════════════════════════════════════════ -_SKILLS_DIR = Path(__file__).parent / "skills" - -_SKILL_FILE_MAP = { - "lookup_mobius_error_codes": _SKILLS_DIR / "mobius-error-id-skill" / "references" / "mobius_error_ids.md", - "lookup_architecture": _SKILLS_DIR / "architecture-endpoints-skill" / "references" / "architecture_and_endpoints.md", - "lookup_sip_flows": _SKILLS_DIR / "sip-flow-skill" / "references" / "sip_flows.md", - "lookup_calling_flow": _SKILLS_DIR / "architecture-endpoints-skill" / "references" / "calling_flow.md", - "lookup_contact_center_flow": _SKILLS_DIR / "architecture-endpoints-skill" / "references" / "contact_center_flow.md", -} - -_SKILL_CACHE: dict[str, str] = {} - - -def _load_skill_reference(name: str) -> str: - """Load a skill reference file, with caching.""" - if name in _SKILL_CACHE: - return _SKILL_CACHE[name] - path = _SKILL_FILE_MAP.get(name) - if not path or not path.exists(): - return f"Reference '{name}' not found." - content = path.read_text(encoding="utf-8") - _SKILL_CACHE[name] = content - logger.info(f"[_load_skill_reference] Loaded {name}: {len(content)} chars") - return content - - -_TOOL_DEFINITIONS = [ - { - "type": "function", - "function": { - "name": "lookup_mobius_error_codes", - "description": ( - "Look up Mobius HTTP error codes and mobius-error codes " - "(e.g., 101, 102, 103, 403, 503). Returns detailed reference " - "with root cause direction, user impact, and what to check in logs. " - "Call this when you see mobius-error codes or unexpected HTTP status " - "codes from Mobius in the log batch." 
- ), - "parameters": {"type": "object", "properties": {}, "required": []}, - }, - }, - { - "type": "function", - "function": { - "name": "lookup_architecture", - "description": ( - "Look up Webex Calling / Contact Center architecture: service roles " - "(Mobius, SSE, MSE, WxCAS, CPAPI, Mercury, WDM, U2C), signaling and " - "media paths, call types, multi-instance deployment, timers, failover. " - "Call this when you need to understand how services connect or what a " - "specific component does." - ), - "parameters": {"type": "object", "properties": {}, "required": []}, - }, - }, - { - "type": "function", - "function": { - "name": "lookup_sip_flows", - "description": ( - "Look up SIP message flow references: call setup (INVITE transaction), " - "early media (183), hold/resume (re-INVITE), call transfer (REFER), " - "registration (REGISTER), SIP response codes, SDP negotiation, timers, " - "and common failure patterns. Call this when you see SIP messages in logs " - "and need to verify the expected flow or diagnose a SIP failure." - ), - "parameters": {"type": "object", "properties": {}, "required": []}, - }, - }, - { - "type": "function", - "function": { - "name": "lookup_calling_flow", - "description": ( - "Look up WebRTC Calling end-to-end flow: signaling path, media path, " - "call types (WebRTC-to-WebRTC, WebRTC-to-PSTN, WebRTC-to-DeskPhone). " - "Call this when analyzing a standard calling flow." - ), - "parameters": {"type": "object", "properties": {}, "required": []}, - }, - }, - { - "type": "function", - "function": { - "name": "lookup_contact_center_flow", - "description": ( - "Look up Contact Center architecture: Kamailio SIP proxy, RTMS, RAS, " - "health ping endpoints, Mobius timers, Kafka failover, inter-regional " - "failover. Call this when logs indicate a Contact Center flow." 
- ), - "parameters": {"type": "object", "properties": {}, "required": []}, - }, - }, -] - +_APP_NAME = "log-analyzer-incremental" +_USER_ID = "incremental-pipeline" def _handle_tool_calls(tool_calls: list) -> list[dict]: """Execute tool calls and return tool result messages.""" @@ -226,108 +141,9 @@ def _parse_json_from_llm(raw: Any) -> dict: # ═══════════════════════════════════════════════════════════════════════════════ -# MAP Step +# MAP Step — invokes batch_analysis_agent via ADK Runner # ═══════════════════════════════════════════════════════════════════════════════ -_MAP_INSTRUCTION = """\ -You are an analysis agent with deep expertise in HTTP, WebRTC, \ -SIP protocols and their interactions. You will receive a BATCH of microservice log entries \ -(condensed JSON) and a PRIOR ANALYSIS SUMMARY from earlier batches. - -These logs come from a Webex Calling / Contact Center platform. You have access to \ -reference tools — use them when you need detailed knowledge: - -- **lookup_mobius_error_codes**: Call when you see `mobius-error` codes or unexpected HTTP \ -status codes from Mobius. Returns code-level root cause and debugging guidance. -- **lookup_architecture**: Call when you need to understand service roles (Mobius, SSE, MSE, \ -WxCAS, CPAPI, Mercury, WDM, U2C), signaling/media paths, or how services interconnect. -- **lookup_sip_flows**: Call when analyzing SIP messages (INVITE, BYE, REGISTER, re-INVITE, \ -REFER, etc.) and you need the expected sequence, SDP details, or failure patterns. -- **lookup_calling_flow**: Call when analyzing an end-to-end calling flow (WebRTC-to-WebRTC, \ -WebRTC-to-PSTN, WebRTC-to-DeskPhone). -- **lookup_contact_center_flow**: Call when logs indicate a Contact Center scenario (Kamailio, \ -RTMS, RAS, health pings, Kafka failover). - -## Output Schema - -Analyze THIS batch and produce a structured JSON object. \ -Output ONLY valid JSON — no markdown fences, no preamble. 
- -{ - "new_identifiers": { - "session_ids": [""], - "call_ids": [""], - "sip_call_ids": [""], - "sse_call_ids": [""], - "tracking_ids": [""], - "user_ids": [""], - "device_ids": [""], - "trace_ids": [""] - }, - "events": [ - { - "timestamp": "", - "type": "HTTP|SIP|media|routing|registration|websocket|error", - "source": "", - "destination": "", - "detail": "" - } - ], - "errors": [ - { - "timestamp": "", - "code": "", - "service": "", - "message": "", - "suspected_cause": "" - } - ], - "state_updates": [ - { - "timestamp": "", - "transition": "", - "from_state": "", - "to_state": "" - } - ], - "evidence_refs": [ - { - "doc_id": "", - "index": "", - "timestamp": "", - "category": "mobius|sse_mse|wxcas", - "relevance": "" - } - ], - "delta_summary": "<2-4 sentence summary of what THIS batch reveals that is NEW compared to the prior summary>" -} - -## Analysis Guidance - -Be THOROUGH and EXHAUSTIVE — every log entry matters for debugging. - -- **HTTP**: capture every request/response with timestamp, source→destination, method, \ -full path, status code, relevant IDs. Flag non-2xx responses. Note latency if visible. -- **SIP**: capture INVITE, 100 Trying, 180 Ringing, 183 Session Progress, 200 OK, ACK, \ -BYE, CANCEL, UPDATE, re-INVITE, PRACK, REFER with Call-ID and CSeq. Extract SDP details \ -(codec, media type, ICE candidates) when visible. Identify retransmissions and timeouts. \ -Use **lookup_sip_flows** if you need to verify the expected sequence. -- **Errors**: every non-2xx HTTP, every 4xx/5xx/6xx SIP, every mobius-error code, every \ -logged error/warning/exception. Use **lookup_mobius_error_codes** for Mobius-specific codes. -- **State transitions**: call state changes (idle→calling→connected→disconnected), \ -registration state (unregistered→registered→expired), SIP dialog state, media negotiation. -- **Cross-service correlation**: the SAME call appears in Mobius (HTTP side), SSE (SIP side), \ -and WxCAS (routing side) with shared IDs. 
Note when you see the same transaction across services. \ -Identify gaps. Use **lookup_architecture** if you need to understand the expected path. -- **Timing**: note delays >2s between expected sequential events. Calculate setup time \ -(INVITE to 200 OK). Flag timeouts. -- **Evidence**: mark log entries critical for debugging (errors, state changes, first/last events, \ -SIP milestones). -- **delta_summary**: focus on what is NEW in this batch vs the prior summary — avoid repeating. - -If no items exist for a category, use an empty list []. -""" - _MAP_USER_TEMPLATE = """\ ## Prior Analysis Summary {compact_memory} @@ -340,14 +156,17 @@ def _parse_json_from_llm(raw: Any) -> dict: async def map_batch( condensed_hits: list[dict], compact_memory: str, - budget: "TokenBudget", ) -> dict: - """MAP step: analyze one batch of log entries via LLM. + """MAP step: analyze one batch of log entries via ADK Runner. + + Creates a fresh session per batch to prevent memory growth, sends the + batch + compact_memory as a user message, and collects the structured + JSON response from the batch_analysis_agent (which routes to + calling_agent or contact_center_agent with full skills). Args: condensed_hits: list of condensed log entries (from extract_id_fields_for_llm) compact_memory: the rolling_analysis["summary"] from prior batches (few KB) - budget: TokenBudget instance for tracking/limiting token usage Returns: MapOutput dict matching the schema in _MAP_INSTRUCTION, or empty dict on failure. 
@@ -360,82 +179,59 @@ async def map_batch( batch_json=batch_json, ) - full_prompt = _MAP_INSTRUCTION + user_content - est_tokens = _estimate_tokens(full_prompt) + session_id = f"batch-{uuid.uuid4().hex[:12]}" - if budget and not budget.can_afford(full_prompt): - allowed_chars = ( - budget.remaining_stage() * CHARS_PER_TOKEN_ESTIMATE - - len(_MAP_INSTRUCTION) - - len(_MAP_USER_TEMPLATE) - - len(compact_memory or "") - - 200 + try: + session = await _session_service.create_session( + app_name=_APP_NAME, + user_id=_USER_ID, + session_id=session_id, ) - if allowed_chars < 500: - logger.warning("[map_batch] Budget too tight, skipping batch") - return {} - batch_json = batch_json[:allowed_chars] - user_content = _MAP_USER_TEMPLATE.format( - compact_memory=compact_memory or "(No prior analysis — this is the first batch)", - batch_json=batch_json, + + user_message = types.Content( + role="user", + parts=[types.Part.from_text(text=user_content)], ) - logger.info(f"[map_batch] Trimmed batch for budget: {len(batch_json)} chars") - MAX_TOOL_ROUNDS = 3 + final_text = "" + async for event in _runner.run_async( + user_id=_USER_ID, + session_id=session.id, + new_message=user_message, + ): + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + final_text = part.text - messages = [ - {"role": "system", "content": _MAP_INSTRUCTION}, - {"role": "user", "content": user_content}, - ] + result = _parse_json_from_llm(final_text or "{}") - try: - for _round in range(MAX_TOOL_ROUNDS + 1): - response = await litellm.acompletion( - model="openai/gpt-4.1", - api_key=api_key, - api_base=api_base, - extra_headers={"x-cisco-app": "microservice-log-analyzer"}, - messages=messages, - tools=_TOOL_DEFINITIONS, - tool_choice="auto", - temperature=0, - ) - if budget: - budget.record_usage(est_tokens) - - choice = response.choices[0] - - if choice.finish_reason == "tool_calls" or ( - choice.message.tool_calls and not choice.message.content - ): - tool_calls = 
choice.message.tool_calls - logger.info( - f"[map_batch] Round {_round}: LLM requested " - f"{len(tool_calls)} skill(s): " - f"{[tc.function.name for tc in tool_calls]}" - ) - messages.append(choice.message) - messages.extend(_handle_tool_calls(tool_calls)) - continue - - raw = choice.message.content or "{}" - result = _parse_json_from_llm(raw) - - logger.info( - f"[map_batch] Extracted (after {_round} tool round(s)): " - f"events={len(result.get('events', []))}, " - f"errors={len(result.get('errors', []))}, " - f"state_updates={len(result.get('state_updates', []))}, " - f"evidence_refs={len(result.get('evidence_refs', []))}" - ) - return result + logger.info( + f"[map_batch] ADK Runner result (session={session_id}): " + f"events={len(result.get('events', []))}, " + f"errors={len(result.get('errors', []))}, " + f"state_updates={len(result.get('state_updates', []))}, " + f"evidence_refs={len(result.get('evidence_refs', []))}" + ) + return result logger.warning("[map_batch] Exhausted tool rounds, returning last response") return _parse_json_from_llm(response.choices[0].message.content or "{}") except Exception as e: - logger.error(f"[map_batch] LLM call failed: {e}") + logger.error(f"[map_batch] ADK Runner call failed: {e}") return {} + finally: + try: + await _session_service.delete_session( + app_name=_APP_NAME, + user_id=_USER_ID, + session_id=session_id, + ) + except Exception: + pass + + # ═══════════════════════════════════════════════════════════════════════════════ @@ -579,10 +375,7 @@ def _prune_timeline(timeline: list[dict]) -> list[dict]: """ -async def compress_analysis_summary( - rolling: dict, - budget: "TokenBudget", -) -> dict: +async def compress_analysis_summary(rolling: dict) -> dict: """Compress rolling_analysis['summary'] when it exceeds ROLLING_SUMMARY_TOKEN_CAP. 
Calls the LLM to produce a shorter version that preserves errors, IDs, and @@ -618,11 +411,6 @@ async def compress_analysis_summary( old_tokens = current_tokens new_tokens = _estimate_tokens(compressed) - if budget: - budget.record_usage( - _estimate_tokens(summary) + _estimate_tokens(_ANALYSIS_COMPRESS_INSTRUCTION) - ) - rolling["summary"] = compressed logger.info( f"[compress_analysis_summary] Compressed: {old_tokens} -> {new_tokens} tokens " @@ -845,7 +633,6 @@ def format_to_markdown( async def run_analysis_consumer( queue: "asyncio.Queue[list[dict] | None]", - budget: "TokenBudget", search_summary: str = "", ) -> tuple[str, dict, list[dict]]: """Consume condensed hit batches from an asyncio.Queue and run MAP-REDUCE. @@ -858,7 +645,6 @@ async def run_analysis_consumer( Args: queue: asyncio.Queue fed by the search producer; items are list[dict] (condensed hits) or None (sentinel). - budget: TokenBudget instance shared with the caller. search_summary: optional search_summary string for the final markdown. Returns: @@ -884,13 +670,13 @@ async def run_analysis_consumer( compact_memory = rolling["summary"] - map_output = await map_batch(condensed_hits, compact_memory, budget) + map_output = await map_batch(condensed_hits, compact_memory) if map_output: rolling, evidence_index = reduce(rolling, map_output, evidence_index) summary_tokens = _estimate_tokens(rolling.get("summary", "")) if summary_tokens > ROLLING_SUMMARY_TOKEN_CAP: - rolling = await compress_analysis_summary(rolling, budget) + rolling = await compress_analysis_summary(rolling) queue.task_done() @@ -912,7 +698,6 @@ async def run_analysis_consumer( async def analyze_upload_only( sdk_logs: str, - budget: "TokenBudget | None" = None, ) -> tuple[str, dict, list[dict]]: """Analyze SDK logs that were uploaded directly (no OpenSearch search). @@ -921,7 +706,6 @@ async def analyze_upload_only( Args: sdk_logs: raw log text pasted or uploaded by the user. - budget: optional TokenBudget for controlling LLM spend. 
Returns: (markdown_report, rolling_analysis, evidence_index) @@ -941,14 +725,14 @@ async def analyze_upload_only( for i, line in enumerate(batch_lines)] compact_memory = rolling["summary"] - map_output = await map_batch(condensed, compact_memory, budget) + map_output = await map_batch(condensed, compact_memory) if map_output: rolling, evidence_index = reduce(rolling, map_output, evidence_index) summary_tokens = _estimate_tokens(rolling.get("summary", "")) if summary_tokens > ROLLING_SUMMARY_TOKEN_CAP: - rolling = await compress_analysis_summary(rolling, budget) + rolling = await compress_analysis_summary(rolling) markdown = format_to_markdown(rolling, evidence_index, search_summary="(SDK log upload)") logger.info( diff --git a/agents/chat_agent/agent.py b/agents/chat_agent/agent.py index 7169523..94db22d 100644 --- a/agents/chat_agent/agent.py +++ b/agents/chat_agent/agent.py @@ -1,3 +1,5 @@ +import json +import math import os from pathlib import Path from dotenv import load_dotenv @@ -16,6 +18,15 @@ ) from search_agent_v2.agent import _log_cache +_SERVICE_KEY_MAP = { + "mobius": "mobius_logs", + "sse_mse": "sse_mse_logs", + "sse": "sse_mse_logs", + "mse": "sse_mse_logs", + "wxcas": "wxcas_logs", + "sdk": "sdk_logs", +} + def _get_state_or_cache(tool_context: ToolContext, key: str) -> str: """Read from tool_context.state first; fall back to the module-level log cache.""" @@ -26,45 +37,74 @@ def _get_state_or_cache(tool_context: ToolContext, key: str) -> str: return _log_cache.get(session_id, {}).get(key, "") -def get_raw_logs(service: str, tool_context: ToolContext) -> dict: - """Retrieve raw logs for a specific service from the current analysis. 
+def _parse_log_entries(raw: str) -> list[dict]: + """Parse a JSON log string into a list of dicts, returning [] on failure.""" + if not raw: + return [] + try: + entries = json.loads(raw) + return entries if isinstance(entries, list) else [] + except (json.JSONDecodeError, TypeError): + return [] + + +def get_raw_logs(service: str, page: int, tool_context: ToolContext, page_size: int = 30) -> dict: + """Retrieve a paginated chunk of raw logs for a specific service. + + Returns only one page at a time so the full log set is never loaded + into the LLM context. Call with page=1 for the first chunk, then + increment to get subsequent chunks. Args: - service: One of "mobius", "sse_mse", "wxcas", "sdk", or "all". + service: One of "mobius", "sse_mse", "wxcas", "sdk". + Use "all" to get a count summary of all services (no log entries). + page: 1-based page number. Start with 1. + page_size: Number of log entries per page (default 30, max 50). Returns: - A dict with the requested logs, or an error if not available. + A dict with: entries (list), page, total_pages, total_entries, has_more. 
""" - key_map = { - "mobius": "mobius_logs", - "sse_mse": "sse_mse_logs", - "sse": "sse_mse_logs", - "mse": "sse_mse_logs", - "wxcas": "wxcas_logs", - "sdk": "sdk_logs", - } - service_lower = service.lower().strip() + page_size = max(1, min(page_size, 50)) if service_lower == "all": + summary = {} + for svc, key in [("mobius", "mobius_logs"), ("sse_mse", "sse_mse_logs"), + ("wxcas", "wxcas_logs"), ("sdk", "sdk_logs")]: + entries = _parse_log_entries(_get_state_or_cache(tool_context, key)) + summary[svc] = len(entries) return { - "mobius_logs": _get_state_or_cache(tool_context, "mobius_logs"), - "sse_mse_logs": _get_state_or_cache(tool_context, "sse_mse_logs"), - "wxcas_logs": _get_state_or_cache(tool_context, "wxcas_logs"), - "sdk_logs": _get_state_or_cache(tool_context, "sdk_logs"), + "message": "Use get_raw_logs with a specific service name and page=1 to fetch entries.", + "log_counts": summary, } - state_key = key_map.get(service_lower) + state_key = _SERVICE_KEY_MAP.get(service_lower) if not state_key: return { "error": f"Unknown service '{service}'. 
Use one of: mobius, sse_mse, wxcas, sdk, all.", } - logs = _get_state_or_cache(tool_context, state_key) - if not logs: - return {"logs": "", "message": f"No {service} logs available in the current analysis."} - - return {"logs": logs} + all_entries = _parse_log_entries(_get_state_or_cache(tool_context, state_key)) + total = len(all_entries) + + if total == 0: + return {"entries": [], "page": 1, "total_pages": 0, + "total_entries": 0, "has_more": False, + "message": f"No {service} logs available in the current analysis."} + + total_pages = math.ceil(total / page_size) + page = max(1, min(page, total_pages)) + start = (page - 1) * page_size + end = start + page_size + chunk = all_entries[start:end] + + return { + "entries": chunk, + "page": page, + "total_pages": total_pages, + "total_entries": total, + "has_more": page < total_pages, + } def get_sequence_diagram(tool_context: ToolContext) -> dict: @@ -124,9 +164,9 @@ def get_search_summary(tool_context: ToolContext) -> dict: The following data is available ON-DEMAND via tools (not loaded into context by default — call the tool only when needed): - get_raw_logs(service) — raw Mobius, SSE/MSE, WxCAS, or SDK logs - get_sequence_diagram() — PlantUML sequence diagram - get_search_summary() — search statistics (log counts, BFS depth, IDs) + get_raw_logs(service, page) — paginated raw logs (one chunk at a time) + get_sequence_diagram() — PlantUML sequence diagram + get_search_summary() — search statistics (log counts, BFS depth, IDs) ================================================================ RULE 0 — CONTEXT TRACKING (READ THIS FIRST) @@ -246,26 +286,27 @@ def get_search_summary(tool_context: ToolContext) -> dict: ── RAW LOG REQUESTS ("show logs", "give me the raw Mobius logs") ── -Call get_raw_logs(service) with the appropriate service name: - "mobius", "sse_mse", "wxcas", "sdk", or "all". -If the user doesn't specify which service, ask: - "Which logs? Mobius, SSE/MSE, WxCAS, or SDK?" 
-Return logs as received — preserve JSON, sort by @timestamp ascending. - -**Chunking large log output:** -When the logs returned by get_raw_logs are large (roughly more than -50 log entries or the output would exceed ~4000 characters), you MUST -split the output into sequential chunks instead of dumping everything -at once. Follow this pattern: - - 1. Tell the user the total count and that you will send in parts: - "Found **142 Mobius log entries**. Sending in chunks…" - 2. Send the first chunk (roughly 30–50 entries) in a JSON code block. - 3. End each chunk with: "**[Chunk 1/N]** — Reply 'next' or 'continue' - for the next batch, or 'stop' to end." - 4. On each follow-up, send the next chunk until all logs are delivered. - 5. If the user asks for ALL services at once, send one service at a - time (e.g. Mobius first, then SSE/MSE, etc.) with clear headers. +get_raw_logs is PAGINATED — it returns one chunk at a time, not all +logs at once. This keeps context small and responses fast. + + get_raw_logs(service, page, page_size=30) + service: "mobius", "sse_mse", "wxcas", "sdk", or "all" + page: 1-based page number (start with 1) + page_size: entries per page (default 30, max 50) + + Returns: { entries, page, total_pages, total_entries, has_more } + +**Usage pattern:** + 1. If user doesn't specify which service, ask: + "Which logs? Mobius, SSE/MSE, WxCAS, or SDK?" + 2. Call get_raw_logs(service="mobius", page=1) for the first chunk. + 3. Present the entries in a JSON code block. + 4. Report pagination: "**[Page 1/N]** (30 of 142 entries). + Reply 'next' for the next page, or 'stop' to end." + 5. When user says "next"/"continue", call with page=2, page=3, etc. + 6. If user asks for "all" services, call get_raw_logs("all", page=1) + first to get the count per service, then fetch one service at a + time starting with page=1. 
── DIAGRAM REQUESTS ("show diagram", "give PlantUML") ── diff --git a/agents/search_agent_v2/agent.py b/agents/search_agent_v2/agent.py index 508f0a1..6aa9870 100644 --- a/agents/search_agent_v2/agent.py +++ b/agents/search_agent_v2/agent.py @@ -1578,7 +1578,7 @@ async def _run_async_impl( analysis_queue: asyncio.Queue = asyncio.Queue() analysis_task = asyncio.create_task( - run_analysis_consumer(queue=analysis_queue, budget=budget) + run_analysis_consumer(queue=analysis_queue) ) logger.info(f"[{self.name}] Analysis consumer task started in background") From 8217dc26ff5ea9650a6bf870b9aa50f3548fbfb6 Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Thu, 12 Mar 2026 19:22:06 +0530 Subject: [PATCH 6/9] fix: handle session --- agents/analyze_agent_v2/incremental.py | 161 +++++++++++++------------ 1 file changed, 85 insertions(+), 76 deletions(-) diff --git a/agents/analyze_agent_v2/incremental.py b/agents/analyze_agent_v2/incremental.py index 2f6b02d..99089a9 100644 --- a/agents/analyze_agent_v2/incremental.py +++ b/agents/analyze_agent_v2/incremental.py @@ -3,7 +3,8 @@ MAP step invokes batch_analysis_agent (from agent.py) via ADK Runner, giving each batch the full power of calling_agent / contact_center_agent with skills and routing. -A fresh session is created per batch to prevent memory growth. +A single temporary ADK session is created per analysis run (not per batch) and +destroyed when the run finishes. 
Exports a clean function interface consumed by search_agent_v2: - new_rolling_analysis() → empty rolling state @@ -44,20 +45,6 @@ _APP_NAME = "log-analyzer-incremental" _USER_ID = "incremental-pipeline" -def _handle_tool_calls(tool_calls: list) -> list[dict]: - """Execute tool calls and return tool result messages.""" - results = [] - for tc in tool_calls: - name = tc.function.name - content = _load_skill_reference(name) - results.append({ - "role": "tool", - "tool_call_id": tc.id, - "content": content, - }) - logger.info(f"[_handle_tool_calls] Executed {name} -> {len(content)} chars") - return results - # ═══════════════════════════════════════════════════════════════════════════════ # Constants # ═══════════════════════════════════════════════════════════════════════════════ @@ -156,22 +143,22 @@ def _parse_json_from_llm(raw: Any) -> dict: async def map_batch( condensed_hits: list[dict], compact_memory: str, + session_id: str, ) -> dict: """MAP step: analyze one batch of log entries via ADK Runner. - Creates a fresh session per batch to prevent memory growth, sends the - batch + compact_memory as a user message, and collects the structured - JSON response from the batch_analysis_agent (which routes to - calling_agent or contact_center_agent with full skills). + Sends the batch + compact_memory as a user message to the shared session + and collects the structured JSON response from the batch_analysis_agent + (which routes to calling_agent or contact_center_agent with full skills). Args: condensed_hits: list of condensed log entries (from extract_id_fields_for_llm) compact_memory: the rolling_analysis["summary"] from prior batches (few KB) + session_id: the shared session ID for this analysis run Returns: - MapOutput dict matching the schema in _MAP_INSTRUCTION, or empty dict on failure. + MapOutput dict matching the JSON schema, or empty dict on failure. 
""" - api_key, api_base = _get_llm_config() batch_json = json.dumps(condensed_hits, default=str) user_content = _MAP_USER_TEMPLATE.format( @@ -179,15 +166,7 @@ async def map_batch( batch_json=batch_json, ) - session_id = f"batch-{uuid.uuid4().hex[:12]}" - try: - session = await _session_service.create_session( - app_name=_APP_NAME, - user_id=_USER_ID, - session_id=session_id, - ) - user_message = types.Content( role="user", parts=[types.Part.from_text(text=user_content)], @@ -196,7 +175,7 @@ async def map_batch( final_text = "" async for event in _runner.run_async( user_id=_USER_ID, - session_id=session.id, + session_id=session_id, new_message=user_message, ): if event.content and event.content.parts: @@ -207,7 +186,7 @@ async def map_batch( result = _parse_json_from_llm(final_text or "{}") logger.info( - f"[map_batch] ADK Runner result (session={session_id}): " + f"[map_batch] ADK Runner result: " f"events={len(result.get('events', []))}, " f"errors={len(result.get('errors', []))}, " f"state_updates={len(result.get('state_updates', []))}, " @@ -215,21 +194,9 @@ async def map_batch( ) return result - logger.warning("[map_batch] Exhausted tool rounds, returning last response") - return _parse_json_from_llm(response.choices[0].message.content or "{}") - except Exception as e: logger.error(f"[map_batch] ADK Runner call failed: {e}") return {} - finally: - try: - await _session_service.delete_session( - app_name=_APP_NAME, - user_id=_USER_ID, - session_id=session_id, - ) - except Exception: - pass @@ -642,6 +609,9 @@ async def run_analysis_consumer( them one-at-a-time with map_batch -> reduce, compressing the summary when it exceeds the token cap. + A single temporary ADK session is created for the entire analysis run and + destroyed when the consumer finishes (or on error). + Args: queue: asyncio.Queue fed by the search producer; items are list[dict] (condensed hits) or None (sentinel). 
@@ -653,32 +623,51 @@ async def run_analysis_consumer( rolling = new_rolling_analysis() evidence_index: list[dict] = [] - batch_num = 0 - while True: - item = await queue.get() - if item is SENTINEL: - queue.task_done() - logger.info("[analysis_consumer] Received sentinel, finishing analysis") - break + session_id = f"analysis-run-{uuid.uuid4().hex[:12]}" + await _session_service.create_session( + app_name=_APP_NAME, + user_id=_USER_ID, + session_id=session_id, + ) + logger.info(f"[analysis_consumer] Created shared session {session_id}") - batch_num += 1 - condensed_hits = item - logger.info( - f"[analysis_consumer] Processing batch {batch_num} " - f"({len(condensed_hits)} entries)" - ) + try: + batch_num = 0 + while True: + item = await queue.get() + if item is SENTINEL: + queue.task_done() + logger.info("[analysis_consumer] Received sentinel, finishing analysis") + break + + batch_num += 1 + condensed_hits = item + logger.info( + f"[analysis_consumer] Processing batch {batch_num} " + f"({len(condensed_hits)} entries)" + ) - compact_memory = rolling["summary"] + compact_memory = rolling["summary"] - map_output = await map_batch(condensed_hits, compact_memory) - if map_output: - rolling, evidence_index = reduce(rolling, map_output, evidence_index) + map_output = await map_batch(condensed_hits, compact_memory, session_id) + if map_output: + rolling, evidence_index = reduce(rolling, map_output, evidence_index) - summary_tokens = _estimate_tokens(rolling.get("summary", "")) - if summary_tokens > ROLLING_SUMMARY_TOKEN_CAP: - rolling = await compress_analysis_summary(rolling) + summary_tokens = _estimate_tokens(rolling.get("summary", "")) + if summary_tokens > ROLLING_SUMMARY_TOKEN_CAP: + rolling = await compress_analysis_summary(rolling) - queue.task_done() + queue.task_done() + finally: + try: + await _session_service.delete_session( + app_name=_APP_NAME, + user_id=_USER_ID, + session_id=session_id, + ) + logger.info(f"[analysis_consumer] Destroyed shared session 
{session_id}") + except Exception: + pass markdown = format_to_markdown(rolling, evidence_index, search_summary) logger.info( @@ -702,7 +691,8 @@ async def analyze_upload_only( """Analyze SDK logs that were uploaded directly (no OpenSearch search). Splits the raw log text into line-based batches and runs the same - map -> reduce -> compress pipeline. + map -> reduce -> compress pipeline. A single temporary ADK session is + created for the entire upload analysis and destroyed at the end. Args: sdk_logs: raw log text pasted or uploaded by the user. @@ -719,20 +709,39 @@ async def analyze_upload_only( rolling = new_rolling_analysis() evidence_index: list[dict] = [] - for start in range(0, len(lines), _UPLOAD_BATCH_SIZE): - batch_lines = lines[start : start + _UPLOAD_BATCH_SIZE] - condensed = [{"raw_line": line, "line_num": start + i + 1} - for i, line in enumerate(batch_lines)] + session_id = f"upload-run-{uuid.uuid4().hex[:12]}" + await _session_service.create_session( + app_name=_APP_NAME, + user_id=_USER_ID, + session_id=session_id, + ) + logger.info(f"[analyze_upload_only] Created shared session {session_id}") - compact_memory = rolling["summary"] - map_output = await map_batch(condensed, compact_memory) + try: + for start in range(0, len(lines), _UPLOAD_BATCH_SIZE): + batch_lines = lines[start : start + _UPLOAD_BATCH_SIZE] + condensed = [{"raw_line": line, "line_num": start + i + 1} + for i, line in enumerate(batch_lines)] + + compact_memory = rolling["summary"] + map_output = await map_batch(condensed, compact_memory, session_id) - if map_output: - rolling, evidence_index = reduce(rolling, map_output, evidence_index) + if map_output: + rolling, evidence_index = reduce(rolling, map_output, evidence_index) - summary_tokens = _estimate_tokens(rolling.get("summary", "")) - if summary_tokens > ROLLING_SUMMARY_TOKEN_CAP: - rolling = await compress_analysis_summary(rolling) + summary_tokens = _estimate_tokens(rolling.get("summary", "")) + if summary_tokens > 
ROLLING_SUMMARY_TOKEN_CAP: + rolling = await compress_analysis_summary(rolling) + finally: + try: + await _session_service.delete_session( + app_name=_APP_NAME, + user_id=_USER_ID, + session_id=session_id, + ) + logger.info(f"[analyze_upload_only] Destroyed shared session {session_id}") + except Exception: + pass markdown = format_to_markdown(rolling, evidence_index, search_summary="(SDK log upload)") logger.info( From c521d07cfc34ddc31bf4e407dcd6711dddaee435 Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Fri, 13 Mar 2026 11:53:33 +0530 Subject: [PATCH 7/9] fix: update sdk logs handler --- agents/analyze_agent_v2/incremental.py | 74 ++++++++++++++++++++++---- agents/search_agent_v2/agent.py | 5 +- 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/agents/analyze_agent_v2/incremental.py b/agents/analyze_agent_v2/incremental.py index 99089a9..3925a08 100644 --- a/agents/analyze_agent_v2/incremental.py +++ b/agents/analyze_agent_v2/incremental.py @@ -45,6 +45,13 @@ _APP_NAME = "log-analyzer-incremental" _USER_ID = "incremental-pipeline" +_session_service = InMemorySessionService() +_runner = Runner( + agent=batch_analysis_agent, + app_name=_APP_NAME, + session_service=_session_service, +) + # ═══════════════════════════════════════════════════════════════════════════════ # Constants # ═══════════════════════════════════════════════════════════════════════════════ @@ -601,6 +608,7 @@ def format_to_markdown( async def run_analysis_consumer( queue: "asyncio.Queue[list[dict] | None]", search_summary: str = "", + sdk_logs: str = "", ) -> tuple[str, dict, list[dict]]: """Consume condensed hit batches from an asyncio.Queue and run MAP-REDUCE. @@ -609,6 +617,10 @@ async def run_analysis_consumer( them one-at-a-time with map_batch -> reduce, compressing the summary when it exceeds the token cap. 
+ After all search batches are consumed, if sdk_logs is provided the consumer + chunks them and processes those batches too — building a unified + rolling_analysis covering both data sources. + A single temporary ADK session is created for the entire analysis run and destroyed when the consumer finishes (or on error). @@ -616,6 +628,7 @@ async def run_analysis_consumer( queue: asyncio.Queue fed by the search producer; items are list[dict] (condensed hits) or None (sentinel). search_summary: optional search_summary string for the final markdown. + sdk_logs: optional raw SDK log text to analyze after search batches. Returns: (markdown_report, rolling_analysis, evidence_index) @@ -633,17 +646,19 @@ async def run_analysis_consumer( try: batch_num = 0 + + # Phase 1: consume search batches from the queue while True: item = await queue.get() if item is SENTINEL: queue.task_done() - logger.info("[analysis_consumer] Received sentinel, finishing analysis") + logger.info("[analysis_consumer] Received sentinel, search batches done") break batch_num += 1 condensed_hits = item logger.info( - f"[analysis_consumer] Processing batch {batch_num} " + f"[analysis_consumer] Processing search batch {batch_num} " f"({len(condensed_hits)} entries)" ) @@ -658,6 +673,30 @@ async def run_analysis_consumer( rolling = await compress_analysis_summary(rolling) queue.task_done() + + # Phase 2: chunk and process SDK logs (if provided) + sdk_batches = chunk_sdk_logs(sdk_logs) + if sdk_batches: + logger.info( + f"[analysis_consumer] Processing {len(sdk_batches)} SDK log batches " + f"({sum(len(b) for b in sdk_batches)} lines)" + ) + for condensed in sdk_batches: + batch_num += 1 + logger.info( + f"[analysis_consumer] Processing SDK batch {batch_num} " + f"({len(condensed)} entries)" + ) + + compact_memory = rolling["summary"] + map_output = await map_batch(condensed, compact_memory, session_id) + if map_output: + rolling, evidence_index = reduce(rolling, map_output, evidence_index) + + 
summary_tokens = _estimate_tokens(rolling.get("summary", "")) + if summary_tokens > ROLLING_SUMMARY_TOKEN_CAP: + rolling = await compress_analysis_summary(rolling) + finally: try: await _session_service.delete_session( @@ -685,6 +724,25 @@ async def run_analysis_consumer( _UPLOAD_BATCH_SIZE = 200 +def chunk_sdk_logs(sdk_logs: str, batch_size: int = _UPLOAD_BATCH_SIZE) -> list[list[dict]]: + """Split raw SDK log text into batches of condensed dicts. + + Each dict has {"raw_line": , "line_num": <1-based line number>}. + Returns an empty list if sdk_logs is blank. + """ + if not sdk_logs or not sdk_logs.strip(): + return [] + lines = sdk_logs.strip().splitlines() + batches: list[list[dict]] = [] + for start in range(0, len(lines), batch_size): + batch_lines = lines[start : start + batch_size] + batches.append([ + {"raw_line": line, "line_num": start + i + 1} + for i, line in enumerate(batch_lines) + ]) + return batches + + async def analyze_upload_only( sdk_logs: str, ) -> tuple[str, dict, list[dict]]: @@ -700,11 +758,11 @@ async def analyze_upload_only( Returns: (markdown_report, rolling_analysis, evidence_index) """ - if not sdk_logs or not sdk_logs.strip(): + batches = chunk_sdk_logs(sdk_logs) + if not batches: return "(No SDK logs provided)", new_rolling_analysis(), [] - lines = sdk_logs.strip().splitlines() - logger.info(f"[analyze_upload_only] Processing {len(lines)} lines of SDK logs") + logger.info(f"[analyze_upload_only] Processing {sum(len(b) for b in batches)} lines in {len(batches)} batches") rolling = new_rolling_analysis() evidence_index: list[dict] = [] @@ -718,11 +776,7 @@ async def analyze_upload_only( logger.info(f"[analyze_upload_only] Created shared session {session_id}") try: - for start in range(0, len(lines), _UPLOAD_BATCH_SIZE): - batch_lines = lines[start : start + _UPLOAD_BATCH_SIZE] - condensed = [{"raw_line": line, "line_num": start + i + 1} - for i, line in enumerate(batch_lines)] - + for condensed in batches: compact_memory = 
rolling["summary"] map_output = await map_batch(condensed, compact_memory, session_id) diff --git a/agents/search_agent_v2/agent.py b/agents/search_agent_v2/agent.py index 6aa9870..33dbdb0 100644 --- a/agents/search_agent_v2/agent.py +++ b/agents/search_agent_v2/agent.py @@ -1578,7 +1578,10 @@ async def _run_async_impl( analysis_queue: asyncio.Queue = asyncio.Queue() analysis_task = asyncio.create_task( - run_analysis_consumer(queue=analysis_queue) + run_analysis_consumer( + queue=analysis_queue, + sdk_logs=ctx.session.state.get("sdk_logs", ""), + ) ) logger.info(f"[{self.name}] Analysis consumer task started in background") From 91f603f5f2e24094d2166c3a96a584122ca3d5c4 Mon Sep 17 00:00:00 2001 From: Ritesh Singh <133033102+riteshfyi@users.noreply.github.com> Date: Mon, 16 Mar 2026 00:42:50 +0530 Subject: [PATCH 8/9] fix: clean-up --- incremental_map-reduce_analysis.md | 268 ----------------------------- 1 file changed, 268 deletions(-) delete mode 100644 incremental_map-reduce_analysis.md diff --git a/incremental_map-reduce_analysis.md b/incremental_map-reduce_analysis.md deleted file mode 100644 index bef52bf..0000000 --- a/incremental_map-reduce_analysis.md +++ /dev/null @@ -1,268 +0,0 @@ ---- -name: Incremental Map-Reduce Analysis -overview: Add incremental map-reduce analysis alongside the existing search pipeline using a producer-consumer pattern. Search and analysis run in parallel via asyncio.Queue. All analysis logic lives in a new analyze_agent_v2/incremental.py. Existing search_agent_v2 code is kept untouched. 
-todos: - - id: analysis-interface - content: "Create the incremental analysis API in analyze_agent_v2/incremental.py: map_batch(), reduce(), compress_analysis_summary(), format_to_markdown(), new_rolling_analysis(), and analyze_upload_only()" - status: pending - - id: integrate-search - content: "Add producer-consumer pipeline to search_agent_v2: import incremental API, add analysis_consumer coroutine, wrap existing BFS search+dedup+extract in search_producer, connect via asyncio.Queue, run both with asyncio.gather, format and store analyze_results in Step 4" - status: pending - - id: update-pipeline - content: Remove analyze_agent from root_agent_v2 SequentialAgent; update query_router upload-only flow to call analyze_upload_only() - status: pending - - id: verify-downstream - content: Verify chat_agent, sequence_diagram_agent, and _log_cache still work with the new analyze_results format - status: pending -isProject: false ---- - -# Incremental Map-Reduce Analysis (Decoupled, Pipelined) - -## Principle: Do NOT remove existing search agent code - -All existing functions in `search_agent_v2/agent.py` stay untouched: - -- `_summarize_hits()` -- kept as-is -- `_SUMMARIZER_INSTRUCTION` -- kept as-is -- `_COMPRESS_INSTRUCTION` -- kept as-is -- `_extract_ids_from_batch()` -- kept as-is -- `_process_hits_progressive()` -- kept as-is (search producer calls it unchanged) -- `TokenBudget` -- kept as-is, reused by new analysis code -- BFS loop structure -- wrapped in a producer coroutine, not rewritten - -New analysis logic is **added alongside** as a parallel consumer, not substituted in. 
- -## Current Architecture - -```mermaid -flowchart LR - Search["search_agent_v2\n(progressive pages)"] -->|"writes ALL raw logs"| State[(Session State)] - State -->|"reads ALL raw logs"| Analyze["analyze_agent_v2\n(monolithic LLM call)"] - Analyze -->|"analyze_results"| SeqDiag["sequence_diagram_agent"] -``` - - - -Problem: The analyze agent receives all raw logs at once (100k+ tokens), hitting context limits and producing shallow analysis on large log sets. - -## Target Architecture (Producer-Consumer Pipeline) - -```mermaid -flowchart LR - subgraph Producer ["Search Producer (existing code, untouched)"] - Fetch["Fetch page"] --> Dedup["Dedup + condense"] - Dedup --> Gather["asyncio.gather"] - Gather --> IDExtract["_extract_ids_from_batch()"] - Gather --> Summarize["_summarize_hits()"] - Summarize --> Push["Push condensed hits to Queue"] - end - - Push --> Queue["asyncio.Queue\n(maxsize=4)"] - - subgraph Consumer ["Analysis Consumer (NEW, runs in parallel)"] - Pull["Pull from Queue"] --> MapFn["map_batch()"] - MapFn --> ReduceFn["reduce()"] - ReduceFn --> Compress{"summary > 4k?"} - Compress -->|yes| CompressFn["compress_analysis_summary()"] - Compress -->|no| Pull - CompressFn --> Pull - end - - Consumer -->|"final rolling_analysis"| FormatFn["format_to_markdown()"] - FormatFn -->|"markdown"| FinalState["Session State: analyze_results"] -``` - - - -**Timeline showing parallelism:** - -``` -Search producer: [fetch p1] [dedup+IDs p1] [fetch p2] [dedup+IDs p2] [fetch p3] [dedup+IDs p3] → done - ↓ push ↓ push ↓ push -Queue: [p1] [p2] [p3] - ↓ pull ↓ pull ↓ pull -Analysis consumer: [map p1] [reduce] [map p2] [reduce] [map p3] [reduce] → done -``` - -Search keeps fetching pages without waiting for analysis. Analysis processes pages as they arrive from the queue. 
- -**Separation of concerns:** - -- `search_agent_v2` owns: OpenSearch fetch, pagination, dedup, BFS frontier, ID extraction, text summarization (all existing, untouched) -- `analyze_agent_v2/incremental.py` owns: structured analysis map instruction, reduce merge logic, analysis compression, markdown formatting, output schema (all new) -- The only coupling is: (1) the function interface imported by search_agent_v2, (2) the `asyncio.Queue` connecting producer and consumer - -## The Interface (analyze_agent_v2/incremental.py) - -```python -def new_rolling_analysis() -> dict: - """Factory: returns an empty rolling_analysis structure.""" - -async def map_batch( - condensed_hits: list[dict], - compact_memory: str, - budget: TokenBudget, -) -> dict: - """MAP step: analyze one batch of log entries via LLM. - Input: condensed hits + prior compact memory (few KB). - Output: MapOutput dict (structured JSON).""" - -def reduce( - rolling: dict, - map_output: dict, - evidence_index: list[dict], -) -> tuple[dict, list[dict]]: - """REDUCE step: merge map output into rolling analysis (pure Python). - Returns (updated rolling_analysis, updated evidence_index).""" - -async def compress_analysis_summary( - rolling: dict, - budget: TokenBudget, -) -> dict: - """Compress rolling_analysis.summary when it exceeds 4k tokens.""" - -def format_to_markdown( - rolling: dict, - evidence_index: list[dict], - search_summary: dict, - detailed_analysis: bool, -) -> str: - """Convert final rolling_analysis into the markdown output structure.""" - -async def analyze_upload_only( - sdk_logs: str, - budget: TokenBudget, -) -> str: - """Single-pass analysis for upload-only flow (no BFS pagination).""" - -async def run_analysis_consumer( - queue: asyncio.Queue, - budget: TokenBudget, -) -> tuple[dict, list[dict]]: - """Consumer coroutine: pulls condensed batches from queue, - runs map_batch + reduce in a loop until sentinel (None) is received. 
- Returns (final rolling_analysis, evidence_index).""" -``` - -## Map Output JSON Schema - -Each batch produces this structured output: - -```json -{ - "new_identifiers": { - "session_ids": [], "call_ids": [], "tracking_ids": [], - "user_ids": [], "device_ids": [] - }, - "events": [ - {"timestamp": "...", "type": "HTTP|SIP|media|routing", - "source": "...", "destination": "...", "detail": "..."} - ], - "errors": [ - {"timestamp": "...", "code": "...", "service": "...", - "message": "...", "suspected_cause": "..."} - ], - "state_updates": [ - {"timestamp": "...", "transition": "...", "from_state": "...", "to_state": "..."} - ], - "evidence_refs": [ - {"doc_id": "...", "index": "...", "timestamp": "...", - "category": "mobius|sse_mse|wxcas", "relevance": "..."} - ], - "delta_summary": "Short text summarizing what this batch revealed" -} -``` - -## Rolling Analysis Structure - -The reducer merges each map output into this object: - -```json -{ - "identifiers": { "session_ids": [], "call_ids": [], ... }, - "timeline": [ - {"timestamp": "...", "type": "...", "detail": "..."} - ], - "errors": [ - {"timestamp": "...", "code": "...", "service": "...", - "message": "...", "suspected_cause": "..."} - ], - "state_machine": [ - {"timestamp": "...", "transition": "...", "from_state": "...", "to_state": "..."} - ], - "cross_service_correlations": [ ... ], - "summary": "Running narrative <=4k tokens", - "evidence_count": 42, - "batch_count": 5 -} -``` - -Rules: - -- `summary` capped at ~4k tokens; when exceeded, compress via `compress_analysis_summary()` LLM call -- `identifiers` are deduplicated sets -- `timeline` keeps only milestone events (prune low-value entries when list exceeds ~50) -- `errors` are always preserved (never compressed away) -- `evidence_refs` stored separately in `evidence_index` list (unbounded in storage, only `evidence_count` in the rolling object) - -## Files to Change - -### 1. NEW: `agents/analyze_agent_v2/incremental.py` - -The core new file. 
Contains ALL incremental analysis logic: - -- `_MAP_INSTRUCTION` -- LLM system prompt for the map step. Incorporates domain knowledge from the existing `_ANALYSIS_POINTS` in agent.py (HTTP, SIP, media, timing, errors, cross-service correlation). Receives one batch + compact memory, outputs the structured JSON schema above. -- `_ANALYSIS_COMPRESS_INSTRUCTION` -- LLM prompt for compressing the rolling analysis summary (separate from the search agent's existing `_COMPRESS_INSTRUCTION` which stays in search_agent_v2) -- `new_rolling_analysis()` -- factory that returns an empty structure. -- `map_batch()` -- async, calls LLM via litellm, parses JSON output. Budget-aware (accepts `TokenBudget`). -- `reduce()` -- pure Python merge. Dedup identifiers, append events/errors/state_updates, move evidence_refs to evidence_index, append delta_summary to rolling summary. -- `compress_analysis_summary()` -- async, calls LLM when summary exceeds ~4k tokens. -- `format_to_markdown()` -- converts rolling_analysis dict into the markdown structure matching the current `_OUTPUT_STRUCTURE` sections (Root Cause, Identifiers, Search Scope, Timing, Final Outcome, HTTP/SIP flows). -- `run_analysis_consumer()` -- the consumer coroutine. Pulls batches from the queue, runs map + reduce loop, handles compression. Returns final `(rolling_analysis, evidence_index)`. -- `analyze_upload_only()` -- single-pass LLM analysis for SDK-only uploads (no BFS pagination needed). - -### 2. MODIFY: `agents/search_agent_v2/agent.py` (additive only) - -No existing code removed. Only additions: - -- **Add import** at top: `from analyze_agent_v2.incremental import run_analysis_consumer, new_rolling_analysis, format_to_markdown as format_analysis_to_markdown` -- **Add `analysis_queue`**: create `asyncio.Queue(maxsize=4)` at the start of `_run_async_impl`, before the BFS loop. 
-- **Add queue push in `_process_hits_progressive()`**: after the existing `asyncio.gather(_extract_ids_from_batch, _summarize_hits)` completes, push the `condensed` hits to the analysis queue. This is a single `await analysis_queue.put(...)` line added after the gather. The existing function body is untouched otherwise. -- **Wrap BFS loop as producer**: the existing BFS loop becomes the body of a `search_producer` inner async function. After the loop ends, push `None` sentinel to signal the consumer to stop. -- **Run producer + consumer in parallel**: `asyncio.gather(search_producer(), run_analysis_consumer(analysis_queue, budget))` replaces the direct BFS loop call. -- **Extend Step 4**: after gather completes, retrieve `(rolling_analysis, evidence_index)` from the consumer result. Call `format_analysis_to_markdown(rolling_analysis, evidence_index, ...)` and store as `analyze_results`. Store `evidence_index` as JSON in session state. All existing Step 4 writes (`chunk_summaries`, `chunk_analysis_summary`, raw logs, etc.) remain unchanged. - -### 3. NO CHANGE: `agents/analyze_agent_v2/agent.py` - -Kept entirely as-is. The monolithic `calling_agent`, `contact_center_agent`, coordinator `analyze_agent`, all instruction strings, skill toolsets -- all remain. The incremental analysis in `incremental.py` is a parallel path, not a replacement. - -### 4. MODIFY: `agents/root_agent_v2/agent.py` - -- Remove `analyze_agent` from the `SequentialAgent`: change `[search_agent, analyze_agent, sequence_diagram_agent]` to `[search_agent, sequence_diagram_agent]` since analysis now happens incrementally during search. -- The monolithic analyze_agent code stays in `agent.py` but is no longer wired into the pipeline. - -### 5. MODIFY: `agents/query_router/agent.py` - -- Upload-only path: replace `analyze_agent.run_async(ctx)` with a call to `analyze_upload_only(sdk_logs, budget)` from `incremental.py`, store result in `ctx.session.state["analyze_results"]` - -### 6. 
NO CHANGE: `agents/chat_agent/agent.py` - -- Reads `{analyze_results}` -- unchanged (now written by search_agent_v2 via format_to_markdown) -- `_log_cache` import and fallback -- unchanged -- Skill toolset imports from analyze_agent_v2 -- unchanged - -## Key Design Decisions - -- **Producer-consumer parallelism**: Search (producer) and analysis (consumer) run concurrently via `asyncio.gather`. Connected by `asyncio.Queue(maxsize=4)` with backpressure. -- **Additive, not destructive**: All existing search agent functions (`_summarize_hits`, `_SUMMARIZER_INSTRUCTION`, `_COMPRESS_INSTRUCTION`, `TokenBudget`, `_process_hits_progressive`, etc.) remain untouched. Only additions: queue creation, one `queue.put()` line after gather, producer/consumer wiring. -- **Search never blocked by analysis**: search continues fetching + extracting IDs while analysis processes prior pages in the background. -- **Analysis still sequential internally**: reduce must happen in order (page 1 before page 2), but this is handled naturally by the queue's FIFO ordering. -- **Clean interface boundary**: search_agent_v2 imports `run_analysis_consumer` + `format_to_markdown` from `incremental.py`. No analysis logic in search code. -- **Compact memory**: map step receives only `rolling_analysis["summary"]` (few KB) as prior context. -- **Evidence index**: stored separately (unbounded in storage). Only `evidence_count` in the rolling object. -- **4k token cap on analysis summary**: enforced by `reduce()` calling `compress_analysis_summary()`. -- **Output structure**: `format_to_markdown()` produces the same markdown sections as `_OUTPUT_STRUCTURE` for downstream compatibility. -- **analyze_agent_v2/agent.py kept intact**: monolithic agent code remains available but is unwired from the pipeline. Can be re-wired if needed. -- **Backpressure**: `maxsize=4` on the queue means if analysis falls 4+ pages behind, search will `await` on `queue.put()` until a slot opens. Prevents unbounded memory growth. 
- From cca510bd7824979d0c6960b349e06d4b3121d63e Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Tue, 17 Mar 2026 09:14:51 +0530 Subject: [PATCH 9/9] fix: improve chat markdown --- agents/analyze_agent_v2/incremental.py | 45 +++++++------------ agents/chat_agent/agent.py | 8 +++- .../components/analysis-view.tsx | 26 +++++++---- .../components/chat-panel.tsx | 37 +++++++++------ .../components/chat-view.tsx | 26 +++++++---- 5 files changed, 81 insertions(+), 61 deletions(-) diff --git a/agents/analyze_agent_v2/incremental.py b/agents/analyze_agent_v2/incremental.py index 3925a08..5bebc45 100644 --- a/agents/analyze_agent_v2/incremental.py +++ b/agents/analyze_agent_v2/incremental.py @@ -419,7 +419,6 @@ def format_to_markdown( summary = rolling.get("summary", "") # ── Root Cause Analysis ── - lines.append("---") lines.append("### Root Cause Analysis\n") if not errors: lines.append( @@ -436,22 +435,21 @@ def format_to_markdown( fix = err.get("suggested_fix", "") impact = err.get("impact", "") - lines.append(f"**{i}. [{ts}]** — `{code}`\n") + lines.append(f"{i}. 
[{ts}] — `{code}`\n") lines.append("| Field | Detail |") lines.append("|-------|--------|") - lines.append(f"| **Service** | {svc} |") - lines.append(f"| **Description** | {msg} |") + lines.append(f"| Service | {svc} |") + lines.append(f"| Description | {msg} |") if ctx: - lines.append(f"| **Context** | {ctx} |") - lines.append(f"| **Root Cause** | {cause} |") + lines.append(f"| Context | {ctx} |") + lines.append(f"| Root Cause | {cause} |") if fix: - lines.append(f"| **Suggested Fix** | {fix} |") + lines.append(f"| Suggested Fix | {fix} |") if impact: - lines.append(f"| **Impact** | {impact} |") + lines.append(f"| Impact | {impact} |") lines.append("") # ── Extracted Identifiers ── - lines.append("---") lines.append("### Extracted Identifiers\n") label_map = { "tracking_ids": "Tracking ID", @@ -468,20 +466,18 @@ def format_to_markdown( vals = ids.get(key, []) if vals: has_any_id = True - lines.append(f"- **{label}**: `{'`, `'.join(vals)}`") + lines.append(f"- {label}: `{'`, `'.join(vals)}`") if not has_any_id: lines.append("No identifiers extracted.") lines.append("") # ── Search Scope ── if search_summary: - lines.append("---") lines.append("### Search Scope\n") lines.append(search_summary) lines.append("") # ── Cross-Service Correlation ── - lines.append("---") lines.append("### Cross-Service Correlation\n") corrs = rolling.get("cross_service_correlations", []) if corrs: @@ -495,7 +491,6 @@ def format_to_markdown( lines.append("") # ── Timing Analysis ── - lines.append("---") lines.append("### Timing Analysis\n") if timeline: first_ts = timeline[0].get("timestamp", "") @@ -506,21 +501,20 @@ def format_to_markdown( lines.append("| Metric | Value |") lines.append("|--------|-------|") - lines.append(f"| **First event** | {first_ts} |") - lines.append(f"| **Last event** | {last_ts} |") - lines.append(f"| **Total events** | {len(timeline)} |") + lines.append(f"| First event | {first_ts} |") + lines.append(f"| Last event | {last_ts} |") + lines.append(f"| Total 
events | {len(timeline)} |") if http_evts: - lines.append(f"| **HTTP requests** | {len(http_evts)} |") + lines.append(f"| HTTP requests | {len(http_evts)} |") if sip_evts: - lines.append(f"| **SIP messages** | {len(sip_evts)} |") + lines.append(f"| SIP messages | {len(sip_evts)} |") if error_evts: - lines.append(f"| **Error events** | {len(error_evts)} |") + lines.append(f"| Error events | {len(error_evts)} |") else: lines.append("No timeline events captured.") lines.append("") # ── Final Outcome ── - lines.append("---") lines.append("### Final Outcome\n") if summary: lines.append(summary) @@ -535,29 +529,26 @@ def format_to_markdown( other_evts = [e for e in timeline if e.get("type") not in ("HTTP", "SIP")] if http_evts: - lines.append("---") lines.append(f"### HTTP Communication Flow ({len(http_evts)} requests)\n") for ev in http_evts: ts = ev.get("timestamp", "?") src = ev.get("source", "?") dst = ev.get("destination", "?") detail = ev.get("detail", "") - lines.append(f"- **[{ts}]** {src} \u2192 {dst}: {detail}") + lines.append(f"- [{ts}] {src} \u2192 {dst}: {detail}") lines.append("") if sip_evts: - lines.append("---") lines.append(f"### SIP Communication Flow ({len(sip_evts)} messages)\n") for ev in sip_evts: ts = ev.get("timestamp", "?") src = ev.get("source", "?") dst = ev.get("destination", "?") detail = ev.get("detail", "") - lines.append(f"- **[{ts}]** {src} \u2192 {dst}: {detail}") + lines.append(f"- [{ts}] {src} \u2192 {dst}: {detail}") lines.append("") if other_evts: - lines.append("---") lines.append(f"### Other Events ({len(other_evts)})\n") for ev in other_evts: ts = ev.get("timestamp", "?") @@ -565,12 +556,11 @@ def format_to_markdown( src = ev.get("source", "?") dst = ev.get("destination", "?") detail = ev.get("detail", "") - lines.append(f"- **[{ts}]** `{etype}` {src} \u2192 {dst}: {detail}") + lines.append(f"- [{ts}] `{etype}` {src} \u2192 {dst}: {detail}") lines.append("") # ── Evidence References ── if evidence_index: - lines.append("---") 
lines.append(f"### Evidence Index ({len(evidence_index)} references)\n") display_refs = evidence_index[:25] lines.append("| # | Doc ID | Index | Category | Timestamp | Relevance |") @@ -587,7 +577,6 @@ def format_to_markdown( lines.append("") # ── Stats ── - lines.append("---") lines.append( f"*Analysis: {rolling.get('batch_count', 0)} batches processed, " f"{len(errors)} errors found, " diff --git a/agents/chat_agent/agent.py b/agents/chat_agent/agent.py index caa5eb9..feff7a1 100644 --- a/agents/chat_agent/agent.py +++ b/agents/chat_agent/agent.py @@ -250,12 +250,16 @@ def get_search_summary(tool_context: ToolContext) -> dict: - Be concise. Lead with the direct answer. Expand only if asked. - Always cite exact timestamps and identifiers: - "At **06:58:18.075Z**, **Mobius** sent **SIP 480** + "At 06:58:18.075Z, Mobius sent SIP 480 (Call-ID: SSE065806...)." Never say: "later in the logs", "around that time". -- Use markdown: bold for services/IDs, bullet lists for clarity. +- Use markdown sparingly: bullet lists for clarity, bold only for + section headings or a single critical keyword per sentence. + Do NOT bold timestamps, service names, IDs, or status codes inline. + Overuse of bold makes the output hard to read. - Engineers prefer precision over explanation. Facts first. - Professional tone. No fluff, no storytelling, no emojis. +- Never output bare bullet markers (- or *) on otherwise empty lines. ================================================================ HANDLING SPECIFIC REQUEST TYPES diff --git a/log-analyzer-frontend/components/analysis-view.tsx b/log-analyzer-frontend/components/analysis-view.tsx index c4b2501..1569b06 100644 --- a/log-analyzer-frontend/components/analysis-view.tsx +++ b/log-analyzer-frontend/components/analysis-view.tsx @@ -15,16 +15,24 @@ export function AnalysisView({ analysis }: AnalysisViewProps) { return ( -
+

{children}

, - h2: ({ children }) =>

{children}

, - h3: ({ children }) =>

{children}

, - p: ({ children }) =>

{children}

, - ul: ({ children }) =>
    {children}
, - ol: ({ children }) =>
    {children}
, - li: ({ children }) =>
  • {children}
  • , + h1: ({ children }) =>

    {children}

    , + h2: ({ children }) =>

    {children}

    , + h3: ({ children }) =>

    {children}

    , + p: ({ children }) => { + if (!children || (Array.isArray(children) && children.every((c: any) => c === null || c === undefined || c === ""))) return null + return

    {children}

    + }, + ul: ({ children }) =>
      {children}
    , + ol: ({ children }) =>
      {children}
    , + li: ({ children }) => { + if (!children || (typeof children === "string" && !(children as string).trim())) return null + return
  • {children}
  • + }, + hr: () =>
    , + strong: ({ children }) => {children}, code: ({ children }) => ( {children} ), @@ -40,7 +48,7 @@ export function AnalysisView({ analysis }: AnalysisViewProps) { ), }} > - {analysis} + {analysis.replace(/^[-*]\s*$/gm, "").replace(/\n{3,}/g, "\n\n")}
    diff --git a/log-analyzer-frontend/components/chat-panel.tsx b/log-analyzer-frontend/components/chat-panel.tsx index 0b1cd7f..537c0c3 100644 --- a/log-analyzer-frontend/components/chat-panel.tsx +++ b/log-analyzer-frontend/components/chat-panel.tsx @@ -7,6 +7,12 @@ import { Input } from "@/components/ui/input" import { Send, Square, Bot, User, Loader2 } from "lucide-react" import ReactMarkdown from "react-markdown" +function cleanMarkdown(md: string): string { + return md + .replace(/^[-*]\s*$/gm, "") // remove bullet-only lines (no content) + .replace(/\n{3,}/g, "\n\n") // collapse excessive blank lines +} + export interface ChatMessage { id: string role: "user" | "assistant" @@ -23,24 +29,29 @@ interface ChatPanelProps { const markdownComponents = { h1: ({ children }: any) => ( -

    {children}

    +

    {children}

    ), h2: ({ children }: any) => ( -

    {children}

    +

    {children}

    ), h3: ({ children }: any) => ( -

    {children}

    - ), - p: ({ children }: any) => ( -

    {children}

    +

    {children}

    ), + p: ({ children }: any) => { + if (!children || (Array.isArray(children) && children.every((c: any) => c === null || c === undefined || c === ""))) return null + return

    {children}

    + }, ul: ({ children }: any) => ( -
      {children}
    +
      {children}
    ), ol: ({ children }: any) => ( -
      {children}
    +
      {children}
    ), - li: ({ children }: any) =>
  • {children}
  • , + li: ({ children }: any) => { + if (!children || (typeof children === "string" && !children.trim())) return null + return
  • {children}
  • + }, + hr: () =>
    , code: ({ children, className }: any) => { const isBlock = className?.includes("language-") if (isBlock) { @@ -65,7 +76,7 @@ const markdownComponents = { ), strong: ({ children }: any) => ( - {children} + {children} ), table: ({ children }: any) => (
    @@ -73,7 +84,7 @@ const markdownComponents = {
    ), th: ({ children }: any) => ( - + {children} ), @@ -171,9 +182,9 @@ export function ChatPanel({ messages, loading, chatDisabled, onSendMessage, onSt {msg.content}

    ) : ( -
    +
    - {msg.content} + {cleanMarkdown(msg.content)}
    )} diff --git a/log-analyzer-frontend/components/chat-view.tsx b/log-analyzer-frontend/components/chat-view.tsx index e17f8ab..6f97f18 100644 --- a/log-analyzer-frontend/components/chat-view.tsx +++ b/log-analyzer-frontend/components/chat-view.tsx @@ -15,16 +15,24 @@ export function ChatView({ chatResponse }: ChatViewProps) { return ( -
    +

    {children}

    , - h2: ({ children }) =>

    {children}

    , - h3: ({ children }) =>

    {children}

    , - p: ({ children }) =>

    {children}

    , - ul: ({ children }) =>
      {children}
    , - ol: ({ children }) =>
      {children}
    , - li: ({ children }) =>
  • {children}
  • , + h1: ({ children }) =>

    {children}

    , + h2: ({ children }) =>

    {children}

    , + h3: ({ children }) =>

    {children}

    , + p: ({ children }) => { + if (!children || (Array.isArray(children) && children.every((c: any) => c === null || c === undefined || c === ""))) return null + return

    {children}

    + }, + ul: ({ children }) =>
      {children}
    , + ol: ({ children }) =>
      {children}
    , + li: ({ children }) => { + if (!children || (typeof children === "string" && !(children as string).trim())) return null + return
  • {children}
  • + }, + hr: () =>
    , + strong: ({ children }) => {children}, code: ({ children }) => ( {children} ), @@ -40,7 +48,7 @@ export function ChatView({ chatResponse }: ChatViewProps) { ), }} > - {chatResponse} + {chatResponse.replace(/^[-*]\s*$/gm, "").replace(/\n{3,}/g, "\n\n")}