diff --git a/aieng-eval-agents/aieng/agent_evals/configs.py b/aieng-eval-agents/aieng/agent_evals/configs.py index 390847b..61f7542 100644 --- a/aieng-eval-agents/aieng/agent_evals/configs.py +++ b/aieng-eval-agents/aieng/agent_evals/configs.py @@ -96,11 +96,6 @@ class Configs(BaseSettings): validation_alias=AliasChoices("OPENAI_API_KEY", "GEMINI_API_KEY", "GOOGLE_API_KEY"), description="API key for OpenAI-compatible API (accepts OPENAI_API_KEY, GEMINI_API_KEY, or GOOGLE_API_KEY).", ) - gemini_api_key: SecretStr = Field( - default=SecretStr("default-gemini-api-key"), # setting a default so some implementations can run without it - validation_alias=AliasChoices("GEMINI_API_KEY", "GOOGLE_API_KEY"), - description="API key for Google/Gemini API (accepts GEMINI_API_KEY, or GOOGLE_API_KEY).", - ) default_planner_model: str = Field( default="gemini-2.5-pro", description="Model name for planning/complex reasoning tasks.", diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py index 0c02325..bc9ba87 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py @@ -7,6 +7,7 @@ import asyncio import logging +import os import time import uuid import warnings @@ -235,6 +236,10 @@ def __init__( if thinking_budget > 0 and self._supports_thinking(self.model): thinking_config = types.ThinkingConfig(thinking_budget=thinking_budget) + # Google ADK reads GOOGLE_API_KEY from the environment directly. + # Bridge from OPENAI_API_KEY (or GEMINI_API_KEY) if not already set. + os.environ.setdefault("GOOGLE_API_KEY", config.openai_api_key.get_secret_value()) + self._agent = Agent( name="knowledge_qa", model=self.model, @@ -345,6 +350,11 @@ def reset(self) -> None: ) logger.debug("Agent state reset for new question") + @property + def adk_agent(self) -> Agent: + """Return the underlying ADK agent, e.g. 
for use with ``adk web``.""" + return self._agent + @property def current_plan(self) -> ResearchPlan | None: """Get the current research plan if one exists.""" diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py new file mode 100644 index 0000000..627f173 --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py @@ -0,0 +1,339 @@ +"""Notebook display utilities for the Knowledge Agent. + +Provides live progress display for Jupyter notebooks, showing plan status +and tool calls while the agent works, and formatted rendering of agent responses. + +Example +------- +>>> from aieng.agent_evals.knowledge_qa import KnowledgeGroundedAgent +>>> from aieng.agent_evals.knowledge_qa.notebook import ( +... display_response, +... run_with_display, +... ) +>>> agent = KnowledgeGroundedAgent(enable_planning=True) +>>> response = await run_with_display(agent, "What is quantum computing?") +>>> display_response(console, response.text) +""" + +import asyncio +import logging +import re +from typing import TYPE_CHECKING + +from IPython.display import HTML, clear_output, display +from rich.console import Console +from rich.markdown import Markdown +from rich.panel import Panel + +from .plan_parsing import StepStatus + + +if TYPE_CHECKING: + from .agent import AgentResponse, KnowledgeGroundedAgent + from .plan_parsing import ResearchPlan + + +class ToolCallCapture(logging.Handler): + """Captures tool calls from agent logs for display.""" + + def __init__(self): + super().__init__() + self.tool_calls: list[dict] = [] + + def emit(self, record): + """Capture tool call and response log messages.""" + msg = record.getMessage() + if "Tool call:" in msg: + try: + parts = msg.split("Tool call: ", 1)[1] + paren_idx = parts.find("(") + if paren_idx > 0: + tool_name = parts[:paren_idx] + args_str = parts[paren_idx + 1 : -1] + if len(args_str) > 60: + args_str = args_str[:57] + "..." 
+ self.tool_calls.append({"name": tool_name, "args": args_str, "completed": False}) + except Exception: + pass + elif "Tool response:" in msg: + try: + parts = msg.split("Tool response: ", 1)[1] + tool_name = parts.split(" ")[0] + for tc in reversed(self.tool_calls): + if tc["name"] == tool_name and not tc["completed"]: + tc["completed"] = True + break + except Exception: + pass + + +def _format_plan_html(plan: "ResearchPlan") -> str: + """Format the research plan as HTML.""" + lines = ['
'] + lines.append('
๐Ÿ“‹ Research Plan
') + + for step in plan.steps: + if step.status == StepStatus.COMPLETED: + icon, color = "โœ“", "#28a745" + elif step.status == StepStatus.FAILED: + icon, color = "โœ—", "#dc3545" + elif step.status == StepStatus.IN_PROGRESS: + icon, color = "โ†’", "#ffc107" + elif step.status == StepStatus.SKIPPED: + icon, color = "โ—‹", "#6c757d" + else: + icon, color = "โ—‹", "#adb5bd" + + lines.append(f'
{icon} {step.step_id}. {step.description}
') + + lines.append("
") + return "\n".join(lines) + + +def _format_tools_html(tool_calls: list[dict]) -> str: + """Format tool calls as HTML.""" + if not tool_calls: + return '
Waiting for tool calls...
' + + lines = [ + '
' + ] + lines.append(f'
๐Ÿ”ง Tool Calls ({len(tool_calls)})
') + + # Show last 8 tool calls + display_calls = tool_calls[-8:] + if len(tool_calls) > 8: + lines.append(f'
... ({len(tool_calls) - 8} earlier calls)
') + + tool_icons = { + "google_search": "๐Ÿ”", + "google_search_agent": "๐Ÿ”", + "fetch_url": "๐ŸŒ", + "web_fetch": "๐ŸŒ", + "read_pdf": "๐Ÿ“„", + "grep_file": "๐Ÿ“‘", + "read_file": "๐Ÿ“–", + } + + for tc in display_calls: + name = tc["name"] + if name == "google_search_agent": + name = "google_search" + icon = tool_icons.get(name, "๐Ÿ”ง") + status_icon = "โœ“" if tc.get("completed") else "โ†’" + status_color = "#28a745" if tc.get("completed") else "#ffc107" + + lines.append( + f'
' + f'{status_icon} ' + f"{icon} {name} " + f'{tc["args"]}' + f"
" + ) + + lines.append("
") + return "\n".join(lines) + + +def _format_display_html(plan: "ResearchPlan | None", tool_calls: list[dict], question: str) -> str: + """Create the full HTML display.""" + html = ['
'] + + # Question + html.append( + f'
' + f"Question: {question}
" + ) + + # Plan + if plan and plan.steps: + html.append(_format_plan_html(plan)) + + # Tools + html.append(_format_tools_html(tool_calls)) + + html.append("
") + return "\n".join(html) + + +def _parse_response_sections(text: str) -> tuple[str, list[str], str]: + """Extract answer, sources, and reasoning from structured agent response text. + + The agent formats its final response as:: + + ANSWER: + SOURCES: + REASONING: + + Parameters + ---------- + text : str + Raw response text from the agent. + + Returns + ------- + tuple[str, list[str], str] + ``(answer, sources, reasoning)`` where *sources* is a list of URLs. + If the text does not contain the expected sections, the full text is + returned as the answer with empty sources and reasoning. + """ + answer_match = re.search(r"ANSWER:\s*(.*?)(?=\n\s*SOURCES:|\n\s*REASONING:|$)", text, re.DOTALL | re.IGNORECASE) + sources_match = re.search(r"SOURCES:\s*(.*?)(?=\n\s*ANSWER:|\n\s*REASONING:|$)", text, re.DOTALL | re.IGNORECASE) + reasoning_match = re.search(r"REASONING:\s*(.*?)(?=\n\s*ANSWER:|\n\s*SOURCES:|$)", text, re.DOTALL | re.IGNORECASE) + + answer = answer_match.group(1).strip() if answer_match else text + sources_raw = sources_match.group(1).strip() if sources_match else "" + reasoning = reasoning_match.group(1).strip() if reasoning_match else "" + + # Sources may be newline- or comma-separated URLs + sources = [s.strip() for s in re.split(r"[\n,]+", sources_raw) if s.strip().startswith("http")] + + return answer, sources, reasoning + + +def display_response( + console: Console, + text: str, + title: str = "Answer", + subtitle: str | None = None, +) -> None: + """Display a structured agent response with separated, styled sections. + + Parses the ``ANSWER`` / ``SOURCES`` / ``REASONING`` structure from the + agent's final response text and renders each section with appropriate Rich + styling: the answer in a cyan panel, sources in a dimmed panel, and + reasoning in a muted panel. + + Parameters + ---------- + console : Console + Rich console to render to. + text : str + Raw response text from the agent. 
+ title : str, optional + Panel title for the answer section (default ``"Answer"``). + subtitle : str, optional + Panel subtitle, e.g. duration and tool-call count. + + Example + ------- + >>> duration = f"{response.total_duration_ms / 1000:.1f}s" + >>> display_response(console, response.text, subtitle=duration) + """ + answer, sources, reasoning = _parse_response_sections(text) + + console.print(Panel(Markdown(answer), title=title, border_style="cyan", subtitle=subtitle)) + + if sources: + src_lines = "\n".join(f" [blue]{src}[/blue]" for src in sources[:6]) + console.print(Panel(src_lines, title="Sources", border_style="dim", padding=(0, 1))) + + if reasoning: + console.print(Panel(Markdown(reasoning), title="[dim]Reasoning[/dim]", border_style="dim", padding=(0, 1))) + + +async def run_with_display( + agent: "KnowledgeGroundedAgent", + question: str, + refresh_rate: float = 0.5, +) -> "AgentResponse": + """Run the agent with live progress display in a Jupyter notebook. + + Shows the research plan checklist and tool calls while the agent works, + updating the display periodically. + + Parameters + ---------- + agent : KnowledgeGroundedAgent + The agent to run. + question : str + The question to answer. + refresh_rate : float + How often to update the display in seconds (default 0.5). + + Returns + ------- + AgentResponse + The agent's response. 
+ + Example + ------- + >>> agent = KnowledgeGroundedAgent(enable_planning=True) + >>> response = await run_with_display(agent, "What is quantum computing?") + >>> print(response.text) + """ + # Suppress verbose logging from external libraries (same as CLI) + verbose_loggers = ["google.adk", "google.genai", "httpx", "httpcore"] + original_levels = {} + for name in verbose_loggers: + _logger = logging.getLogger(name) + original_levels[name] = _logger.level + _logger.setLevel(logging.ERROR) + _logger.propagate = False + + # Set up tool call capture on the agent logger (same as CLI) + tool_capture = ToolCallCapture() + tool_capture.setLevel(logging.INFO) + agent_logger = logging.getLogger("aieng.agent_evals.knowledge_qa.agent") + original_agent_level = agent_logger.level + original_handlers = agent_logger.handlers.copy() + agent_logger.handlers.clear() + agent_logger.addHandler(tool_capture) + agent_logger.setLevel(logging.INFO) + agent_logger.propagate = False + + try: + # Create the plan first if planning is enabled + if agent.enable_planning and hasattr(agent, "create_plan_async"): + clear_output(wait=True) + display(HTML('
Creating research plan...
')) + await agent.create_plan_async(question) + + # Start the agent task + task = asyncio.create_task(agent.answer_async(question)) + + # Update display while agent works + while not task.done(): + clear_output(wait=True) + display( + HTML( + _format_display_html( + plan=agent.current_plan if hasattr(agent, "current_plan") else None, + tool_calls=tool_capture.tool_calls, + question=question, + ) + ) + ) + await asyncio.sleep(refresh_rate) + + # Get the result + response = await task + + # Final display with completion status + clear_output(wait=True) + display( + HTML( + _format_display_html( + plan=agent.current_plan if hasattr(agent, "current_plan") else None, + tool_calls=tool_capture.tool_calls, + question=question, + ) + + f'
' + f"โœ“ Complete in {response.total_duration_ms / 1000:.1f}s | " + f"{len(response.tool_calls)} tool calls | " + f"{len(response.sources)} sources
" + ) + ) + + return response + + finally: + # Clean up logging - restore original state + agent_logger.removeHandler(tool_capture) + agent_logger.handlers = original_handlers + agent_logger.setLevel(original_agent_level) + agent_logger.propagate = True + + # Restore verbose logger levels + for name, level in original_levels.items(): + logging.getLogger(name).setLevel(level) diff --git a/aieng-eval-agents/aieng/agent_evals/logging_config.py b/aieng-eval-agents/aieng/agent_evals/logging_config.py new file mode 100644 index 0000000..b9bb75a --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/logging_config.py @@ -0,0 +1,87 @@ +"""Logging configuration with colors and clean output. + +This module provides a clean, colored logging setup for agent evaluations +using the rich library. It reuses the console infrastructure from display.py +for consistent styling across the codebase. +""" + +import logging + +from rich.logging import RichHandler + +from .display import create_console + + +def setup_logging( + level: int = logging.INFO, + show_time: bool = True, + show_path: bool = False, +) -> None: + """Configure colored logging with rich. + + Uses the same console theme as display.py for consistent styling. + + Parameters + ---------- + level : int, optional + Logging level, by default logging.INFO. + show_time : bool, optional + Whether to show timestamps, by default True. + show_path : bool, optional + Whether to show file path in logs, by default False. 
+ """ + # Reuse display console with force_jupyter=False for CLI + console = create_console(force_jupyter=False) + + # Configure rich handler with clean formatting + rich_handler = RichHandler( + console=console, + show_time=show_time, + show_path=show_path, + markup=True, + rich_tracebacks=True, + tracebacks_show_locals=False, + omit_repeated_times=False, + ) + + # Simple format - rich handles styling + rich_handler.setFormatter(logging.Formatter("%(message)s", datefmt="[%X]")) + + # Configure root logger + logging.basicConfig( + level=level, + format="%(message)s", + datefmt="[%X]", + handlers=[rich_handler], + force=True, + ) + + # Silence noisy third-party libraries + _silence_third_party_loggers() + + +def _silence_third_party_loggers() -> None: + """Reduce noise from third-party libraries. + + Sets logging levels for common noisy libraries to WARNING or ERROR + to keep evaluation output clean and focused on agent behavior. + """ + # Google SDK libraries - only warnings and above + for logger_name in [ + "google_adk", + "google_genai", + "google.adk", + "google.genai", + ]: + logging.getLogger(logger_name).setLevel(logging.WARNING) + + # Tracing/observability - only warnings + logging.getLogger("langfuse").setLevel(logging.WARNING) + + # HTTP/network libraries - errors only + for logger_name in ["httpx", "httpcore", "urllib3"]: + logging.getLogger(logger_name).setLevel(logging.ERROR) + + # System libraries + logging.getLogger("asyncio").setLevel(logging.WARNING) + logging.getLogger("py.warnings").setLevel(logging.ERROR) diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py index ba33840..2c0c1cc 100644 --- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py @@ -351,9 +351,9 @@ class TestKnowledgeGroundedAgent: def mock_config(self): """Create a mock config for 
testing.""" config = MagicMock() - config.gemini_api_key = "test-api-key" config.default_worker_model = "gemini-2.5-flash" config.default_temperature = 0.0 + config.openai_api_key.get_secret_value.return_value = "test-api-key" return config @patch("aieng.agent_evals.knowledge_qa.agent.PlanReActPlanner") @@ -489,6 +489,7 @@ def test_lazy_initialization(self, *_mocks): mock_config = MagicMock() mock_config.default_worker_model = "gemini-2.5-flash" mock_config.default_temperature = 0.0 + mock_config.openai_api_key.get_secret_value.return_value = "test-api-key" mock_config_class.return_value = mock_config manager = KnowledgeAgentManager(enable_caching=False, enable_compaction=False) @@ -517,6 +518,7 @@ def test_close(self, *_mocks): mock_config = MagicMock() mock_config.default_worker_model = "gemini-2.5-flash" mock_config.default_temperature = 0.0 + mock_config.openai_api_key.get_secret_value.return_value = "test-api-key" mock_config_class.return_value = mock_config manager = KnowledgeAgentManager(enable_caching=False, enable_compaction=False) diff --git a/implementations/knowledge_qa/01_dataset_and_tools.ipynb b/implementations/knowledge_qa/01_dataset_and_tools.ipynb new file mode 100644 index 0000000..d16941c --- /dev/null +++ b/implementations/knowledge_qa/01_dataset_and_tools.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# 01: The DeepSearchQA Dataset & Agent Tools\n", + "\n", + "This notebook introduces the two foundational components of the Knowledge QA system:\n", + "\n", + "- **DeepSearchQA** โ€” the benchmark dataset used to evaluate the agent\n", + "- **Agent tools** โ€” the five capabilities the agent uses to research and verify answers\n", + "\n", + "## What You'll Learn\n", + "\n", + "1. What the DeepSearchQA dataset contains and how to explore it\n", + "2. 
The five tools the agent has access to, and how it's instructed to use them\n", + "\n", + "## Prerequisites\n", + "\n", + "- `GOOGLE_API_KEY` set in your `.env` file\n", + "- Dependencies installed with `uv sync`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "from aieng.agent_evals.knowledge_qa import DeepSearchQADataset\n", + "from aieng.agent_evals.knowledge_qa.system_instructions import build_system_instructions\n", + "from dotenv import load_dotenv\n", + "from rich.console import Console\n", + "from rich.markdown import Markdown\n", + "from rich.panel import Panel\n", + "from rich.table import Table\n", + "\n", + "\n", + "# Set working directory to the repository root\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Working directory: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"Working directory set to: {Path('').absolute()}\")\n", + "\n", + "load_dotenv(verbose=True)\n", + "console = Console(width=100)" + ] + }, + { + "cell_type": "markdown", + "id": "s1-intro", + "metadata": {}, + "source": [ + "## 1. The DeepSearchQA Dataset\n", + "\n", + "[DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) is a benchmark from Google DeepMind\n", + "for evaluating deep research agents. 
It contains 896 research questions requiring multi-step web search\n", + "and reasoning to answer correctly.\n", + "\n", + "Each question is a **causal chain task**: the agent must follow a chain of searches, fetch real sources,\n", + "and verify facts before answering โ€” not recall from training data.\n", + "\n", + "### Answer Types\n", + "\n", + "| Type | Description | Example |\n", + "|------|-------------|---------|\n", + "| **Single Answer** | One specific value | A date, a number, a proper name |\n", + "| **Set Answer** | Multiple required items | A list of countries, a set of policy changes |\n", + "\n", + "Evaluation uses an LLM-as-judge that computes **precision, recall, and F1** by comparing the agent's\n", + "answer to the ground truth item-by-item." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "load-dataset", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = DeepSearchQADataset()\n", + "\n", + "console.print(f\"Total examples: [cyan]{len(dataset)}[/cyan]\")\n", + "console.print(f\"Categories: [cyan]{len(dataset.get_categories())}[/cyan]\")" + ] + }, + { + "cell_type": "markdown", + "id": "s1-structure", + "metadata": {}, + "source": [ + "### 1.1 Dataset Structure\n", + "\n", + "Each example is a `DSQAExample` with five fields. Let's look at one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-structure", + "metadata": {}, + "outputs": [], + "source": [ + "example = dataset[0]\n", + "\n", + "console.print(\n", + " Panel(\n", + " f\"[bold]example_id:[/bold] {example.example_id}\\n\"\n", + " f\"[bold]problem_category:[/bold] {example.problem_category}\\n\"\n", + " f\"[bold]answer_type:[/bold] {example.answer_type}\\n\\n\"\n", + " f\"[bold cyan]problem:[/bold cyan]\\n{example.problem}\\n\\n\"\n", + " f\"[bold yellow]answer:[/bold yellow]\\n{example.answer}\",\n", + " title=\"DSQAExample\",\n", + " border_style=\"blue\",\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "s1-categories", + "metadata": {}, + "source": [ + "### 1.2 Categories\n", + "\n", + "The dataset spans 17 domains. Let's see how examples are distributed across them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "categories", + "metadata": {}, + "outputs": [], + "source": [ + "categories = dataset.get_categories()\n", + "\n", + "cat_table = Table(title=\"Dataset by Category\")\n", + "cat_table.add_column(\"Category\", style=\"cyan\")\n", + "cat_table.add_column(\"Total\", style=\"white\", justify=\"right\")\n", + "cat_table.add_column(\"Single Answer\", style=\"dim\", justify=\"right\")\n", + "cat_table.add_column(\"Set Answer\", style=\"dim\", justify=\"right\")\n", + "\n", + "for cat in sorted(categories):\n", + " examples = dataset.get_by_category(cat)\n", + " single = sum(1 for e in examples if e.answer_type == \"Single Answer\")\n", + " set_ans = len(examples) - single\n", + " cat_table.add_row(cat, str(len(examples)), str(single), str(set_ans))\n", + "\n", + "console.print(cat_table)" + ] + }, + { + "cell_type": "markdown", + "id": "s1-answer-types", + "metadata": {}, + "source": [ + "### 1.3 Answer Types in Practice\n", + "\n", + "The answer type matters for evaluation โ€” the grader treats \"Single Answer\" and \"Set Answer\"\n", + "differently when computing 
correctness." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "answer-types", + "metadata": {}, + "outputs": [], + "source": [ + "single_ex = next(e for e in dataset.examples if e.answer_type == \"Single Answer\")\n", + "set_ex = next(e for e in dataset.examples if e.answer_type == \"Set Answer\")\n", + "\n", + "for label, ex, style in [\n", + " (\"Single Answer\", single_ex, \"green\"),\n", + " (\"Set Answer\", set_ex, \"yellow\"),\n", + "]:\n", + " console.print(\n", + " Panel(\n", + " f\"[bold cyan]Question:[/bold cyan]\\n{ex.problem}\\n\\n[bold yellow]Answer:[/bold yellow]\\n{ex.answer}\",\n", + " title=f\"{label} โ€” {ex.problem_category}\",\n", + " border_style=style,\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "s1-browse", + "metadata": {}, + "source": [ + "### 1.4 Browsing Examples\n", + "\n", + "You can retrieve examples by category or by ID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "browse", + "metadata": {}, + "outputs": [], + "source": [ + "# Examples by category\n", + "finance_examples = dataset.get_by_category(\"Finance & Economics\")\n", + "console.print(f\"Finance & Economics: [cyan]{len(finance_examples)}[/cyan] examples\\n\")\n", + "\n", + "# Display a preview table\n", + "browse_table = Table(title=\"Finance & Economics โ€” First 5 Examples\")\n", + "browse_table.add_column(\"ID\", style=\"dim\", width=6)\n", + "browse_table.add_column(\"Answer Type\", style=\"cyan\", width=15)\n", + "browse_table.add_column(\"Question\", style=\"white\")\n", + "\n", + "for ex in finance_examples[:5]:\n", + " q = ex.problem[:75] + \"...\" if len(ex.problem) > 75 else ex.problem\n", + " browse_table.add_row(str(ex.example_id), ex.answer_type, q)\n", + "\n", + "console.print(browse_table)" + ] + }, + { + "cell_type": "markdown", + "id": "s2-intro", + "metadata": {}, + "source": [ + "## 2. 
The Agent's Tools\n", + "\n", + "The `KnowledgeGroundedAgent` has five tools that form a natural research workflow:\n", + "\n", + "| Tool | Purpose | When the Agent Uses It |\n", + "|------|---------|----------------------|\n", + "| `google_search` | Find relevant URLs | First step for any sub-question |\n", + "| `web_fetch` | Read web pages and PDFs | To verify facts from the actual source |\n", + "| `fetch_file` | Download CSV, XLSX, JSON files | When the answer is in structured data |\n", + "| `grep_file` | Search within a downloaded file | To locate a specific value in a large file |\n", + "| `read_file` | Read sections of a downloaded file | To inspect a specific part of a downloaded file |\n", + "\n", + "**Why not answer from search snippets?** Snippets are brief and may be outdated or misleading.\n", + "The system instructions enforce a strict causal chain: **Search โ†’ Fetch โ†’ Verify โ†’ Answer**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "system-instructions", + "metadata": {}, + "outputs": [], + "source": [ + "instructions = build_system_instructions()\n", + "\n", + "console.print(\n", + " Panel(\n", + " Markdown(instructions),\n", + " title=\"Agent System Instructions\",\n", + " border_style=\"blue\",\n", + " padding=(1, 2),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "In this notebook you saw:\n", + "\n", + "1. **The DeepSearchQA dataset** โ€” 896 research questions across 17 categories, evaluated with\n", + " precision/recall/F1 using an LLM-as-judge\n", + "2. **The five agent tools** โ€” search, web fetch, file download, grep, and file read\n", + "3. **The system instructions** โ€” how the agent is guided to use its tools, including the\n", + " critical search โ†’ fetch โ†’ verify โ†’ answer chain\n", + "\n", + "**Next:** In Notebook 02, we'll create the agent, run it on questions, and observe how it uses these tools." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "done", + "metadata": {}, + "outputs": [], + "source": [ + "console.print(\n Panel(\n \"[green]โœ“[/green] Notebook complete!\\n\\n\"\n \"[cyan]Next:[/cyan] Open [bold]02_running_the_agent.ipynb[/bold] to run the agent.\",\n title=\"Done\",\n border_style=\"green\",\n )\n)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/knowledge_qa/01_grounding_basics.ipynb b/implementations/knowledge_qa/01_grounding_basics.ipynb deleted file mode 100644 index 0401bb2..0000000 --- a/implementations/knowledge_qa/01_grounding_basics.ipynb +++ /dev/null @@ -1,200 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 01: Google Search Grounding with ADK\n", - "\n", - "This notebook introduces Google Search grounding using the Agent Development Kit (ADK),\n", - "which provides explicit, traceable tool calls for web search.\n", - "\n", - "## Learning Objectives\n", - "\n", - "- Understand how Google Search grounding works with ADK\n", - "- Use the `KnowledgeGroundedAgent` to make grounded queries\n", - "- See explicit tool calls in the agent's reasoning\n", - "- Compare grounded vs non-grounded responses" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_comparison,\n display_response,\n display_source_table,\n)\nfrom aieng.agent_evals.knowledge_qa import KnowledgeAgentConfig, 
KnowledgeGroundedAgent\nfrom dotenv import load_dotenv\nfrom google import genai\nfrom rich.panel import Panel\n\n\nconsole = create_console()\nload_dotenv(verbose=True)" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Understanding Google Search Grounding with ADK\n", - "\n", - "The Agent Development Kit (ADK) provides a `GoogleSearchTool` that enables:\n", - "\n", - "1. **Explicit Tool Calls**: The agent decides when to search and you can see each call\n", - "2. **ReAct Pattern**: Thought โ†’ Action โ†’ Observation loop is visible\n", - "3. **Traceable**: Every search query and result is logged\n", - "4. **Real-time Information**: Access to current web data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Initialize the knowledge agent (uses ADK with GoogleSearchTool internally)\nagent = KnowledgeGroundedAgent()\n\nconsole.print(\n Panel(\n f\"[green]โœ“[/green] Knowledge Agent initialized\\n[cyan]Model:[/cyan] {agent.model}\",\n title=\"๐Ÿ”ง Setup Complete\",\n border_style=\"green\",\n )\n)" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Making Your First Grounded Query" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Ask a question that requires current information\n", - "query = \"What is the current population of Tokyo?\"\n", - "\n", - "console.print(f\"\\n[cyan]๐Ÿ“ Query:[/cyan] {query}\\n\")\n", - "\n", - "console.print(\"[dim]Searching...[/dim]\")\n", - "response = await agent.answer_async(query)\n", - "\n", - "display_response(response, console=console, title=\"Tokyo Population\")\n", - "\n", - "# Show the tool calls made by the agent\n", - "if response.tool_calls:\n", - " console.print(\"\\n[bold cyan]๐Ÿ”ง Tool Calls Made:[/bold cyan]\")\n", - " for tc in response.tool_calls:\n", - " console.print(f\" โ€ข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Display sources in a detailed table format\n", - "display_source_table(response, console=console)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Comparing Grounded vs Non-Grounded Responses\n", - "\n", - "This is where grounding truly shines. We'll ask about Toronto's record single-day snowfall.\n", - "\n", - "**Why this example works:**\n", - "- The record was set on **January 25, 2026** - after the model's training data cutoff\n", - "- Without grounding, the model can only guess based on historical data it was trained on\n", - "- With grounding, the model searches the web and finds the recent news about this event\n", - "\n", - "This clearly demonstrates that grounding enables access to information the model couldn't possibly know from training alone." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "config = KnowledgeAgentConfig()\nclient = genai.Client(api_key=config.gemini_api_key)\n\n# This question requires very recent information (Jan 2026)\n# The non-grounded model will fail since its training data doesn't include this event\nquestion = \"Which day had the highest recorded snowfall in a single day in Toronto?\"\nexpected_answer = \"January 25, 2026\"\n\nconsole.print(f\"\\n[bold]Question:[/bold] {question}\")\nconsole.print(f\"[dim]Expected Answer: {expected_answer}[/dim]\\n\")\n\n# Without grounding - model relies on training data (cutoff before Jan 2026)\nconsole.print(\"[dim]Generating without grounding...[/dim]\")\nresponse_no_grounding = client.models.generate_content(\n model=config.default_worker_model,\n contents=question,\n)\n\n# With grounding - agent uses Google Search tool\nconsole.print(\"[dim]Generating with grounding (ADK agent)...[/dim]\")\nresponse_grounded = await agent.answer_async(question)\n\n# Side-by-side comparison using our display utility\ndisplay_comparison(response_no_grounding.text, response_grounded, console=console)\n\n# Show tool calls from the grounded response\nif response_grounded.tool_calls:\n console.print(\"\\n[bold cyan]๐Ÿ”ง Tool Calls (Grounded):[/bold cyan]\")\n for tc in response_grounded.tool_calls:\n console.print(f\" โ€ข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")\n\n# Check if the grounded response contains the correct answer\nif expected_answer.lower() in response_grounded.text.lower() or \"january 25\" in response_grounded.text.lower():\n console.print(\"\\n[green]โœ“ Grounded response contains the correct answer![/green]\")\nelse:\n console.print(\"\\n[yellow]โš  Check the grounded response for accuracy[/yellow]\")" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Exercise: Try Your Own Queries\n", - "\n", - "Try asking questions that:\n", - "- Require recent information (news, events, statistics)\n", - "- Need multiple facts combined\n", - "- Are about specific domains (sports, science, politics)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Try your own query\n", - "my_query = \"What are the latest developments in fusion energy?\"\n", - "\n", - "console.print(f\"[bold cyan]๐Ÿ” Query:[/bold cyan] {my_query}\\n\")\n", - "\n", - "console.print(\"[dim]Searching the web...[/dim]\")\n", - "my_response = await agent.answer_async(my_query)\n", - "\n", - "display_response(my_response, console=console, title=\"Fusion Energy Developments\")\n", - "\n", - "# Show the tool calls\n", - "if my_response.tool_calls:\n", - " console.print(\"\\n[bold cyan]๐Ÿ”ง Tool Calls:[/bold cyan]\")\n", - " for tc in my_response.tool_calls:\n", - " console.print(f\" โ€ข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "In this notebook, you learned:\n", - "\n", - "1. How Google Search grounding works with ADK's `GoogleSearchTool`\n", - "2. How to use the `KnowledgeGroundedAgent` for grounded queries\n", - "3. How to see explicit tool calls in the agent's response\n", - "4. The difference between grounded and non-grounded responses\n", - "\n", - "**Next**: In the next notebook, we'll explore the agent's system instructions and the evaluation dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "console.print(\n", - " Panel(\n", - " \"[green]โœ“[/green] Notebook complete!\\n\\n\"\n", - " \"[cyan]Next:[/cyan] Open [bold]02_agent_basics.ipynb[/bold] to learn about the Knowledge Agent.\",\n", - " title=\"๐ŸŽ‰ Done\",\n", - " border_style=\"green\",\n", - " )\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/implementations/knowledge_qa/02_agent_basics.ipynb b/implementations/knowledge_qa/02_agent_basics.ipynb deleted file mode 100644 index 7f9d088..0000000 --- a/implementations/knowledge_qa/02_agent_basics.ipynb +++ /dev/null @@ -1,323 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 02: Knowledge-Grounded Agent Basics\n", - "\n", - "This notebook introduces the `KnowledgeGroundedAgent` class, which wraps Gemini\n", - "with Google Search grounding into a full-featured QA agent.\n", - "\n", - "## Learning Objectives\n", - "\n", - "- Create and configure a `KnowledgeGroundedAgent`\n", - "- Understand the agent's system instructions\n", - "- Use the agent for single-turn Q&A\n", - "- Explore the DeepSearchQA evaluation dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_example,\n display_info,\n display_response,\n display_success,\n)\nfrom aieng.agent_evals.knowledge_qa import (\n DeepSearchQADataset,\n 
KnowledgeAgentManager,\n KnowledgeGroundedAgent,\n)\nfrom aieng.agent_evals.knowledge_qa.agent import SYSTEM_INSTRUCTIONS\nfrom dotenv import load_dotenv\nfrom rich.markdown import Markdown\nfrom rich.panel import Panel\nfrom rich.table import Table\n\n\nconsole = create_console()\nload_dotenv(verbose=True)" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Creating a Knowledge Agent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Create the agent\nagent = KnowledgeGroundedAgent()\n\n# Display configuration\nconfig_table = Table(title=\"๐Ÿค– Agent Configuration\", show_header=True, header_style=\"bold cyan\")\nconfig_table.add_column(\"Setting\", style=\"cyan\")\nconfig_table.add_column(\"Value\", style=\"white\")\nconfig_table.add_row(\"Model\", agent.model)\nconfig_table.add_row(\"Planner Model\", agent.config.default_planner_model)\nconfig_table.add_row(\"Worker Model\", agent.config.default_worker_model)\n\nconsole.print(config_table)" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Understanding System Instructions\n", - "\n", - "The agent uses carefully crafted system instructions to guide its behavior." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "console.print(\n Panel(\n Markdown(SYSTEM_INSTRUCTIONS),\n title=\"๐Ÿ“œ System Instructions\",\n border_style=\"blue\",\n subtitle=\"[dim]Guides agent behavior[/dim]\",\n )\n)" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Single-Turn Q&A\n", - "\n", - "Let's use the agent to answer some questions." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Example 1: Current events\n", - "question = \"What are the most significant AI developments in January 2026?\"\n", - "\n", - "console.print(\n", - " Panel(\n", - " f\"[bold green]{question}[/bold green]\",\n", - " title=\"โ“ Question\",\n", - " border_style=\"green\",\n", - " )\n", - ")\n", - "\n", - "console.print(\"[dim]๐Ÿ” Agent is researching...[/dim]\")\n", - "response = await agent.answer_async(question)\n", - "\n", - "display_response(response, console=console, title=\"AI Developments January 2026\")\n", - "\n", - "# Show tool calls made during reasoning\n", - "if response.tool_calls:\n", - " console.print(\"\\n[bold cyan]๐Ÿ”ง Tool Calls (ReAct Trace):[/bold cyan]\")\n", - " for tc in response.tool_calls:\n", - " console.print(f\" โ€ข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Example 2: Factual question\n", - "question = \"What countries have successfully landed spacecraft on the Moon?\"\n", - "\n", - "console.print(\n", - " Panel(\n", - " f\"[bold green]{question}[/bold green]\",\n", - " title=\"โ“ Question\",\n", - " border_style=\"green\",\n", - " )\n", - ")\n", - "\n", - "console.print(\"[dim]๐Ÿ” Agent is researching...[/dim]\")\n", - "response = await agent.answer_async(question)\n", - "\n", - "display_response(response, console=console, title=\"Moon Landing Countries\")\n", - "\n", - "# Show tool calls\n", - "if response.tool_calls:\n", - " console.print(\"\\n[bold cyan]๐Ÿ”ง Tool Calls:[/bold cyan]\")\n", - " for tc in response.tool_calls:\n", - " console.print(f\" โ€ข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": "## 4. 
Using the KnowledgeAgentManager\n\nFor applications that need to manage agent lifecycle, use `KnowledgeAgentManager`." - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Create a manager (lazy initialization)\nmanager = KnowledgeAgentManager()\n\ndisplay_info(f\"Initialized: {manager.is_initialized()}\", console=console)\n\n# Access the agent (triggers initialization)\nmanaged_agent = manager.agent\ndisplay_info(f\"After access: {manager.is_initialized()}\", console=console)\n\n# Use the agent\nconsole.print(\"[dim]Querying...[/dim]\")\nresponse = await managed_agent.answer_async(\"What is the speed of light?\")\n\ndisplay_response(response, console=console, title=\"Quick Answer\", show_queries=False)\n\n# Cleanup\nmanager.close()\ndisplay_success(\"Manager closed\", console=console)" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Exploring the DeepSearchQA Dataset\n", - "\n", - "The DeepSearchQA dataset contains 896 research questions for evaluating knowledge agents." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Load the dataset\nwith console.status(\"[cyan]Loading DeepSearchQA dataset...[/cyan]\", spinner=\"dots\"):\n dataset = DeepSearchQADataset()\n\n# Display dataset info\ninfo_table = Table(title=\"๐Ÿ“Š DeepSearchQA Dataset\", show_header=True, header_style=\"bold cyan\")\ninfo_table.add_column(\"Metric\", style=\"cyan\")\ninfo_table.add_column(\"Value\", style=\"white\")\ninfo_table.add_row(\"Total Examples\", str(len(dataset)))\ninfo_table.add_row(\"Categories\", str(len(dataset.get_categories())))\n\nconsole.print(info_table)" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Display categories\n", - "categories = dataset.get_categories()\n", - "\n", - "cat_table = Table(title=\"๐Ÿ“ Problem Categories\", show_header=True, header_style=\"bold green\")\n", - "cat_table.add_column(\"Category\", style=\"white\")\n", - "cat_table.add_column(\"Count\", style=\"cyan\", justify=\"right\")\n", - "\n", - "for cat in sorted(categories):\n", - " count = len(dataset.get_by_category(cat))\n", - " cat_table.add_row(cat, str(count))\n", - "\n", - "console.print(cat_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Show a sample example using the shared display utility\n", - "example = dataset[0]\n", - "\n", - "display_example(\n", - " example_id=example.example_id,\n", - " problem=example.problem,\n", - " category=example.problem_category,\n", - " answer=example.answer,\n", - " answer_type=example.answer_type,\n", - " console=console,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get random samples\n", - "samples = dataset.sample(n=3, random_state=42)\n", - "\n", - "console.print(\"[bold]๐Ÿ“š Random Samples from Dataset[/bold]\\n\")\n", - "\n", - "for ex 
in samples:\n", - " display_example(\n", - " example_id=ex.example_id,\n", - " problem=ex.problem[:300] + \"...\" if len(ex.problem) > 300 else ex.problem,\n", - " category=ex.problem_category,\n", - " answer=ex.answer,\n", - " console=console,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Testing the Agent on DeepSearchQA\n", - "\n", - "Let's test the agent on a sample question from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Pick an example\n", - "test_example = samples[0]\n", - "\n", - "console.print(\n", - " Panel(\n", - " f\"[bold]Testing on Example {test_example.example_id}[/bold]\\n\\n\"\n", - " f\"[cyan]Category:[/cyan] {test_example.problem_category}\\n\"\n", - " f\"[cyan]Expected Answer:[/cyan] {test_example.answer}\",\n", - " title=\"๐Ÿงช Test Setup\",\n", - " border_style=\"yellow\",\n", - " )\n", - ")\n", - "\n", - "# Ask the agent\n", - "console.print(\n", - " Panel(\n", - " f\"[bold green]{test_example.problem}[/bold green]\",\n", - " title=\"โ“ Question\",\n", - " border_style=\"green\",\n", - " )\n", - ")\n", - "\n", - "console.print(\"[dim]๐Ÿ” Agent is researching...[/dim]\")\n", - "response = await agent.answer_async(test_example.problem)\n", - "\n", - "display_response(response, console=console, title=\"Agent Response\")\n", - "\n", - "# Show tool calls\n", - "if response.tool_calls:\n", - " console.print(\"\\n[bold cyan]๐Ÿ”ง Tool Calls:[/bold cyan]\")\n", - " for tc in response.tool_calls:\n", - " console.print(f\" โ€ข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")\n", - "\n", - "# Compare\n", - "contains_answer = test_example.answer.lower() in response.text.lower()\n", - "if contains_answer:\n", - " console.print(\"\\n[green]โœ“ CONTAINS EXPECTED ANSWER[/green]\")\n", - "else:\n", - " console.print(\"\\n[yellow]โš  Answer may differ[/yellow]\")\n", - "console.print(f\"[dim]Expected: 
{test_example.answer}[/dim]\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "In this notebook, you learned:\n", - "\n", - "1. How to create and configure a `KnowledgeGroundedAgent`\n", - "2. The system instructions that guide agent behavior\n", - "3. How to use the agent for single-turn Q&A\n", - "4. How to explore the DeepSearchQA evaluation dataset\n", - "\n", - "**Next**: In the next notebook, we'll explore multi-turn conversations and run systematic evaluations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "console.print(\n", - " Panel(\n", - " \"[green]โœ“[/green] Notebook complete!\\n\\n\"\n", - " \"[cyan]Next:[/cyan] Open [bold]03_multi_turn.ipynb[/bold] to learn about multi-turn conversations and evaluation.\",\n", - " title=\"๐ŸŽ‰ Done\",\n", - " border_style=\"green\",\n", - " )\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/implementations/knowledge_qa/02_running_the_agent.ipynb b/implementations/knowledge_qa/02_running_the_agent.ipynb new file mode 100644 index 0000000..834080a --- /dev/null +++ b/implementations/knowledge_qa/02_running_the_agent.ipynb @@ -0,0 +1,426 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# 02: Running the Agent\n", + "\n", + "In Notebook 01 we explored the dataset and tools. This notebook shows how to run the\n", + "`KnowledgeGroundedAgent` in practice.\n", + "\n", + "## What You'll Learn\n", + "\n", + "1. 
The agent's PlanReAct architecture and the `AgentResponse` data structure\n", + "2. Running a question with live progress display\n", + "3. Inspecting the response: plan, tool calls, sources, and reasoning\n", + "4. Multi-turn conversations using session state\n", + "5. Observability with Langfuse tracing\n", + "\n", + "## Prerequisites\n", + "\n", + "Complete Notebook 01. You'll need `GOOGLE_API_KEY` in your `.env` file.\n", + "For tracing (Section 4): `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are also required." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "import os\nimport uuid\nfrom pathlib import Path\n\nfrom aieng.agent_evals.knowledge_qa import KnowledgeGroundedAgent\nfrom aieng.agent_evals.knowledge_qa.notebook import display_response, run_with_display\nfrom aieng.agent_evals.langfuse import init_tracing\nfrom dotenv import load_dotenv\nfrom rich.console import Console\nfrom rich.panel import Panel\nfrom rich.table import Table\n\n\nif Path(\"\").absolute().name == \"eval-agents\":\n print(f\"Working directory: {Path('').absolute()}\")\nelse:\n os.chdir(Path(\"\").absolute().parent.parent)\n print(f\"Working directory set to: {Path('').absolute()}\")\n\nload_dotenv(verbose=True)\nconsole = Console(width=100)" + ] + }, + { + "cell_type": "markdown", + "id": "s1-arch", + "metadata": {}, + "source": [ + "## 1. Agent Architecture\n", + "\n", + "The `KnowledgeGroundedAgent` is built on Google ADK and combines two patterns:\n", + "\n", + "**PlanReAct** โ€” Before executing, the agent produces an explicit research plan with numbered\n", + "steps. Each step has a type (`SEARCH`, `FETCH`, `ANALYZE`) and a status that transitions from\n", + "`pending` โ†’ `in_progress` โ†’ `completed` (or `failed`/`skipped`). 
The plan can be revised\n", + "mid-run if the agent encounters unexpected results.\n", + "\n", + "**ReAct loop** โ€” Within each step, the agent alternates between reasoning (Thought), acting\n", + "(tool call), and observing (tool response).\n", + "\n", + "### The `AgentResponse` Object\n", + "\n", + "After running, `agent.answer_async(question)` returns an `AgentResponse`:\n", + "\n", + "| Field | Type | Description |\n", + "|-------|------|-------------|\n", + "| `text` | `str` | The final answer |\n", + "| `plan` | `ResearchPlan` | Numbered steps with statuses |\n", + "| `tool_calls` | `list[dict]` | Every tool invocation during execution |\n", + "| `sources` | `list[GroundingChunk]` | URLs used as evidence |\n", + "| `reasoning_chain` | `list[str]` | The model's intermediate reasoning |\n", + "| `total_duration_ms` | `int` | Wall-clock execution time |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "create-agent", + "metadata": {}, + "outputs": [], + "source": [ + "agent = KnowledgeGroundedAgent(enable_planning=True)\n", + "\n", + "config_table = Table(title=\"Agent Configuration\", show_header=False)\n", + "config_table.add_column(\"Setting\", style=\"cyan\")\n", + "config_table.add_column(\"Value\", style=\"white\")\n", + "config_table.add_row(\"Model\", agent.model)\n", + "config_table.add_row(\"Planning\", \"PlanReAct (enabled)\")\n", + "config_table.add_row(\"Session Service\", \"InMemorySessionService\")\n", + "config_table.add_row(\"Tools\", \"google_search, web_fetch, fetch_file, grep_file, read_file\")\n", + "\n", + "console.print(config_table)" + ] + }, + { + "cell_type": "markdown", + "id": "s2-running", + "metadata": {}, + "source": [ + "## 2. 
Running a Question\n", + "\n", + "`run_with_display` executes the agent in a Jupyter notebook with a live progress display showing:\n", + "\n", + "- The research plan with step statuses (updating in real time)\n", + "- Tool calls as they fire\n", + "\n", + "We'll use a question that requires web search โ€” the agent must find and verify a specific fact,\n", + "not recall it from training data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "run-agent", + "metadata": {}, + "outputs": [], + "source": [ + "question = \"When was the highest single-day snowfall recorded in Toronto, and how much snow fell?\"\n", + "\n", + "response = await run_with_display(agent, question)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-answer", + "metadata": {}, + "outputs": [], + "source": [ + "display_response(\n", + " console,\n", + " response.text,\n", + " subtitle=(\n", + " f\"Duration: {response.total_duration_ms / 1000:.1f}s | \"\n", + " f\"Tool calls: {len(response.tool_calls)} | \"\n", + " f\"Sources: {len(response.sources)}\"\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "s2-inspect", + "metadata": {}, + "source": [ + "### 2.1 Inspecting the Response\n", + "\n", + "The `AgentResponse` object contains the full execution trace." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-plan", + "metadata": {}, + "outputs": [], + "source": [ + "plan = response.plan\n\nplan_table = Table(title=\"Research Plan\")\nplan_table.add_column(\"#\", style=\"cyan\", width=3)\nplan_table.add_column(\"Step\", style=\"white\")\nplan_table.add_column(\"Type\", style=\"dim\", width=12)\nplan_table.add_column(\"Status\", style=\"green\")\n\nfor step in plan.steps:\n icon = {\"completed\": \"โœ“\", \"failed\": \"โœ—\", \"skipped\": \"โ—‹\"}.get(step.status.value, \"ยท\")\n desc = step.description[:70] + \"...\" if len(step.description) > 70 else step.description\n plan_table.add_row(str(step.step_id), desc, step.step_type, f\"{icon} {step.status.value}\")\n\nconsole.print(plan_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-tools", + "metadata": {}, + "outputs": [], + "source": [ + "if response.tool_calls:\n", + " tools_table = Table(title=\"Tool Calls\")\n", + " tools_table.add_column(\"#\", style=\"dim\", width=3)\n", + " tools_table.add_column(\"Tool\", style=\"cyan\", width=16)\n", + " tools_table.add_column(\"Arguments (truncated)\", style=\"white\")\n", + "\n", + " for i, tc in enumerate(response.tool_calls[:15], 1):\n", + " name = tc.get(\"name\", \"unknown\")\n", + " args = str(tc.get(\"args\", {}))\n", + " args = args[:70] + \"...\" if len(args) > 70 else args\n", + " tools_table.add_row(str(i), name, args)\n", + "\n", + " if len(response.tool_calls) > 15:\n", + " tools_table.add_row(\"...\", f\"({len(response.tool_calls) - 15} more)\", \"\")\n", + "\n", + " console.print(tools_table)\n", + "else:\n", + " console.print(\"[dim]No tool calls recorded[/dim]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-sources", + "metadata": {}, + "outputs": [], + "source": [ + "if response.sources:\n", + " seen: set[str] = set()\n", + " sources_table = Table(title=\"Sources\")\n", + " sources_table.add_column(\"#\", 
style=\"dim\", width=3)\n", + " sources_table.add_column(\"URL\", style=\"blue\")\n", + "\n", + " for src in response.sources:\n", + " if src.uri and src.uri not in seen:\n", + " seen.add(src.uri)\n", + " url = src.uri[:85] + \"...\" if len(src.uri) > 85 else src.uri\n", + " sources_table.add_row(str(len(seen)), url)\n", + " if len(seen) >= 10:\n", + " break\n", + "\n", + " console.print(sources_table)\n", + "else:\n", + " console.print(\"[dim]No sources recorded[/dim]\")" + ] + }, + { + "cell_type": "markdown", + "id": "s3-multiturn", + "metadata": {}, + "source": [ + "## 3. Multi-Turn Conversations\n", + "\n", + "The agent uses an `InMemorySessionService` to maintain conversation context across turns.\n", + "Pass the same `session_id` to link questions together โ€” the agent will use prior context\n", + "when answering follow-up questions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "multiturn", + "metadata": {}, + "outputs": [], + "source": [ + "session_id = str(uuid.uuid4())\n", + "console.print(f\"Session ID: [dim]{session_id}[/dim]\\n\")\n", + "\n", + "# First turn: establish a subject\n", + "response1 = await agent.answer_async(\n", + " \"What is the capital of France?\",\n", + " session_id=session_id,\n", + ")\n", + "display_response(console, response1.text, title=\"Turn 1\")\n", + "\n", + "# Second turn: follow-up that references the prior context\n", + "response2 = await agent.answer_async(\n", + " \"What is the official language spoken there?\",\n", + " session_id=session_id,\n", + ")\n", + "display_response(console, response2.text, title=\"Turn 2 (follow-up)\")" + ] + }, + { + "cell_type": "markdown", + "id": "s4-tracing", + "metadata": {}, + "source": [ + "## 4. 
Observability with Langfuse\n", + "\n", + "Langfuse captures a full trace of every agent run using OpenTelemetry, giving you visibility into:\n", + "\n", + "- Every tool call and its arguments\n", + "- Every LLM call with prompts and completions\n", + "- Timing for each span\n", + "- The full agent execution tree\n", + "\n", + "This is essential for debugging failures, measuring latency, and comparing configurations.\n", + "\n", + "### Trace Structure\n", + "\n", + "```\n", + "Trace: agent run\n", + "โ”œโ”€โ”€ Span: planning (PlanReAct)\n", + "โ”‚ โ””โ”€โ”€ LLM Call: create_plan\n", + "โ”œโ”€โ”€ Span: step-1-execution\n", + "โ”‚ โ”œโ”€โ”€ Tool Call: google_search\n", + "โ”‚ โ”œโ”€โ”€ Tool Call: web_fetch\n", + "โ”‚ โ””โ”€โ”€ LLM Call: step_summary\n", + "โ”œโ”€โ”€ Span: step-2-execution\n", + "โ”‚ โ””โ”€โ”€ ...\n", + "โ””โ”€โ”€ Span: synthesis\n", + " โ””โ”€โ”€ LLM Call: final_answer\n", + "```\n", + "\n", + "### Prerequisites\n", + "\n", + "Set these in your `.env` file:\n", + "- `LANGFUSE_PUBLIC_KEY`\n", + "- `LANGFUSE_SECRET_KEY`\n", + "- `LANGFUSE_HOST` (optional, defaults to `https://cloud.langfuse.com`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "check-creds", + "metadata": {}, + "outputs": [], + "source": [ + "langfuse_configured = all(\n", + " [\n", + " os.getenv(\"LANGFUSE_PUBLIC_KEY\"),\n", + " os.getenv(\"LANGFUSE_SECRET_KEY\"),\n", + " ]\n", + ")\n", + "\n", + "if langfuse_configured:\n", + " console.print(\"[green]โœ“[/green] Langfuse credentials found\")\n", + "else:\n", + " console.print(\"[yellow]โš [/yellow] Langfuse credentials not found โ€” tracing cells will be skipped\")\n", + " console.print(\"[dim]Set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY in .env[/dim]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "init-tracing", + "metadata": {}, + "outputs": [], + "source": [ + "tracing_enabled = init_tracing()\n", + "\n", + "if tracing_enabled:\n", + " console.print(\"[green]โœ“[/green] 
Langfuse tracing initialized\")\n", + "else:\n", + " console.print(\"[yellow]โš [/yellow] Tracing not enabled (check credentials)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "run-traced", + "metadata": {}, + "outputs": [], + "source": [ + "if tracing_enabled:\n", + " from langfuse import Langfuse\n", + "\n", + " langfuse = Langfuse()\n", + " traced_agent = KnowledgeGroundedAgent(enable_planning=True)\n", + " traced_question = \"What programming language was created by Guido van Rossum, and in what year?\"\n", + "\n", + " console.print(Panel(traced_question, title=\"Traced Question\", border_style=\"green\"))\n", + "\n", + " with langfuse.start_as_current_span(name=\"knowledge-agent\", input=traced_question):\n", + " trace_id = langfuse.get_current_trace_id()\n", + " traced_response = await traced_agent.answer_async(traced_question)\n", + " langfuse.update_current_span(output=traced_response.text)\n", + "\n", + " display_response(\n", + " console,\n", + " traced_response.text,\n", + " subtitle=f\"Duration: {traced_response.total_duration_ms / 1000:.1f}s\",\n", + " )\n", + "else:\n", + " console.print(\"[dim]Skipping (Langfuse not configured)[/dim]\")\n", + " trace_id = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "flush-traces", + "metadata": {}, + "outputs": [], + "source": [ + "if tracing_enabled:\n from IPython.display import HTML, display # noqa: A004\n from langfuse import Langfuse\n from opentelemetry import trace as otel_trace\n\n provider = otel_trace.get_tracer_provider()\n if hasattr(provider, \"force_flush\"):\n provider.force_flush(timeout_millis=5000)\n console.print(\"[green]โœ“[/green] Traces flushed to Langfuse\")\n\n if trace_id:\n trace_url = Langfuse().get_trace_url(trace_id=trace_id)\n display(HTML(f'

<a href=\"{trace_url}\" target=\"_blank\">View trace: {trace_url}</a>

'))" + ] + }, + { + "cell_type": "markdown", + "id": "s4-ui", + "metadata": {}, + "source": [ + "### 4.1 Viewing Traces in the Langfuse UI\n\nOpen your Langfuse project and navigate to **Traces**. Each run appears as a\ntree of spans. Useful things to look at:\n\n- **Span timeline** โ€” which steps take the most time?\n- **Tool call arguments** โ€” what search queries did the agent use?\n- **LLM interactions** โ€” what did the model reason about before calling each tool?\n- **Errors** โ€” red spans show where failures occurred\n\nYou can also filter by trace name, time range, or input content." + ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "In this notebook you learned:\n", + "\n", + "1. **Creating the agent** โ€” `KnowledgeGroundedAgent(enable_planning=True)` with PlanReAct\n", + "2. **Running questions** โ€” `run_with_display` for live notebook progress; `agent.answer_async` for raw access\n", + "3. **The `AgentResponse`** โ€” plan, tool calls, sources, reasoning, and timing in one object\n", + "4. **Multi-turn conversations** โ€” linking turns with `session_id`\n", + "5. **Langfuse tracing** โ€” `init_tracing()` and the Langfuse SDK for full observability\n", + "\n", + "**Next:** In Notebook 03, we'll run a systematic evaluation using the DeepSearchQA benchmark." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "done", + "metadata": {}, + "outputs": [], + "source": [ + "console.print(\n", + " Panel(\n", + " \"[green]โœ“[/green] Notebook complete!\\n\\n\"\n", + " \"[cyan]Next:[/cyan] Open [bold]03_evaluation.ipynb[/bold] to evaluate the agent at scale.\",\n", + " title=\"Done\",\n", + " border_style=\"green\",\n", + " )\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/knowledge_qa/03_evaluation.ipynb b/implementations/knowledge_qa/03_evaluation.ipynb new file mode 100644 index 0000000..c8d6bd3 --- /dev/null +++ b/implementations/knowledge_qa/03_evaluation.ipynb @@ -0,0 +1,482 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# 03: Evaluation\n", + "\n", + "In Notebook 02 we ran individual questions by hand. This notebook evaluates the agent\n", + "systematically: we upload a dataset subset to Langfuse, run the agent on every item, and\n", + "score each response with an LLM-as-judge grader using the official DeepSearchQA methodology.\n", + "\n", + "## What You'll Learn\n", + "\n", + "1. Uploading a DeepSearchQA subset to Langfuse as a persistent dataset\n", + "2. The LLM-as-judge grader: precision, recall, F1, and the four outcome categories\n", + "3. A single-sample evaluation walkthrough\n", + "4. Running the full experiment with `run_experiment`\n", + "5. Inspecting and interpreting item-level results\n", + "\n", + "## Prerequisites\n", + "\n", + "Complete Notebooks 01 and 02. 
You'll need all credentials in `.env`:\n", + "- `GOOGLE_API_KEY`\n", + "- `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`\n", + "- `OPENAI_API_KEY` (for the LLM grader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "import tempfile\n", + "from pathlib import Path\n", + "from typing import Any\n", + "\n", + "import pandas as pd\n", + "from aieng.agent_evals.evaluation import run_experiment\n", + "from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig\n", + "from aieng.agent_evals.knowledge_qa import DeepSearchQADataset, KnowledgeGroundedAgent\n", + "from aieng.agent_evals.knowledge_qa.deepsearchqa_grader import (\n", + " EvaluationOutcome,\n", + " evaluate_deepsearchqa_async,\n", + ")\n", + "from aieng.agent_evals.knowledge_qa.notebook import display_response, run_with_display\n", + "from aieng.agent_evals.langfuse import upload_dataset_to_langfuse\n", + "from dotenv import load_dotenv\n", + "from IPython.display import HTML, display # noqa: A004\n", + "from langfuse.experiment import Evaluation\n", + "from rich.console import Console\n", + "from rich.panel import Panel\n", + "from rich.table import Table\n", + "\n", + "\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Working directory: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"Working directory set to: {Path('').absolute()}\")\n", + "\n", + "load_dotenv(verbose=True)\n", + "console = Console(width=100)\n", + "\n", + "DATASET_NAME = \"DeepSearchQA-Subset\"" + ] + }, + { + "cell_type": "markdown", + "id": "s1-upload-intro", + "metadata": {}, + "source": [ + "## 1. Uploading the Dataset to Langfuse\n", + "\n", + "Langfuse stores our evaluation dataset so we can run multiple experiments against the same items\n", + "and compare results over time. 
Each dataset item has three fields:\n", + "\n", + "- **`input`**: the question (sent to the agent)\n", + "- **`expected_output`**: the ground truth answer (given to the grader, never shown to the agent)\n", + "- **`metadata`**: `category`, `answer_type`, `example_id`\n", + "\n", + "Items are deduplicated by a hash of their content, so running this cell again is safe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upload", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = DeepSearchQADataset()\n", + "examples = dataset.get_by_category(\"Finance & Economics\")[:1]\n", + "\n", + "console.print(f\"Uploading [cyan]{len(examples)}[/cyan] examples to dataset '{DATASET_NAME}'...\")\n", + "\n", + "# Write examples to a temporary JSONL file for the upload utility\n", + "with tempfile.NamedTemporaryFile(mode=\"w\", suffix=\".jsonl\", delete=False, encoding=\"utf-8\") as f:\n", + " for ex in examples:\n", + " record = {\n", + " \"input\": ex.problem,\n", + " \"expected_output\": ex.answer,\n", + " \"metadata\": {\n", + " \"example_id\": ex.example_id,\n", + " \"category\": ex.problem_category,\n", + " \"answer_type\": ex.answer_type,\n", + " },\n", + " }\n", + " f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n", + " temp_path = f.name\n", + "\n", + "await upload_dataset_to_langfuse(dataset_path=temp_path, dataset_name=DATASET_NAME)\n", + "os.unlink(temp_path)\n", + "\n", + "console.print(f\"[green]โœ“[/green] Dataset '{DATASET_NAME}' ready in Langfuse\")" + ] + }, + { + "cell_type": "markdown", + "id": "s2-grader-intro", + "metadata": {}, + "source": [ + "## 2. The DeepSearchQA Grader\n", + "\n", + "The grader is an LLM-as-judge that evaluates answers using the official DeepSearchQA methodology\n", + "from Appendix A of the paper. 
It handles both answer types:\n", + "\n", + "- **Single Answer**: checks whether the response contains the one expected value\n", + "- **Set Answer**: checks which items from the ground truth set appear in the response,\n", + " and flags any extra items the agent included\n", + "\n", + "### Metrics\n", + "\n", + "Let **S** = predicted items, **G** = ground truth items:\n", + "\n", + "| Metric | Formula | Meaning |\n", + "|--------|---------|---------|\n", + "| **Precision** | \\|SโˆฉG\\| / \\|S\\| | Of what the agent said, how much was correct |\n", + "| **Recall** | \\|SโˆฉG\\| / \\|G\\| | Of the ground truth, how much did the agent find |\n", + "| **F1** | 2ยทPยทR / (P+R) | Harmonic mean of precision and recall |\n", + "\n", + "### Outcome Classification\n", + "\n", + "| Outcome | Condition | Interpretation |\n", + "|---------|-----------|----------------|\n", + "| `fully_correct` | S = G | Perfect answer |\n", + "| `correct_with_extraneous` | G โІ S | All correct, but extra items included |\n", + "| `partially_correct` | SโˆฉG โ‰  โˆ… | Some correct items found |\n", + "| `fully_incorrect` | SโˆฉG = โˆ… | No correct items |" + ] + }, + { + "cell_type": "markdown", + "id": "s2-single-sample", + "metadata": {}, + "source": [ + "### 2.1 Single-Sample Walkthrough\n", + "\n", + "Before running at scale, let's walk through one example end-to-end: run the agent,\n", + "then grade its response with the LLM judge." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "pick-example", + "metadata": {}, + "outputs": [], + "source": [ + "# Reproducibly select one Finance & Economics example\n", + "finance_examples = dataset.get_by_category(\"Finance & Economics\")\n", + "example = finance_examples[0]\n", + "\n", + "console.print(\n", + " Panel(\n", + " f\"[bold]ID:[/bold] {example.example_id}\\n\"\n", + " f\"[bold]Category:[/bold] {example.problem_category}\\n\"\n", + " f\"[bold]Answer Type:[/bold] {example.answer_type}\\n\\n\"\n", + " f\"[bold cyan]Question:[/bold cyan]\\n{example.problem}\\n\\n\"\n", + " f\"[bold yellow]Ground Truth:[/bold yellow]\\n{example.answer}\",\n", + " title=\"Evaluation Example\",\n", + " border_style=\"blue\",\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "run-agent", + "metadata": {}, + "outputs": [], + "source": [ + "eval_agent = KnowledgeGroundedAgent(enable_planning=True)\n", + "eval_response = await run_with_display(eval_agent, example.problem)\n", + "\n", + "display_response(\n", + " console,\n", + " eval_response.text,\n", + " title=\"Agent Answer\",\n", + " subtitle=f\"Duration: {eval_response.total_duration_ms / 1000:.1f}s | Tools: {len(eval_response.tool_calls)}\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "grade-response", + "metadata": {}, + "outputs": [], + "source": [ + "console.print(\"[dim]Grading with LLM judge...[/dim]\\n\")\n", + "\n", + "result = await evaluate_deepsearchqa_async(\n", + " question=example.problem,\n", + " answer=eval_response.text,\n", + " ground_truth=example.answer,\n", + " answer_type=example.answer_type,\n", + ")\n", + "\n", + "outcome_color = {\n", + " EvaluationOutcome.FULLY_CORRECT: \"green\",\n", + " EvaluationOutcome.CORRECT_WITH_EXTRANEOUS: \"yellow\",\n", + " EvaluationOutcome.PARTIALLY_CORRECT: \"orange1\",\n", + " EvaluationOutcome.FULLY_INCORRECT: \"red\",\n", + "}.get(result.outcome, \"white\")\n", + 
"\n", + "metrics_table = Table(title=\"Grader Results\")\n", + "metrics_table.add_column(\"Metric\", style=\"cyan\")\n", + "metrics_table.add_column(\"Value\", style=\"white\")\n", + "metrics_table.add_row(\"Outcome\", f\"[{outcome_color}]{result.outcome.value}[/{outcome_color}]\")\n", + "metrics_table.add_row(\"Precision\", f\"{result.precision:.3f}\")\n", + "metrics_table.add_row(\"Recall\", f\"{result.recall:.3f}\")\n", + "metrics_table.add_row(\"F1\", f\"[bold]{result.f1_score:.3f}[/bold]\")\n", + "console.print(metrics_table)\n", + "\n", + "if result.explanation:\n", + " console.print(Panel(result.explanation, title=\"Grader Explanation\", border_style=\"magenta\"))\n", + "\n", + "# Show per-item correctness for Set Answer questions\n", + "if result.correctness_details:\n", + " details_table = Table(title=\"Correctness Details\")\n", + " details_table.add_column(\"Expected Item\", style=\"white\")\n", + " details_table.add_column(\"Found\", style=\"cyan\", justify=\"center\")\n", + " for item, found in result.correctness_details.items():\n", + " icon = \"[green]โœ“[/green]\" if found else \"[red]โœ—[/red]\"\n", + " label = item[:60] + \"...\" if len(item) > 60 else item\n", + " details_table.add_row(label, icon)\n", + " console.print(details_table)" + ] + }, + { + "cell_type": "markdown", + "id": "s3-experiment-intro", + "metadata": {}, + "source": [ + "## 3. Running the Evaluation Experiment\n", + "\n", + "`run_experiment` runs the agent against every item in the Langfuse dataset, scores each\n", + "response, and records results in Langfuse. 
Each call creates a new named experiment run\n", + "that you can compare to previous runs in the UI.\n", + "\n", + "The experiment takes two functions:\n", + "\n", + "- **`agent_task`** โ€” receives a dataset item, runs the agent, returns the answer string\n", + "- **`deepsearchqa_evaluator`** โ€” receives question, answer, and ground truth; returns grader scores\n", + "\n", + "> **Note:** This makes one agent call and one grader call per item. Each item can\n", + "> take several minutes with `max_concurrency=1`, depending on model latency." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "define-task-evaluator", + "metadata": {}, + "outputs": [], + "source": [ + "async def agent_task(*, item: Any, **kwargs: Any) -> str:\n", + " \"\"\"Run the Knowledge Agent on a Langfuse dataset item.\"\"\"\n", + " agent = KnowledgeGroundedAgent(enable_planning=True)\n", + " response = await agent.answer_async(item.input)\n", + " return response.text\n", + "\n", + "\n", + "async def deepsearchqa_evaluator(\n", + " *,\n", + " input: str,  # noqa: A002\n", + " output: str,\n", + " expected_output: str,\n", + " metadata: dict[str, Any] | None = None,\n", + " **kwargs: Any,\n", + ") -> list[Evaluation]:\n", + " \"\"\"LLM-as-judge grader using DeepSearchQA methodology.\"\"\"\n", + " answer_type = (metadata or {}).get(\"answer_type\", \"Set Answer\")\n", + " result = await evaluate_deepsearchqa_async(\n", + " question=input,\n", + " answer=output,\n", + " ground_truth=expected_output,\n", + " answer_type=answer_type,\n", + " model_config=LLMRequestConfig(temperature=0.0),\n", + " )\n", + " return result.to_evaluations()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "run-experiment", + "metadata": {}, + "outputs": [], + "source": [ + "experiment_result = run_experiment(\n", + " DATASET_NAME,\n", + " name=\"knowledge-agent-baseline\",\n", + " task=agent_task,\n", + " evaluators=[deepsearchqa_evaluator],\n", + " description=\"Baseline 
Knowledge Agent on Finance & Economics questions.\",\n", + " max_concurrency=1,\n", + ")\n", + "\n", + "console.print(\"[green]โœ“[/green] Experiment complete\")\n", + "if experiment_result.dataset_run_url:\n", + " display(\n", + " HTML(\n", + " f'

View experiment: {experiment_result.dataset_run_url}

'\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "s4-results-intro", + "metadata": {}, + "source": [ + "## 4. Inspecting Results\n", + "\n", + "The `ExperimentResult` object gives programmatic access to every item-level score.\n", + "Aggregate metrics are visible in the Langfuse experiment run summary in the UI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "item-results", + "metadata": {}, + "outputs": [], + "source": [ + "rows = []\n", + "for item_result in experiment_result.item_results:\n", + " item = item_result.item\n", + " question = str(item.input)\n", + " row = {\n", + " \"question\": question[:55] + \"...\" if len(question) > 55 else question,\n", + " \"answer_type\": (item.metadata or {}).get(\"answer_type\", \"\"),\n", + " }\n", + " for evaluation in item_result.evaluations or []:\n", + " row[evaluation.name] = evaluation.value\n", + " rows.append(row)\n", + "\n", + "df = pd.DataFrame(rows)\n", + "print(df.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aggregate-scores", + "metadata": {}, + "outputs": [], + "source": [ + "# Mean of numeric metrics\n", + "numeric_cols = [c for c in [\"F1\", \"Precision\", \"Recall\"] if c in df.columns]\n", + "if numeric_cols:\n", + " means_table = Table(title=\"Mean Scores\")\n", + " means_table.add_column(\"Metric\", style=\"cyan\")\n", + " means_table.add_column(\"Mean\", style=\"white\")\n", + " for col in numeric_cols:\n", + " means_table.add_row(col, f\"{df[col].mean():.3f}\")\n", + " console.print(means_table)\n", + "\n", + "# Outcome distribution\n", + "if \"Outcome\" in df.columns:\n", + " outcome_table = Table(title=\"Outcome Distribution\")\n", + " outcome_table.add_column(\"Outcome\", style=\"cyan\")\n", + " outcome_table.add_column(\"Count\", style=\"white\", justify=\"right\")\n", + " outcome_table.add_column(\"Fraction\", style=\"dim\", justify=\"right\")\n", + " total = len(df)\n", + " for outcome, count in 
df[\"Outcome\"].value_counts().items():\n", + " outcome_table.add_row(str(outcome), str(count), f\"{count / total:.0%}\")\n", + " console.print(outcome_table)" + ] + }, + { + "cell_type": "markdown", + "id": "s5-iteration", + "metadata": {}, + "source": [ + "## 5. Iterating on the Agent\n", + "\n", + "The dataset in Langfuse is persistent โ€” you don't need to re-upload it. To evaluate a modified\n", + "agent, call `run_experiment` again with a new `name` argument. Langfuse will create a new\n", + "experiment run and you can compare runs side-by-side in the UI.\n", + "\n", + "### Levers to Explore\n", + "\n", + "- **System prompt** โ€” edit `SYSTEM_INSTRUCTIONS_TEMPLATE` in `system_instructions.py` to change\n", + " the search strategy, verification rules, or final answer format\n", + "- **Planning** โ€” toggle `enable_planning=False` to skip PlanReAct and compare quality vs. speed\n", + "- **Model** โ€” change the Gemini model in `KnowledgeGroundedAgent` for different capability/cost trade-offs\n", + "- **Dataset** โ€” change the `category` filter in Section 1 or increase `samples` to cover more examples\n", + "\n", + "### What to Look for in Langfuse\n", + "\n", + "- Items with **low F1** โ€” did the agent fail to fetch the source? Stop early? Misread the question?\n", + "- Items with **`correct_with_extraneous`** โ€” is the agent over-generating? Can the prompt be tightened?\n", + "- **Latency outliers** โ€” which steps are slow? Is replanning happening unnecessarily?" + ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "In this notebook you:\n", + "\n", + "1. **Uploaded** a DeepSearchQA subset to Langfuse as a persistent, reusable dataset\n", + "2. **Understood** the LLM-as-judge grader: precision, recall, F1, and the four outcome categories\n", + "3. **Walked through** a single-sample evaluation end-to-end\n", + "4. 
**Ran** a full experiment with `run_experiment` and inspected item-level scores\n", + "5. **Learned** how to iterate: re-run with a new experiment name to compare configurations in Langfuse\n", + "\n", + "The evaluation pipeline is the foundation for systematic agent improvement โ€” each iteration\n", + "produces a new experiment run that you can compare to the baseline in the Langfuse UI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "done", + "metadata": {}, + "outputs": [], + "source": [ + "console.print(Panel(\"[green]โœ“[/green] Notebook complete!\", title=\"Done\", border_style=\"green\"))\n", + "if experiment_result.dataset_run_url:\n", + " display(\n", + " HTML(\n", + " f'

View experiment results: {experiment_result.dataset_run_url}

'\n", + " )\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/knowledge_qa/03_multi_turn.ipynb b/implementations/knowledge_qa/03_multi_turn.ipynb deleted file mode 100644 index a0e3909..0000000 --- a/implementations/knowledge_qa/03_multi_turn.ipynb +++ /dev/null @@ -1,312 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": "# 03: Multi-Turn Conversations & Evaluation\n\nThis notebook demonstrates multi-turn conversation capabilities and\nhow to evaluate the agent on the DeepSearchQA benchmark.\n\n## Learning Objectives\n\n- Understand how ADK manages multi-turn conversations via sessions\n- Use the `DeepSearchQAEvaluator` for systematic evaluation\n- Analyze evaluation results with rich visualizations\n- Understand evaluation metrics for research agents" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Setup: Load environment and configure rich console\n", - "import uuid\n", - "\n", - "from aieng.agent_evals import (\n", - " create_console,\n", - " display_evaluation_result,\n", - " display_metrics_table,\n", - " display_success,\n", - ")\n", - "from aieng.agent_evals.knowledge_qa import (\n", - " DeepSearchQADataset,\n", - " DeepSearchQAEvaluator,\n", - " KnowledgeGroundedAgent,\n", - ")\n", - "from dotenv import load_dotenv\n", - "from rich.panel import Panel\n", - "from rich.table import Table\n", - "\n", - "\n", - "console = create_console()\n", - "load_dotenv(verbose=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": 
"## 1. Multi-Turn Conversations with ADK\n\nThe `KnowledgeGroundedAgent` uses Google ADK's built-in session management via `InMemorySessionService`.\nWhen you pass a `session_id` to `answer_async()`, ADK maintains conversation history automatically.\n\nKey points:\n- Each unique `session_id` creates a separate conversation thread\n- ADK tracks all messages, tool calls, and context within that session\n- No manual history tracking needed - ADK handles it internally" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create agent and demonstrate multi-turn conversation\n", - "agent = KnowledgeGroundedAgent()\n", - "\n", - "# Create a session ID for multi-turn conversation\n", - "session_id = str(uuid.uuid4())\n", - "\n", - "console.print(\n", - " Panel(\n", - " f\"[cyan]Session ID:[/cyan] {session_id}\\n\\nADK will track conversation history for this session automatically.\",\n", - " title=\"๐Ÿ—จ๏ธ New Session Created\",\n", - " border_style=\"green\",\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# First turn - ask a question\n", - "response1 = await agent.answer_async(\"What is the capital of France?\", session_id=session_id)\n", - "console.print(Panel(response1.text, title=\"Turn 1: Capital of France\", border_style=\"blue\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Second turn - follow-up question (ADK remembers the context)\n", - "response2 = await agent.answer_async(\"What is its population?\", session_id=session_id)\n", - "console.print(Panel(response2.text, title=\"Turn 2: Population (follow-up)\", border_style=\"blue\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": "## 2. 
Session Management in Applications\n\nFor web applications (like Gradio), you can store a session ID in the app's state:\n\n```python\n# In a Gradio app handler:\nif \"session_id\" not in session_state:\n session_state[\"session_id\"] = str(uuid.uuid4())\n\nresponse = await agent.answer_async(query, session_id=session_state[\"session_id\"])\n```\n\nSee `gradio_app.py` for a complete example." - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# For more details on ADK sessions, see:\n", - "# https://google.github.io/adk-docs/sessions/\n", - "\n", - "display_success(\"Multi-turn conversation demo complete!\", console=console)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Running DeepSearchQA Evaluation\n", - "\n", - "The `DeepSearchQAEvaluator` provides a systematic way to evaluate the agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create evaluator using the existing agent\n", - "evaluator = DeepSearchQAEvaluator(agent)\n", - "\n", - "display_success(f\"Dataset size: {len(evaluator.dataset)} examples\", console=console)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Evaluate a small sample\n", - "console.print(\"[bold]๐Ÿ”ฌ Running evaluation on 3 examples...[/bold]\\n\")\n", - "\n", - "console.print(\"[dim]Evaluating...[/dim]\")\n", - "results = await evaluator.evaluate_sample_async(n=3, random_state=42)\n", - "\n", - "display_success(f\"Completed {len(results)} evaluations\", console=console)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# View results using the display utility\n", - "console.print(\"\\n[bold]๐Ÿ“‹ Evaluation Results[/bold]\\n\")\n", - "\n", - "for result in results:\n", - " contains_answer = result.ground_truth.lower() in 
result.prediction.lower()\n", - " display_evaluation_result(\n", - " example_id=result.example_id,\n", - " problem=result.problem,\n", - " ground_truth=result.ground_truth,\n", - " prediction=result.prediction,\n", - " sources_used=result.sources_used,\n", - " search_queries=result.search_queries,\n", - " contains_answer=contains_answer,\n", - " console=console,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Analyzing Evaluation Results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Convert to DataFrame for analysis\n", - "df = evaluator.results_to_dataframe(results)\n", - "\n", - "# Calculate metrics\n", - "containment_correct = sum(1 for r in results if r.ground_truth.lower() in r.prediction.lower())\n", - "containment_accuracy = containment_correct / len(results) * 100\n", - "\n", - "metrics = {\n", - " \"Total Examples\": len(results),\n", - " \"Containment Accuracy\": f\"{containment_accuracy:.1f}%\",\n", - " \"Avg Sources Used\": df[\"sources_used\"].mean(),\n", - " \"Avg Search Queries\": df[\"search_queries\"].apply(len).mean(),\n", - "}\n", - "\n", - "display_metrics_table(metrics, title=\"Evaluation Metrics\", console=console)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Understanding Evaluation Metrics\n", - "\n", - "For research agents, we care about:\n", - "\n", - "1. **Answer Correctness**: Does the prediction match the ground truth?\n", - "2. **Source Quality**: Are the sources relevant and authoritative?\n", - "3. **Comprehensiveness**: Did the agent find all necessary information?\n", - "4. 
**Search Efficiency**: How many searches were needed?\n", - "\n", - "DeepSearchQA specifically measures:\n", - "- **Precision**: Quality of the answer\n", - "- **Recall**: Completeness of the answer (for list-type questions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Manual correctness check with better display\n", - "def check_answer_contains_ground_truth(prediction: str, ground_truth: str) -> bool:\n", - " \"\"\"Check if prediction contains the ground truth answer.\"\"\"\n", - " return ground_truth.lower() in prediction.lower()\n", - "\n", - "\n", - "# Check our results\n", - "console.print(\"\\n[bold]๐Ÿ“Š Correctness Check[/bold]\\n\")\n", - "\n", - "result_table = Table(show_header=True, header_style=\"bold cyan\")\n", - "result_table.add_column(\"Example\", style=\"cyan\")\n", - "result_table.add_column(\"Status\", style=\"white\")\n", - "result_table.add_column(\"Expected\", style=\"dim\")\n", - "\n", - "for result in results:\n", - " contains = check_answer_contains_ground_truth(result.prediction, result.ground_truth)\n", - " status = \"[green]โœ“ MATCH[/green]\" if contains else \"[yellow]โœ— NO MATCH[/yellow]\"\n", - " result_table.add_row(\n", - " str(result.example_id),\n", - " status,\n", - " result.ground_truth[:40] + \"...\" if len(result.ground_truth) > 40 else result.ground_truth,\n", - " )\n", - "\n", - "console.print(result_table)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. 
Exploring Categories" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get examples from a specific category\n", - "dataset = DeepSearchQADataset()\n", - "categories = dataset.get_categories()\n", - "\n", - "cat_table = Table(title=\"๐Ÿ“ Available Categories\", show_header=True, header_style=\"bold green\")\n", - "cat_table.add_column(\"Category\", style=\"white\")\n", - "cat_table.add_column(\"Count\", style=\"cyan\", justify=\"right\")\n", - "\n", - "for cat in sorted(categories):\n", - " count = len(dataset.get_by_category(cat))\n", - " cat_table.add_row(cat, str(count))\n", - "\n", - "console.print(cat_table)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": "## Summary\n\nIn this notebook, you learned:\n\n1. How ADK manages multi-turn conversations via `InMemorySessionService`\n2. How to use `session_id` for conversation continuity\n3. How to run systematic evaluations with `DeepSearchQAEvaluator`\n4. How to analyze evaluation results with rich visualizations\n5. 
Key metrics for evaluating research agents\n\n## Next Steps\n\n- Run the Gradio app for interactive testing\n- Experiment with different models (gemini-2.5-pro vs flash)\n- Try the async evaluator for larger-scale evaluation\n- Implement LLM-as-judge evaluation for more nuanced correctness checking" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "console.print(\n", - " Panel(\n", - " \"[green]โœ“[/green] Notebook complete!\\n\\n\"\n", - " \"[cyan]Next:[/cyan] Run [bold]gradio_app.py[/bold] for interactive testing.\",\n", - " title=\"๐ŸŽ‰ Done\",\n", - " border_style=\"green\",\n", - " )\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/implementations/knowledge_qa/README.md b/implementations/knowledge_qa/README.md index d796e37..d0617bd 100644 --- a/implementations/knowledge_qa/README.md +++ b/implementations/knowledge_qa/README.md @@ -1,19 +1,18 @@ # Knowledge-Grounded QA Agent -This implementation demonstrates a knowledge-grounded question answering agent using **Google ADK** with explicit **Google Search tool calls**, evaluated on the **DeepSearchQA** benchmark. +This implementation demonstrates a knowledge-grounded question answering agent using **Google ADK** with a **PlanReAct** architecture, evaluated on the **DeepSearchQA** benchmark. ## Overview -The knowledge agent uses a ReAct (Reasoning + Acting) architecture powered by Google ADK. It explicitly calls Google Search as a tool, making the reasoning process transparent through observable Thought โ†’ Action โ†’ Observation cycles. 
This approach searches the live web to find relevant information for questions requiring real-time data. +The agent combines two patterns: **PlanReAct** (creates an explicit numbered research plan before executing) and a **ReAct loop** within each step (Thought โ†’ Tool Call โ†’ Observation). It searches the live web to find and verify facts rather than relying on training data. ## Features -- **ReAct Architecture**: Explicit tool calls with traceable reasoning (Thought โ†’ Action โ†’ Observation) -- **Google Search Tool**: Uses ADK's `GoogleSearchTool` for real-time web search -- **Source Citation**: Automatically extracts and includes source URLs from search results -- **DeepSearchQA Evaluation**: Built-in evaluation on the DeepSearchQA benchmark (900 research tasks) +- **PlanReAct Architecture**: Explicit research plan with step statuses, revised mid-run if needed +- **Five Tools**: `google_search`, `web_fetch`, `fetch_file`, `grep_file`, `read_file` +- **Source Citation**: Extracts and cites source URLs from search results +- **DeepSearchQA Evaluation**: LLM-as-judge evaluation on the DeepSearchQA benchmark (896 questions) - **Multi-turn Conversations**: Session management via ADK's `InMemorySessionService` -- **Gradio Interface**: Interactive chat UI for testing ## Setup @@ -36,14 +35,6 @@ uv sync ## Usage -### Interactive Chat - -Run the Gradio app: - -```bash -uv run --env-file .env gradio implementations/knowledge_qa/gradio_app.py -``` - ### Programmatic Usage ```python @@ -64,43 +55,66 @@ print(f"Tool calls: {response.tool_calls}") ### Evaluation on DeepSearchQA -```python -from aieng.agent_evals.knowledge_qa import ( - KnowledgeGroundedAgent, - DeepSearchQAEvaluator, -) +Use the main evaluation script to run comprehensive evaluations: -agent = KnowledgeGroundedAgent() -evaluator = DeepSearchQAEvaluator(agent) +```bash +# Run evaluation on 3 samples +python implementations/knowledge_qa/evaluate.py --samples 3 -# Evaluate a sample (use await in Jupyter) 
-results = await evaluator.evaluate_sample_async(n=10, random_state=42) +# Run with specific example IDs +python implementations/knowledge_qa/evaluate.py --ids 123 456 789 -# Convert to DataFrame for analysis -df = evaluator.results_to_dataframe(results) -print(df[["example_id", "ground_truth", "prediction", "sources_used"]]) +# Enable trace groundedness evaluation +ENABLE_TRACE_GROUNDEDNESS=true python implementations/knowledge_qa/evaluate.py +``` + +Or use the CLI: + +```bash +# Run evaluation via CLI +uv run --env-file .env knowledge-qa eval --samples 3 +uv run --env-file .env knowledge-qa eval --ids 123 456 --show-plan +``` + +## Run with ADK Web UI + +To inspect the agent interactively, the module exposes a top-level `root_agent` for ADK discovery. + +```bash +uv run adk web --port 8000 --reload --reload_agents implementations/ ``` ## Notebooks -1. **01_grounding_basics.ipynb**: Introduction to the knowledge agent and Google Search tool -2. **02_agent_basics.ipynb**: Creating agents with custom instructions -3. **03_multi_turn.ipynb**: Multi-turn conversations and DeepSearchQA evaluation +1. **01_dataset_and_tools.ipynb**: The DeepSearchQA dataset and the agent's five tools +2. **02_running_the_agent.ipynb**: PlanReAct architecture, live progress display, multi-turn conversations, and Langfuse tracing +3. 
**03_evaluation.ipynb**: Systematic evaluation with `run_experiment`, LLM-as-judge grading, and result inspection ## Architecture ``` aieng.agent_evals.knowledge_qa/ -โ”œโ”€โ”€ config.py # Configuration (Pydantic settings) -โ”œโ”€โ”€ grounding_tool.py # GoogleSearchTool wrapper and response models -โ”œโ”€โ”€ agent.py # KnowledgeGroundedAgent (ADK Agent + Runner) -โ”œโ”€โ”€ session.py # Conversation session management -โ””โ”€โ”€ evaluation.py # DeepSearchQA dataset and evaluator +โ”œโ”€โ”€ agent.py # KnowledgeGroundedAgent (ADK Agent + Runner) +โ”œโ”€โ”€ data/ # DeepSearchQA dataset loader +โ”œโ”€โ”€ deepsearchqa_grader.py # LLM-as-judge evaluation +โ”œโ”€โ”€ planner.py # Research planning +โ”œโ”€โ”€ token_tracker.py # Token usage tracking +โ””โ”€โ”€ cli.py # Rich CLI interface + +aieng.agent_evals/ +โ”œโ”€โ”€ configs.py # Configuration (Pydantic settings) +โ”œโ”€โ”€ evaluation/ # Evaluation harness +โ”‚ โ”œโ”€โ”€ experiment.py # Langfuse experiment runner +โ”‚ โ””โ”€โ”€ graders/ # Evaluators (trace groundedness, etc.) +โ””โ”€โ”€ tools/ # Shared tools + โ”œโ”€โ”€ search.py # GoogleSearchTool wrapper + โ”œโ”€โ”€ web.py # web_fetch for HTML/PDF + โ””โ”€โ”€ file.py # fetch_file, grep_file, read_file ``` ## DeepSearchQA Dataset -The [DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) benchmark consists of 900 "causal chain" research tasks across 17 categories. These questions require: +The [DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) benchmark consists of 896 "causal chain" research tasks across 17 categories. These questions require: - Multi-source lookups - Statistical comparisons diff --git a/implementations/knowledge_qa/agent.py b/implementations/knowledge_qa/agent.py new file mode 100644 index 0000000..d386b0c --- /dev/null +++ b/implementations/knowledge_qa/agent.py @@ -0,0 +1,21 @@ +"""ADK discovery entrypoint for the Knowledge QA agent. + +Exposes a module-level ``root_agent`` so ``adk web`` can discover it. 
+ +Examples +-------- +Run with ``adk web``: + uv run adk web --port 8000 --reload --reload_agents implementations/ +""" + +import logging + +from aieng.agent_evals.knowledge_qa.agent import KnowledgeGroundedAgent + + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + + +# ADK discovery expects a module-level `root_agent` +root_agent = KnowledgeGroundedAgent().adk_agent