diff --git a/aieng-eval-agents/aieng/agent_evals/configs.py b/aieng-eval-agents/aieng/agent_evals/configs.py
index 390847b..61f7542 100644
--- a/aieng-eval-agents/aieng/agent_evals/configs.py
+++ b/aieng-eval-agents/aieng/agent_evals/configs.py
@@ -96,11 +96,6 @@ class Configs(BaseSettings):
validation_alias=AliasChoices("OPENAI_API_KEY", "GEMINI_API_KEY", "GOOGLE_API_KEY"),
description="API key for OpenAI-compatible API (accepts OPENAI_API_KEY, GEMINI_API_KEY, or GOOGLE_API_KEY).",
)
- gemini_api_key: SecretStr = Field(
- default=SecretStr("default-gemini-api-key"), # setting a default so some implementations can run without it
- validation_alias=AliasChoices("GEMINI_API_KEY", "GOOGLE_API_KEY"),
- description="API key for Google/Gemini API (accepts GEMINI_API_KEY, or GOOGLE_API_KEY).",
- )
default_planner_model: str = Field(
default="gemini-2.5-pro",
description="Model name for planning/complex reasoning tasks.",
diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py
index 0c02325..bc9ba87 100644
--- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py
+++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py
@@ -7,6 +7,7 @@
import asyncio
import logging
+import os
import time
import uuid
import warnings
@@ -235,6 +236,10 @@ def __init__(
if thinking_budget > 0 and self._supports_thinking(self.model):
thinking_config = types.ThinkingConfig(thinking_budget=thinking_budget)
+ # Google ADK reads GOOGLE_API_KEY from the environment directly.
+ # Bridge from OPENAI_API_KEY (or GEMINI_API_KEY) if not already set.
+ os.environ.setdefault("GOOGLE_API_KEY", config.openai_api_key.get_secret_value())
+
self._agent = Agent(
name="knowledge_qa",
model=self.model,
@@ -345,6 +350,11 @@ def reset(self) -> None:
)
logger.debug("Agent state reset for new question")
+ @property
+ def adk_agent(self) -> Agent:
+ """Return the underlying ADK agent, e.g. for use with ``adk web``."""
+ return self._agent
+
@property
def current_plan(self) -> ResearchPlan | None:
"""Get the current research plan if one exists."""
diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py
new file mode 100644
index 0000000..627f173
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py
@@ -0,0 +1,339 @@
+"""Notebook display utilities for the Knowledge Agent.
+
+Provides live progress display for Jupyter notebooks, showing plan status
+and tool calls while the agent works, and formatted rendering of agent responses.
+
+Example
+-------
+>>> from aieng.agent_evals.knowledge_qa import KnowledgeGroundedAgent
+>>> from aieng.agent_evals.knowledge_qa.notebook import (
+... display_response,
+... run_with_display,
+... )
+>>> from rich.console import Console
+>>> agent = KnowledgeGroundedAgent(enable_planning=True)
+>>> console = Console()
+>>> response = await run_with_display(agent, "What is quantum computing?")
+>>> display_response(console, response.text)
+"""
+
+import asyncio
+import logging
+import re
+from typing import TYPE_CHECKING
+
+from IPython.display import HTML, clear_output, display
+from rich.console import Console
+from rich.markdown import Markdown
+from rich.panel import Panel
+
+from .plan_parsing import StepStatus
+
+
+if TYPE_CHECKING:
+ from .agent import AgentResponse, KnowledgeGroundedAgent
+ from .plan_parsing import ResearchPlan
+
+
+class ToolCallCapture(logging.Handler):
+ """Captures tool calls from agent logs for display."""
+
+ def __init__(self):
+ super().__init__()
+ self.tool_calls: list[dict] = []
+
+ def emit(self, record):
+ """Capture tool call and response log messages."""
+ msg = record.getMessage()
+ if "Tool call:" in msg:
+ try:
+ parts = msg.split("Tool call: ", 1)[1]
+ paren_idx = parts.find("(")
+ if paren_idx > 0:
+ tool_name = parts[:paren_idx]
+ args_str = parts[paren_idx + 1 : -1]
+ if len(args_str) > 60:
+ args_str = args_str[:57] + "..."
+ self.tool_calls.append({"name": tool_name, "args": args_str, "completed": False})
+ except Exception:
+ pass
+ elif "Tool response:" in msg:
+ try:
+ parts = msg.split("Tool response: ", 1)[1]
+ tool_name = parts.split(" ")[0]
+ for tc in reversed(self.tool_calls):
+ if tc["name"] == tool_name and not tc["completed"]:
+ tc["completed"] = True
+ break
+ except Exception:
+ pass
+
+
+def _format_plan_html(plan: "ResearchPlan") -> str:
+ """Format the research plan as HTML."""
+    lines = ['<div style="font-family: sans-serif; margin: 8px 0;">']
+    lines.append('<div style="font-weight: bold; margin-bottom: 4px;">📋 Research Plan</div>')
+
+ for step in plan.steps:
+ if step.status == StepStatus.COMPLETED:
+            icon, color = "✓", "#28a745"
+        elif step.status == StepStatus.FAILED:
+            icon, color = "✗", "#dc3545"
+        elif step.status == StepStatus.IN_PROGRESS:
+            icon, color = "▶", "#ffc107"
+        elif step.status == StepStatus.SKIPPED:
+            icon, color = "⊘", "#6c757d"
+        else:
+            icon, color = "○", "#adb5bd"
+
+        lines.append(f'<div style="color: {color};">{icon} {step.step_id}. {step.description}</div>')
+
+    lines.append("</div>")
+ return "\n".join(lines)
+
+
+def _format_tools_html(tool_calls: list[dict]) -> str:
+ """Format tool calls as HTML."""
+ if not tool_calls:
+        return '<div style="color: #6c757d;">Waiting for tool calls...</div>'
+
+    lines = [
+        '<div style="font-family: sans-serif; margin: 8px 0;">'
+    ]
+    lines.append(f'<div style="font-weight: bold; margin-bottom: 4px;">🔧 Tool Calls ({len(tool_calls)})</div>')
+
+    # Show last 8 tool calls
+    display_calls = tool_calls[-8:]
+    if len(tool_calls) > 8:
+        lines.append(f'<div style="color: #6c757d;">... ({len(tool_calls) - 8} earlier calls)</div>')
+
+ tool_icons = {
+ "google_search": "๐",
+ "google_search_agent": "๐",
+ "fetch_url": "๐",
+ "web_fetch": "๐",
+ "read_pdf": "๐",
+ "grep_file": "๐",
+ "read_file": "๐",
+ }
+
+ for tc in display_calls:
+ name = tc["name"]
+ if name == "google_search_agent":
+ name = "google_search"
+        icon = tool_icons.get(name, "🔧")
+        status_icon = "✓" if tc.get("completed") else "…"
+ status_color = "#28a745" if tc.get("completed") else "#ffc107"
+
+ lines.append(
+            f'<div style="margin: 2px 0;">'
+            f'<span style="color: {status_color};">{status_icon}</span> '
+            f"{icon} <b>{name}</b> "
+            f'<span style="color: #6c757d;">{tc["args"]}</span>'
+            f"</div>"
+ )
+
+ lines.append("
")
+ return "\n".join(lines)
+
+
+def _format_display_html(plan: "ResearchPlan | None", tool_calls: list[dict], question: str) -> str:
+ """Create the full HTML display."""
+    html = ['<div style="font-family: sans-serif;">']
+
+ # Question
+ html.append(
+        f'<div style="margin-bottom: 8px;">'
+        f"<b>Question:</b> {question}</div>"
+ )
+
+ # Plan
+ if plan and plan.steps:
+ html.append(_format_plan_html(plan))
+
+ # Tools
+ html.append(_format_tools_html(tool_calls))
+
+ html.append("
")
+ return "\n".join(html)
+
+
+def _parse_response_sections(text: str) -> tuple[str, list[str], str]:
+ """Extract answer, sources, and reasoning from structured agent response text.
+
+ The agent formats its final response as::
+
+        ANSWER: <answer text>
+        SOURCES: <one or more URLs>
+        REASONING: <short explanation>
+
+ Parameters
+ ----------
+ text : str
+ Raw response text from the agent.
+
+ Returns
+ -------
+ tuple[str, list[str], str]
+ ``(answer, sources, reasoning)`` where *sources* is a list of URLs.
+ If the text does not contain the expected sections, the full text is
+ returned as the answer with empty sources and reasoning.
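+
+    Example
+    -------
+    A minimal case with only an ``ANSWER`` section (sources and reasoning
+    default to empty):
+
+    >>> _parse_response_sections("ANSWER: 42")
+    ('42', [], '')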
+ """
+ answer_match = re.search(r"ANSWER:\s*(.*?)(?=\n\s*SOURCES:|\n\s*REASONING:|$)", text, re.DOTALL | re.IGNORECASE)
+ sources_match = re.search(r"SOURCES:\s*(.*?)(?=\n\s*ANSWER:|\n\s*REASONING:|$)", text, re.DOTALL | re.IGNORECASE)
+ reasoning_match = re.search(r"REASONING:\s*(.*?)(?=\n\s*ANSWER:|\n\s*SOURCES:|$)", text, re.DOTALL | re.IGNORECASE)
+
+ answer = answer_match.group(1).strip() if answer_match else text
+ sources_raw = sources_match.group(1).strip() if sources_match else ""
+ reasoning = reasoning_match.group(1).strip() if reasoning_match else ""
+
+ # Sources may be newline- or comma-separated URLs
+ sources = [s.strip() for s in re.split(r"[\n,]+", sources_raw) if s.strip().startswith("http")]
+
+ return answer, sources, reasoning
+
+
+def display_response(
+ console: Console,
+ text: str,
+ title: str = "Answer",
+ subtitle: str | None = None,
+) -> None:
+ """Display a structured agent response with separated, styled sections.
+
+ Parses the ``ANSWER`` / ``SOURCES`` / ``REASONING`` structure from the
+ agent's final response text and renders each section with appropriate Rich
+ styling: the answer in a cyan panel, sources in a dimmed panel, and
+ reasoning in a muted panel.
+
+ Parameters
+ ----------
+ console : Console
+ Rich console to render to.
+ text : str
+ Raw response text from the agent.
+ title : str, optional
+ Panel title for the answer section (default ``"Answer"``).
+ subtitle : str, optional
+ Panel subtitle, e.g. duration and tool-call count.
+
+ Example
+ -------
+ >>> duration = f"{response.total_duration_ms / 1000:.1f}s"
+ >>> display_response(console, response.text, subtitle=duration)
+ """
+ answer, sources, reasoning = _parse_response_sections(text)
+
+ console.print(Panel(Markdown(answer), title=title, border_style="cyan", subtitle=subtitle))
+
+ if sources:
+ src_lines = "\n".join(f" [blue]{src}[/blue]" for src in sources[:6])
+ console.print(Panel(src_lines, title="Sources", border_style="dim", padding=(0, 1)))
+
+ if reasoning:
+ console.print(Panel(Markdown(reasoning), title="[dim]Reasoning[/dim]", border_style="dim", padding=(0, 1)))
+
+
+async def run_with_display(
+ agent: "KnowledgeGroundedAgent",
+ question: str,
+ refresh_rate: float = 0.5,
+) -> "AgentResponse":
+ """Run the agent with live progress display in a Jupyter notebook.
+
+ Shows the research plan checklist and tool calls while the agent works,
+ updating the display periodically.
+
+ Parameters
+ ----------
+ agent : KnowledgeGroundedAgent
+ The agent to run.
+ question : str
+ The question to answer.
+ refresh_rate : float
+ How often to update the display in seconds (default 0.5).
+
+ Returns
+ -------
+ AgentResponse
+ The agent's response.
+
+ Example
+ -------
+ >>> agent = KnowledgeGroundedAgent(enable_planning=True)
+ >>> response = await run_with_display(agent, "What is quantum computing?")
+ >>> print(response.text)
+ """
+ # Suppress verbose logging from external libraries (same as CLI)
+ verbose_loggers = ["google.adk", "google.genai", "httpx", "httpcore"]
+ original_levels = {}
+ for name in verbose_loggers:
+ _logger = logging.getLogger(name)
+ original_levels[name] = _logger.level
+ _logger.setLevel(logging.ERROR)
+ _logger.propagate = False
+
+ # Set up tool call capture on the agent logger (same as CLI)
+ tool_capture = ToolCallCapture()
+ tool_capture.setLevel(logging.INFO)
+ agent_logger = logging.getLogger("aieng.agent_evals.knowledge_qa.agent")
+ original_agent_level = agent_logger.level
+ original_handlers = agent_logger.handlers.copy()
+ agent_logger.handlers.clear()
+ agent_logger.addHandler(tool_capture)
+ agent_logger.setLevel(logging.INFO)
+ agent_logger.propagate = False
+
+ try:
+ # Create the plan first if planning is enabled
+ if agent.enable_planning and hasattr(agent, "create_plan_async"):
+ clear_output(wait=True)
+            display(HTML('<div style="color: #6c757d;">Creating research plan...</div>'))
+ await agent.create_plan_async(question)
+
+ # Start the agent task
+ task = asyncio.create_task(agent.answer_async(question))
+
+ # Update display while agent works
+ while not task.done():
+ clear_output(wait=True)
+ display(
+ HTML(
+ _format_display_html(
+ plan=agent.current_plan if hasattr(agent, "current_plan") else None,
+ tool_calls=tool_capture.tool_calls,
+ question=question,
+ )
+ )
+ )
+ await asyncio.sleep(refresh_rate)
+
+ # Get the result
+ response = await task
+
+ # Final display with completion status
+ clear_output(wait=True)
+ display(
+ HTML(
+ _format_display_html(
+ plan=agent.current_plan if hasattr(agent, "current_plan") else None,
+ tool_calls=tool_capture.tool_calls,
+ question=question,
+ )
+                + f'<div style="color: #28a745; font-weight: bold; margin-top: 8px;">'
+                f"✓ Complete in {response.total_duration_ms / 1000:.1f}s | "
+                f"{len(response.tool_calls)} tool calls | "
+                f"{len(response.sources)} sources</div>"
+ )
+ )
+
+ return response
+
+ finally:
+ # Clean up logging - restore original state
+ agent_logger.removeHandler(tool_capture)
+ agent_logger.handlers = original_handlers
+ agent_logger.setLevel(original_agent_level)
+ agent_logger.propagate = True
+
+ # Restore verbose logger levels
+ for name, level in original_levels.items():
+ logging.getLogger(name).setLevel(level)
diff --git a/aieng-eval-agents/aieng/agent_evals/logging_config.py b/aieng-eval-agents/aieng/agent_evals/logging_config.py
new file mode 100644
index 0000000..b9bb75a
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/logging_config.py
@@ -0,0 +1,87 @@
+"""Logging configuration with colors and clean output.
+
+This module provides a clean, colored logging setup for agent evaluations
+using the rich library. It reuses the console infrastructure from display.py
+for consistent styling across the codebase.
+"""
+
+import logging
+
+from rich.logging import RichHandler
+
+from .display import create_console
+
+
+def setup_logging(
+ level: int = logging.INFO,
+ show_time: bool = True,
+ show_path: bool = False,
+) -> None:
+ """Configure colored logging with rich.
+
+ Uses the same console theme as display.py for consistent styling.
+
+ Parameters
+ ----------
+ level : int, optional
+ Logging level, by default logging.INFO.
+ show_time : bool, optional
+ Whether to show timestamps, by default True.
+ show_path : bool, optional
+ Whether to show file path in logs, by default False.
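+
+    Example
+    -------
+    A typical CLI entry point might call this once at startup:
+
+    >>> import logging
+    >>> from aieng.agent_evals.logging_config import setup_logging
+    >>> setup_logging(level=logging.DEBUG, show_path=True)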
+ """
+ # Reuse display console with force_jupyter=False for CLI
+ console = create_console(force_jupyter=False)
+
+ # Configure rich handler with clean formatting
+ rich_handler = RichHandler(
+ console=console,
+ show_time=show_time,
+ show_path=show_path,
+ markup=True,
+ rich_tracebacks=True,
+ tracebacks_show_locals=False,
+ omit_repeated_times=False,
+ )
+
+ # Simple format - rich handles styling
+ rich_handler.setFormatter(logging.Formatter("%(message)s", datefmt="[%X]"))
+
+ # Configure root logger
+ logging.basicConfig(
+ level=level,
+ format="%(message)s",
+ datefmt="[%X]",
+ handlers=[rich_handler],
+ force=True,
+ )
+
+ # Silence noisy third-party libraries
+ _silence_third_party_loggers()
+
+
+def _silence_third_party_loggers() -> None:
+ """Reduce noise from third-party libraries.
+
+ Sets logging levels for common noisy libraries to WARNING or ERROR
+ to keep evaluation output clean and focused on agent behavior.
+ """
+ # Google SDK libraries - only warnings and above
+ for logger_name in [
+ "google_adk",
+ "google_genai",
+ "google.adk",
+ "google.genai",
+ ]:
+ logging.getLogger(logger_name).setLevel(logging.WARNING)
+
+ # Tracing/observability - only warnings
+ logging.getLogger("langfuse").setLevel(logging.WARNING)
+
+ # HTTP/network libraries - errors only
+ for logger_name in ["httpx", "httpcore", "urllib3"]:
+ logging.getLogger(logger_name).setLevel(logging.ERROR)
+
+ # System libraries
+ logging.getLogger("asyncio").setLevel(logging.WARNING)
+ logging.getLogger("py.warnings").setLevel(logging.ERROR)
diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py
index ba33840..2c0c1cc 100644
--- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py
+++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py
@@ -351,9 +351,9 @@ class TestKnowledgeGroundedAgent:
def mock_config(self):
"""Create a mock config for testing."""
config = MagicMock()
- config.gemini_api_key = "test-api-key"
config.default_worker_model = "gemini-2.5-flash"
config.default_temperature = 0.0
+ config.openai_api_key.get_secret_value.return_value = "test-api-key"
return config
@patch("aieng.agent_evals.knowledge_qa.agent.PlanReActPlanner")
@@ -489,6 +489,7 @@ def test_lazy_initialization(self, *_mocks):
mock_config = MagicMock()
mock_config.default_worker_model = "gemini-2.5-flash"
mock_config.default_temperature = 0.0
+ mock_config.openai_api_key.get_secret_value.return_value = "test-api-key"
mock_config_class.return_value = mock_config
manager = KnowledgeAgentManager(enable_caching=False, enable_compaction=False)
@@ -517,6 +518,7 @@ def test_close(self, *_mocks):
mock_config = MagicMock()
mock_config.default_worker_model = "gemini-2.5-flash"
mock_config.default_temperature = 0.0
+ mock_config.openai_api_key.get_secret_value.return_value = "test-api-key"
mock_config_class.return_value = mock_config
manager = KnowledgeAgentManager(enable_caching=False, enable_compaction=False)
diff --git a/implementations/knowledge_qa/01_dataset_and_tools.ipynb b/implementations/knowledge_qa/01_dataset_and_tools.ipynb
new file mode 100644
index 0000000..d16941c
--- /dev/null
+++ b/implementations/knowledge_qa/01_dataset_and_tools.ipynb
@@ -0,0 +1,318 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "intro",
+ "metadata": {},
+ "source": [
+ "# 01: The DeepSearchQA Dataset & Agent Tools\n",
+ "\n",
+ "This notebook introduces the two foundational components of the Knowledge QA system:\n",
+ "\n",
+ "- **DeepSearchQA** โ the benchmark dataset used to evaluate the agent\n",
+ "- **Agent tools** โ the five capabilities the agent uses to research and verify answers\n",
+ "\n",
+ "## What You'll Learn\n",
+ "\n",
+ "1. What the DeepSearchQA dataset contains and how to explore it\n",
+ "2. The five tools the agent has access to, and how it's instructed to use them\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "- `GOOGLE_API_KEY` set in your `.env` file\n",
+ "- Dependencies installed with `uv sync`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from aieng.agent_evals.knowledge_qa import DeepSearchQADataset\n",
+ "from aieng.agent_evals.knowledge_qa.system_instructions import build_system_instructions\n",
+ "from dotenv import load_dotenv\n",
+ "from rich.console import Console\n",
+ "from rich.markdown import Markdown\n",
+ "from rich.panel import Panel\n",
+ "from rich.table import Table\n",
+ "\n",
+ "\n",
+ "# Set working directory to the repository root\n",
+ "if Path(\"\").absolute().name == \"eval-agents\":\n",
+ " print(f\"Working directory: {Path('').absolute()}\")\n",
+ "else:\n",
+ " os.chdir(Path(\"\").absolute().parent.parent)\n",
+ " print(f\"Working directory set to: {Path('').absolute()}\")\n",
+ "\n",
+ "load_dotenv(verbose=True)\n",
+ "console = Console(width=100)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-intro",
+ "metadata": {},
+ "source": [
+ "## 1. The DeepSearchQA Dataset\n",
+ "\n",
+ "[DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) is a benchmark from Google DeepMind\n",
+ "for evaluating deep research agents. It contains 896 research questions requiring multi-step web search\n",
+ "and reasoning to answer correctly.\n",
+ "\n",
+ "Each question is a **causal chain task**: the agent must follow a chain of searches, fetch real sources,\n",
+ "and verify facts before answering โ not recall from training data.\n",
+ "\n",
+ "### Answer Types\n",
+ "\n",
+ "| Type | Description | Example |\n",
+ "|------|-------------|---------|\n",
+ "| **Single Answer** | One specific value | A date, a number, a proper name |\n",
+ "| **Set Answer** | Multiple required items | A list of countries, a set of policy changes |\n",
+ "\n",
+ "Evaluation uses an LLM-as-judge that computes **precision, recall, and F1** by comparing the agent's\n",
+ "answer to the ground truth item-by-item."
+ ]
+ },
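+  {
+   "cell_type": "markdown",
+   "id": "s1-scoring-sketch",
+   "metadata": {},
+   "source": [
+    "The judge decides which expected items an answer actually covers; precision, recall, and F1 then\n",
+    "follow from simple counting. The sketch below is an illustration only (hypothetical answer sets,\n",
+    "not the real LLM-as-judge), showing how the three scores relate for a Set Answer question."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "scoring-sketch",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustration only: the benchmark uses an LLM-as-judge to decide item-level matches.\n",
+    "# Here the matched items are assumed, so we can focus on the arithmetic.\n",
+    "predicted = {\"Canada\", \"Norway\", \"Chile\"}\n",
+    "expected = {\"Canada\", \"Norway\", \"Japan\", \"Iceland\"}\n",
+    "\n",
+    "true_positives = len(predicted & expected)\n",
+    "precision = true_positives / len(predicted)\n",
+    "recall = true_positives / len(expected)\n",
+    "f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0\n",
+    "\n",
+    "console.print(f\"Precision: {precision:.2f}  Recall: {recall:.2f}  F1: {f1:.2f}\")"
+   ]
+  },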
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "load-dataset",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = DeepSearchQADataset()\n",
+ "\n",
+ "console.print(f\"Total examples: [cyan]{len(dataset)}[/cyan]\")\n",
+ "console.print(f\"Categories: [cyan]{len(dataset.get_categories())}[/cyan]\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-structure",
+ "metadata": {},
+ "source": [
+ "### 1.1 Dataset Structure\n",
+ "\n",
+ "Each example is a `DSQAExample` with five fields. Let's look at one."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-structure",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "example = dataset[0]\n",
+ "\n",
+ "console.print(\n",
+ " Panel(\n",
+ " f\"[bold]example_id:[/bold] {example.example_id}\\n\"\n",
+ " f\"[bold]problem_category:[/bold] {example.problem_category}\\n\"\n",
+ " f\"[bold]answer_type:[/bold] {example.answer_type}\\n\\n\"\n",
+ " f\"[bold cyan]problem:[/bold cyan]\\n{example.problem}\\n\\n\"\n",
+ " f\"[bold yellow]answer:[/bold yellow]\\n{example.answer}\",\n",
+ " title=\"DSQAExample\",\n",
+ " border_style=\"blue\",\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-categories",
+ "metadata": {},
+ "source": [
+ "### 1.2 Categories\n",
+ "\n",
+ "The dataset spans 17 domains. Let's see how examples are distributed across them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "categories",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "categories = dataset.get_categories()\n",
+ "\n",
+ "cat_table = Table(title=\"Dataset by Category\")\n",
+ "cat_table.add_column(\"Category\", style=\"cyan\")\n",
+ "cat_table.add_column(\"Total\", style=\"white\", justify=\"right\")\n",
+ "cat_table.add_column(\"Single Answer\", style=\"dim\", justify=\"right\")\n",
+ "cat_table.add_column(\"Set Answer\", style=\"dim\", justify=\"right\")\n",
+ "\n",
+ "for cat in sorted(categories):\n",
+ " examples = dataset.get_by_category(cat)\n",
+ " single = sum(1 for e in examples if e.answer_type == \"Single Answer\")\n",
+ " set_ans = len(examples) - single\n",
+ " cat_table.add_row(cat, str(len(examples)), str(single), str(set_ans))\n",
+ "\n",
+ "console.print(cat_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-answer-types",
+ "metadata": {},
+ "source": [
+ "### 1.3 Answer Types in Practice\n",
+ "\n",
+ "The answer type matters for evaluation โ the grader treats \"Single Answer\" and \"Set Answer\"\n",
+ "differently when computing correctness."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "answer-types",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "single_ex = next(e for e in dataset.examples if e.answer_type == \"Single Answer\")\n",
+ "set_ex = next(e for e in dataset.examples if e.answer_type == \"Set Answer\")\n",
+ "\n",
+ "for label, ex, style in [\n",
+ " (\"Single Answer\", single_ex, \"green\"),\n",
+ " (\"Set Answer\", set_ex, \"yellow\"),\n",
+ "]:\n",
+ " console.print(\n",
+ " Panel(\n",
+ " f\"[bold cyan]Question:[/bold cyan]\\n{ex.problem}\\n\\n[bold yellow]Answer:[/bold yellow]\\n{ex.answer}\",\n",
+ " title=f\"{label} โ {ex.problem_category}\",\n",
+ " border_style=style,\n",
+ " )\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-browse",
+ "metadata": {},
+ "source": [
+ "### 1.4 Browsing Examples\n",
+ "\n",
+ "You can retrieve examples by category or by ID."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "browse",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Examples by category\n",
+ "finance_examples = dataset.get_by_category(\"Finance & Economics\")\n",
+ "console.print(f\"Finance & Economics: [cyan]{len(finance_examples)}[/cyan] examples\\n\")\n",
+ "\n",
+ "# Display a preview table\n",
+ "browse_table = Table(title=\"Finance & Economics โ First 5 Examples\")\n",
+ "browse_table.add_column(\"ID\", style=\"dim\", width=6)\n",
+ "browse_table.add_column(\"Answer Type\", style=\"cyan\", width=15)\n",
+ "browse_table.add_column(\"Question\", style=\"white\")\n",
+ "\n",
+ "for ex in finance_examples[:5]:\n",
+ " q = ex.problem[:75] + \"...\" if len(ex.problem) > 75 else ex.problem\n",
+ " browse_table.add_row(str(ex.example_id), ex.answer_type, q)\n",
+ "\n",
+ "console.print(browse_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s2-intro",
+ "metadata": {},
+ "source": [
+ "## 2. The Agent's Tools\n",
+ "\n",
+ "The `KnowledgeGroundedAgent` has five tools that form a natural research workflow:\n",
+ "\n",
+ "| Tool | Purpose | When the Agent Uses It |\n",
+ "|------|---------|----------------------|\n",
+ "| `google_search` | Find relevant URLs | First step for any sub-question |\n",
+ "| `web_fetch` | Read web pages and PDFs | To verify facts from the actual source |\n",
+ "| `fetch_file` | Download CSV, XLSX, JSON files | When the answer is in structured data |\n",
+ "| `grep_file` | Search within a downloaded file | To locate a specific value in a large file |\n",
+ "| `read_file` | Read sections of a downloaded file | To inspect a specific part of a downloaded file |\n",
+ "\n",
+ "**Why not answer from search snippets?** Snippets are brief and may be outdated or misleading.\n",
+ "The system instructions enforce a strict causal chain: **Search โ Fetch โ Verify โ Answer**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "system-instructions",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "instructions = build_system_instructions()\n",
+ "\n",
+ "console.print(\n",
+ " Panel(\n",
+ " Markdown(instructions),\n",
+ " title=\"Agent System Instructions\",\n",
+ " border_style=\"blue\",\n",
+ " padding=(1, 2),\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "summary",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "In this notebook you saw:\n",
+ "\n",
+ "1. **The DeepSearchQA dataset** โ 896 research questions across 17 categories, evaluated with\n",
+ " precision/recall/F1 using an LLM-as-judge\n",
+ "2. **The five agent tools** โ search, web fetch, file download, grep, and file read\n",
+ "3. **The system instructions** โ how the agent is guided to use its tools, including the\n",
+ " critical search โ fetch โ verify โ answer chain\n",
+ "\n",
+ "**Next:** In Notebook 02, we'll create the agent, run it on questions, and observe how it uses these tools."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "done",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "console.print(\n Panel(\n \"[green]โ[/green] Notebook complete!\\n\\n\"\n \"[cyan]Next:[/cyan] Open [bold]02_running_the_agent.ipynb[/bold] to run the agent.\",\n title=\"Done\",\n border_style=\"green\",\n )\n)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/implementations/knowledge_qa/01_grounding_basics.ipynb b/implementations/knowledge_qa/01_grounding_basics.ipynb
deleted file mode 100644
index 0401bb2..0000000
--- a/implementations/knowledge_qa/01_grounding_basics.ipynb
+++ /dev/null
@@ -1,200 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 01: Google Search Grounding with ADK\n",
- "\n",
- "This notebook introduces Google Search grounding using the Agent Development Kit (ADK),\n",
- "which provides explicit, traceable tool calls for web search.\n",
- "\n",
- "## Learning Objectives\n",
- "\n",
- "- Understand how Google Search grounding works with ADK\n",
- "- Use the `KnowledgeGroundedAgent` to make grounded queries\n",
- "- See explicit tool calls in the agent's reasoning\n",
- "- Compare grounded vs non-grounded responses"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_comparison,\n display_response,\n display_source_table,\n)\nfrom aieng.agent_evals.knowledge_qa import KnowledgeAgentConfig, KnowledgeGroundedAgent\nfrom dotenv import load_dotenv\nfrom google import genai\nfrom rich.panel import Panel\n\n\nconsole = create_console()\nload_dotenv(verbose=True)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Understanding Google Search Grounding with ADK\n",
- "\n",
- "The Agent Development Kit (ADK) provides a `GoogleSearchTool` that enables:\n",
- "\n",
- "1. **Explicit Tool Calls**: The agent decides when to search and you can see each call\n",
- "2. **ReAct Pattern**: Thought โ Action โ Observation loop is visible\n",
- "3. **Traceable**: Every search query and result is logged\n",
- "4. **Real-time Information**: Access to current web data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Initialize the knowledge agent (uses ADK with GoogleSearchTool internally)\nagent = KnowledgeGroundedAgent()\n\nconsole.print(\n Panel(\n f\"[green]โ[/green] Knowledge Agent initialized\\n[cyan]Model:[/cyan] {agent.model}\",\n title=\"๐ง Setup Complete\",\n border_style=\"green\",\n )\n)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Making Your First Grounded Query"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Ask a question that requires current information\n",
- "query = \"What is the current population of Tokyo?\"\n",
- "\n",
- "console.print(f\"\\n[cyan]๐ Query:[/cyan] {query}\\n\")\n",
- "\n",
- "console.print(\"[dim]Searching...[/dim]\")\n",
- "response = await agent.answer_async(query)\n",
- "\n",
- "display_response(response, console=console, title=\"Tokyo Population\")\n",
- "\n",
- "# Show the tool calls made by the agent\n",
- "if response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls Made:[/bold cyan]\")\n",
- " for tc in response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Display sources in a detailed table format\n",
- "display_source_table(response, console=console)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Comparing Grounded vs Non-Grounded Responses\n",
- "\n",
- "This is where grounding truly shines. We'll ask about Toronto's record single-day snowfall.\n",
- "\n",
- "**Why this example works:**\n",
- "- The record was set on **January 25, 2026** - after the model's training data cutoff\n",
- "- Without grounding, the model can only guess based on historical data it was trained on\n",
- "- With grounding, the model searches the web and finds the recent news about this event\n",
- "\n",
- "This clearly demonstrates that grounding enables access to information the model couldn't possibly know from training alone."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "config = KnowledgeAgentConfig()\nclient = genai.Client(api_key=config.gemini_api_key)\n\n# This question requires very recent information (Jan 2026)\n# The non-grounded model will fail since its training data doesn't include this event\nquestion = \"Which day had the highest recorded snowfall in a single day in Toronto?\"\nexpected_answer = \"January 25, 2026\"\n\nconsole.print(f\"\\n[bold]Question:[/bold] {question}\")\nconsole.print(f\"[dim]Expected Answer: {expected_answer}[/dim]\\n\")\n\n# Without grounding - model relies on training data (cutoff before Jan 2026)\nconsole.print(\"[dim]Generating without grounding...[/dim]\")\nresponse_no_grounding = client.models.generate_content(\n model=config.default_worker_model,\n contents=question,\n)\n\n# With grounding - agent uses Google Search tool\nconsole.print(\"[dim]Generating with grounding (ADK agent)...[/dim]\")\nresponse_grounded = await agent.answer_async(question)\n\n# Side-by-side comparison using our display utility\ndisplay_comparison(response_no_grounding.text, response_grounded, console=console)\n\n# Show tool calls from the grounded response\nif response_grounded.tool_calls:\n console.print(\"\\n[bold cyan]๐ง Tool Calls (Grounded):[/bold cyan]\")\n for tc in response_grounded.tool_calls:\n console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")\n\n# Check if the grounded response contains the correct answer\nif expected_answer.lower() in response_grounded.text.lower() or \"january 25\" in response_grounded.text.lower():\n console.print(\"\\n[green]โ Grounded response contains the correct answer![/green]\")\nelse:\n console.print(\"\\n[yellow]โ Check the grounded response for accuracy[/yellow]\")"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 4. Exercise: Try Your Own Queries\n",
- "\n",
- "Try asking questions that:\n",
- "- Require recent information (news, events, statistics)\n",
- "- Need multiple facts combined\n",
- "- Are about specific domains (sports, science, politics)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Try your own query\n",
- "my_query = \"What are the latest developments in fusion energy?\"\n",
- "\n",
- "console.print(f\"[bold cyan]๐ Query:[/bold cyan] {my_query}\\n\")\n",
- "\n",
- "console.print(\"[dim]Searching the web...[/dim]\")\n",
- "my_response = await agent.answer_async(my_query)\n",
- "\n",
- "display_response(my_response, console=console, title=\"Fusion Energy Developments\")\n",
- "\n",
- "# Show the tool calls\n",
- "if my_response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls:[/bold cyan]\")\n",
- " for tc in my_response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "In this notebook, you learned:\n",
- "\n",
- "1. How Google Search grounding works with ADK's `GoogleSearchTool`\n",
- "2. How to use the `KnowledgeGroundedAgent` for grounded queries\n",
- "3. How to see explicit tool calls in the agent's response\n",
- "4. The difference between grounded and non-grounded responses\n",
- "\n",
- "**Next**: In the next notebook, we'll explore the agent's system instructions and the evaluation dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "console.print(\n",
- " Panel(\n",
- " \"[green]โ[/green] Notebook complete!\\n\\n\"\n",
- " \"[cyan]Next:[/cyan] Open [bold]02_agent_basics.ipynb[/bold] to learn about the Knowledge Agent.\",\n",
- " title=\"๐ Done\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/implementations/knowledge_qa/02_agent_basics.ipynb b/implementations/knowledge_qa/02_agent_basics.ipynb
deleted file mode 100644
index 7f9d088..0000000
--- a/implementations/knowledge_qa/02_agent_basics.ipynb
+++ /dev/null
@@ -1,323 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 02: Knowledge-Grounded Agent Basics\n",
- "\n",
- "This notebook introduces the `KnowledgeGroundedAgent` class, which wraps Gemini\n",
- "with Google Search grounding into a full-featured QA agent.\n",
- "\n",
- "## Learning Objectives\n",
- "\n",
- "- Create and configure a `KnowledgeGroundedAgent`\n",
- "- Understand the agent's system instructions\n",
- "- Use the agent for single-turn Q&A\n",
- "- Explore the DeepSearchQA evaluation dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_example,\n display_info,\n display_response,\n display_success,\n)\nfrom aieng.agent_evals.knowledge_qa import (\n DeepSearchQADataset,\n KnowledgeAgentManager,\n KnowledgeGroundedAgent,\n)\nfrom aieng.agent_evals.knowledge_qa.agent import SYSTEM_INSTRUCTIONS\nfrom dotenv import load_dotenv\nfrom rich.markdown import Markdown\nfrom rich.panel import Panel\nfrom rich.table import Table\n\n\nconsole = create_console()\nload_dotenv(verbose=True)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Creating a Knowledge Agent"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Create the agent\nagent = KnowledgeGroundedAgent()\n\n# Display configuration\nconfig_table = Table(title=\"๐ค Agent Configuration\", show_header=True, header_style=\"bold cyan\")\nconfig_table.add_column(\"Setting\", style=\"cyan\")\nconfig_table.add_column(\"Value\", style=\"white\")\nconfig_table.add_row(\"Model\", agent.model)\nconfig_table.add_row(\"Planner Model\", agent.config.default_planner_model)\nconfig_table.add_row(\"Worker Model\", agent.config.default_worker_model)\n\nconsole.print(config_table)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Understanding System Instructions\n",
- "\n",
- "The agent uses carefully crafted system instructions to guide its behavior."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "console.print(\n Panel(\n Markdown(SYSTEM_INSTRUCTIONS),\n title=\"๐ System Instructions\",\n border_style=\"blue\",\n subtitle=\"[dim]Guides agent behavior[/dim]\",\n )\n)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Single-Turn Q&A\n",
- "\n",
- "Let's use the agent to answer some questions."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Example 1: Current events\n",
- "question = \"What are the most significant AI developments in January 2026?\"\n",
- "\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[bold green]{question}[/bold green]\",\n",
- " title=\"โ Question\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")\n",
- "\n",
- "console.print(\"[dim]๐ Agent is researching...[/dim]\")\n",
- "response = await agent.answer_async(question)\n",
- "\n",
- "display_response(response, console=console, title=\"AI Developments January 2026\")\n",
- "\n",
- "# Show tool calls made during reasoning\n",
- "if response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls (ReAct Trace):[/bold cyan]\")\n",
- " for tc in response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Example 2: Factual question\n",
- "question = \"What countries have successfully landed spacecraft on the Moon?\"\n",
- "\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[bold green]{question}[/bold green]\",\n",
- " title=\"โ Question\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")\n",
- "\n",
- "console.print(\"[dim]๐ Agent is researching...[/dim]\")\n",
- "response = await agent.answer_async(question)\n",
- "\n",
- "display_response(response, console=console, title=\"Moon Landing Countries\")\n",
- "\n",
- "# Show tool calls\n",
- "if response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls:[/bold cyan]\")\n",
- " for tc in response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## 4. Using the KnowledgeAgentManager\n\nFor applications that need to manage agent lifecycle, use `KnowledgeAgentManager`."
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Create a manager (lazy initialization)\nmanager = KnowledgeAgentManager()\n\ndisplay_info(f\"Initialized: {manager.is_initialized()}\", console=console)\n\n# Access the agent (triggers initialization)\nmanaged_agent = manager.agent\ndisplay_info(f\"After access: {manager.is_initialized()}\", console=console)\n\n# Use the agent\nconsole.print(\"[dim]Querying...[/dim]\")\nresponse = await managed_agent.answer_async(\"What is the speed of light?\")\n\ndisplay_response(response, console=console, title=\"Quick Answer\", show_queries=False)\n\n# Cleanup\nmanager.close()\ndisplay_success(\"Manager closed\", console=console)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 5. Exploring the DeepSearchQA Dataset\n",
- "\n",
- "The DeepSearchQA dataset contains 896 research questions for evaluating knowledge agents."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Load the dataset\nwith console.status(\"[cyan]Loading DeepSearchQA dataset...[/cyan]\", spinner=\"dots\"):\n dataset = DeepSearchQADataset()\n\n# Display dataset info\ninfo_table = Table(title=\"๐ DeepSearchQA Dataset\", show_header=True, header_style=\"bold cyan\")\ninfo_table.add_column(\"Metric\", style=\"cyan\")\ninfo_table.add_column(\"Value\", style=\"white\")\ninfo_table.add_row(\"Total Examples\", str(len(dataset)))\ninfo_table.add_row(\"Categories\", str(len(dataset.get_categories())))\n\nconsole.print(info_table)"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Display categories\n",
- "categories = dataset.get_categories()\n",
- "\n",
- "cat_table = Table(title=\"๐ Problem Categories\", show_header=True, header_style=\"bold green\")\n",
- "cat_table.add_column(\"Category\", style=\"white\")\n",
- "cat_table.add_column(\"Count\", style=\"cyan\", justify=\"right\")\n",
- "\n",
- "for cat in sorted(categories):\n",
- " count = len(dataset.get_by_category(cat))\n",
- " cat_table.add_row(cat, str(count))\n",
- "\n",
- "console.print(cat_table)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Show a sample example using the shared display utility\n",
- "example = dataset[0]\n",
- "\n",
- "display_example(\n",
- " example_id=example.example_id,\n",
- " problem=example.problem,\n",
- " category=example.problem_category,\n",
- " answer=example.answer,\n",
- " answer_type=example.answer_type,\n",
- " console=console,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get random samples\n",
- "samples = dataset.sample(n=3, random_state=42)\n",
- "\n",
- "console.print(\"[bold]๐ Random Samples from Dataset[/bold]\\n\")\n",
- "\n",
- "for ex in samples:\n",
- " display_example(\n",
- " example_id=ex.example_id,\n",
- " problem=ex.problem[:300] + \"...\" if len(ex.problem) > 300 else ex.problem,\n",
- " category=ex.problem_category,\n",
- " answer=ex.answer,\n",
- " console=console,\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 6. Testing the Agent on DeepSearchQA\n",
- "\n",
- "Let's test the agent on a sample question from the dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Pick an example\n",
- "test_example = samples[0]\n",
- "\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[bold]Testing on Example {test_example.example_id}[/bold]\\n\\n\"\n",
- " f\"[cyan]Category:[/cyan] {test_example.problem_category}\\n\"\n",
- " f\"[cyan]Expected Answer:[/cyan] {test_example.answer}\",\n",
- " title=\"๐งช Test Setup\",\n",
- " border_style=\"yellow\",\n",
- " )\n",
- ")\n",
- "\n",
- "# Ask the agent\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[bold green]{test_example.problem}[/bold green]\",\n",
- " title=\"โ Question\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")\n",
- "\n",
- "console.print(\"[dim]๐ Agent is researching...[/dim]\")\n",
- "response = await agent.answer_async(test_example.problem)\n",
- "\n",
- "display_response(response, console=console, title=\"Agent Response\")\n",
- "\n",
- "# Show tool calls\n",
- "if response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls:[/bold cyan]\")\n",
- " for tc in response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")\n",
- "\n",
- "# Compare\n",
- "contains_answer = test_example.answer.lower() in response.text.lower()\n",
- "if contains_answer:\n",
- " console.print(\"\\n[green]โ CONTAINS EXPECTED ANSWER[/green]\")\n",
- "else:\n",
- " console.print(\"\\n[yellow]โ Answer may differ[/yellow]\")\n",
- "console.print(f\"[dim]Expected: {test_example.answer}[/dim]\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "In this notebook, you learned:\n",
- "\n",
- "1. How to create and configure a `KnowledgeGroundedAgent`\n",
- "2. The system instructions that guide agent behavior\n",
- "3. How to use the agent for single-turn Q&A\n",
- "4. How to explore the DeepSearchQA evaluation dataset\n",
- "\n",
- "**Next**: In the next notebook, we'll explore multi-turn conversations and run systematic evaluations."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "console.print(\n",
- " Panel(\n",
- " \"[green]โ[/green] Notebook complete!\\n\\n\"\n",
- " \"[cyan]Next:[/cyan] Open [bold]03_multi_turn.ipynb[/bold] to learn about multi-turn conversations and evaluation.\",\n",
- " title=\"๐ Done\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/implementations/knowledge_qa/02_running_the_agent.ipynb b/implementations/knowledge_qa/02_running_the_agent.ipynb
new file mode 100644
index 0000000..834080a
--- /dev/null
+++ b/implementations/knowledge_qa/02_running_the_agent.ipynb
@@ -0,0 +1,426 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "intro",
+ "metadata": {},
+ "source": [
+ "# 02: Running the Agent\n",
+ "\n",
+ "In Notebook 01 we explored the dataset and tools. This notebook shows how to run the\n",
+ "`KnowledgeGroundedAgent` in practice.\n",
+ "\n",
+ "## What You'll Learn\n",
+ "\n",
+ "1. The agent's PlanReAct architecture and the `AgentResponse` data structure\n",
+ "2. Running a question with live progress display\n",
+ "3. Inspecting the response: plan, tool calls, sources, and reasoning\n",
+ "4. Multi-turn conversations using session state\n",
+ "5. Observability with Langfuse tracing\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "Complete Notebook 01. You'll need `GOOGLE_API_KEY` in your `.env` file.\n",
+ "For tracing (Section 4): `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are also required."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\nimport uuid\nfrom pathlib import Path\n\nfrom aieng.agent_evals.knowledge_qa import KnowledgeGroundedAgent\nfrom aieng.agent_evals.knowledge_qa.notebook import display_response, run_with_display\nfrom aieng.agent_evals.langfuse import init_tracing\nfrom dotenv import load_dotenv\nfrom rich.console import Console\nfrom rich.panel import Panel\nfrom rich.table import Table\n\n\nif Path(\"\").absolute().name == \"eval-agents\":\n print(f\"Working directory: {Path('').absolute()}\")\nelse:\n os.chdir(Path(\"\").absolute().parent.parent)\n print(f\"Working directory set to: {Path('').absolute()}\")\n\nload_dotenv(verbose=True)\nconsole = Console(width=100)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-arch",
+ "metadata": {},
+ "source": [
+ "## 1. Agent Architecture\n",
+ "\n",
+ "The `KnowledgeGroundedAgent` is built on Google ADK and combines two patterns:\n",
+ "\n",
+ "**PlanReAct** โ Before executing, the agent produces an explicit research plan with numbered\n",
+ "steps. Each step has a type (`SEARCH`, `FETCH`, `ANALYZE`) and a status that transitions from\n",
+ "`pending` โ `in_progress` โ `completed` (or `failed`/`skipped`). The plan can be revised\n",
+ "mid-run if the agent encounters unexpected results.\n",
+ "\n",
+ "**ReAct loop** โ Within each step, the agent alternates between reasoning (Thought), acting\n",
+ "(tool call), and observing (tool response).\n",
+ "\n",
+ "### The `AgentResponse` Object\n",
+ "\n",
+ "After running, `agent.answer_async(question)` returns an `AgentResponse`:\n",
+ "\n",
+ "| Field | Type | Description |\n",
+ "|-------|------|-------------|\n",
+ "| `text` | `str` | The final answer |\n",
+ "| `plan` | `ResearchPlan` | Numbered steps with statuses |\n",
+ "| `tool_calls` | `list[dict]` | Every tool invocation during execution |\n",
+ "| `sources` | `list[GroundingChunk]` | URLs used as evidence |\n",
+ "| `reasoning_chain` | `list[str]` | The model's intermediate reasoning |\n",
+ "| `total_duration_ms` | `int` | Wall-clock execution time |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "create-agent",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agent = KnowledgeGroundedAgent(enable_planning=True)\n",
+ "\n",
+ "config_table = Table(title=\"Agent Configuration\", show_header=False)\n",
+ "config_table.add_column(\"Setting\", style=\"cyan\")\n",
+ "config_table.add_column(\"Value\", style=\"white\")\n",
+ "config_table.add_row(\"Model\", agent.model)\n",
+ "config_table.add_row(\"Planning\", \"PlanReAct (enabled)\")\n",
+ "config_table.add_row(\"Session Service\", \"InMemorySessionService\")\n",
+ "config_table.add_row(\"Tools\", \"google_search, web_fetch, fetch_file, grep_file, read_file\")\n",
+ "\n",
+ "console.print(config_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s2-running",
+ "metadata": {},
+ "source": [
+ "## 2. Running a Question\n",
+ "\n",
+ "`run_with_display` executes the agent in a Jupyter notebook with a live progress display showing:\n",
+ "\n",
+ "- The research plan with step statuses (updating in real time)\n",
+ "- Tool calls as they fire\n",
+ "\n",
+ "We'll use a question that requires web search โ the agent must find and verify a specific fact,\n",
+ "not recall it from training data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "run-agent",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "question = \"When was the highest single-day snowfall recorded in Toronto, and how much snow fell?\"\n",
+ "\n",
+ "response = await run_with_display(agent, question)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-answer",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_response(\n",
+ " console,\n",
+ " response.text,\n",
+ " subtitle=(\n",
+ " f\"Duration: {response.total_duration_ms / 1000:.1f}s | \"\n",
+ " f\"Tool calls: {len(response.tool_calls)} | \"\n",
+ " f\"Sources: {len(response.sources)}\"\n",
+ " ),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s2-inspect",
+ "metadata": {},
+ "source": [
+ "### 2.1 Inspecting the Response\n",
+ "\n",
+ "The `AgentResponse` object contains the full execution trace."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-plan",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plan = response.plan\n\nplan_table = Table(title=\"Research Plan\")\nplan_table.add_column(\"#\", style=\"cyan\", width=3)\nplan_table.add_column(\"Step\", style=\"white\")\nplan_table.add_column(\"Type\", style=\"dim\", width=12)\nplan_table.add_column(\"Status\", style=\"green\")\n\nfor step in plan.steps:\n icon = {\"completed\": \"โ\", \"failed\": \"โ\", \"skipped\": \"โ\"}.get(step.status.value, \"ยท\")\n desc = step.description[:70] + \"...\" if len(step.description) > 70 else step.description\n plan_table.add_row(str(step.step_id), desc, step.step_type, f\"{icon} {step.status.value}\")\n\nconsole.print(plan_table)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-tools",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if response.tool_calls:\n",
+ " tools_table = Table(title=\"Tool Calls\")\n",
+ " tools_table.add_column(\"#\", style=\"dim\", width=3)\n",
+ " tools_table.add_column(\"Tool\", style=\"cyan\", width=16)\n",
+ " tools_table.add_column(\"Arguments (truncated)\", style=\"white\")\n",
+ "\n",
+ " for i, tc in enumerate(response.tool_calls[:15], 1):\n",
+ " name = tc.get(\"name\", \"unknown\")\n",
+ " args = str(tc.get(\"args\", {}))\n",
+ " args = args[:70] + \"...\" if len(args) > 70 else args\n",
+ " tools_table.add_row(str(i), name, args)\n",
+ "\n",
+ " if len(response.tool_calls) > 15:\n",
+ " tools_table.add_row(\"...\", f\"({len(response.tool_calls) - 15} more)\", \"\")\n",
+ "\n",
+ " console.print(tools_table)\n",
+ "else:\n",
+ " console.print(\"[dim]No tool calls recorded[/dim]\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-sources",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if response.sources:\n",
+ " seen: set[str] = set()\n",
+ " sources_table = Table(title=\"Sources\")\n",
+ " sources_table.add_column(\"#\", style=\"dim\", width=3)\n",
+ " sources_table.add_column(\"URL\", style=\"blue\")\n",
+ "\n",
+ " for src in response.sources:\n",
+ " if src.uri and src.uri not in seen:\n",
+ " seen.add(src.uri)\n",
+ " url = src.uri[:85] + \"...\" if len(src.uri) > 85 else src.uri\n",
+ " sources_table.add_row(str(len(seen)), url)\n",
+ " if len(seen) >= 10:\n",
+ " break\n",
+ "\n",
+ " console.print(sources_table)\n",
+ "else:\n",
+ " console.print(\"[dim]No sources recorded[/dim]\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s3-multiturn",
+ "metadata": {},
+ "source": [
+ "## 3. Multi-Turn Conversations\n",
+ "\n",
+ "The agent uses an `InMemorySessionService` to maintain conversation context across turns.\n",
+ "Pass the same `session_id` to link questions together โ the agent will use prior context\n",
+ "when answering follow-up questions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "multiturn",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "session_id = str(uuid.uuid4())\n",
+ "console.print(f\"Session ID: [dim]{session_id}[/dim]\\n\")\n",
+ "\n",
+ "# First turn: establish a subject\n",
+ "response1 = await agent.answer_async(\n",
+ " \"What is the capital of France?\",\n",
+ " session_id=session_id,\n",
+ ")\n",
+ "display_response(console, response1.text, title=\"Turn 1\")\n",
+ "\n",
+ "# Second turn: follow-up that references the prior context\n",
+ "response2 = await agent.answer_async(\n",
+ " \"What is the official language spoken there?\",\n",
+ " session_id=session_id,\n",
+ ")\n",
+ "display_response(console, response2.text, title=\"Turn 2 (follow-up)\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s4-tracing",
+ "metadata": {},
+ "source": [
+ "## 4. Observability with Langfuse\n",
+ "\n",
+ "Langfuse captures a full trace of every agent run using OpenTelemetry, giving you visibility into:\n",
+ "\n",
+ "- Every tool call and its arguments\n",
+ "- Every LLM call with prompts and completions\n",
+ "- Timing for each span\n",
+ "- The full agent execution tree\n",
+ "\n",
+ "This is essential for debugging failures, measuring latency, and comparing configurations.\n",
+ "\n",
+ "### Trace Structure\n",
+ "\n",
+ "```\n",
+ "Trace: agent run\n",
+ "โโโ Span: planning (PlanReAct)\n",
+ "โ โโโ LLM Call: create_plan\n",
+ "โโโ Span: step-1-execution\n",
+ "โ โโโ Tool Call: google_search\n",
+ "โ โโโ Tool Call: web_fetch\n",
+ "โ โโโ LLM Call: step_summary\n",
+ "โโโ Span: step-2-execution\n",
+ "โ โโโ ...\n",
+ "โโโ Span: synthesis\n",
+ " โโโ LLM Call: final_answer\n",
+ "```\n",
+ "\n",
+ "### Prerequisites\n",
+ "\n",
+ "Set these in your `.env` file:\n",
+ "- `LANGFUSE_PUBLIC_KEY`\n",
+ "- `LANGFUSE_SECRET_KEY`\n",
+ "- `LANGFUSE_HOST` (optional, defaults to `https://cloud.langfuse.com`)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "check-creds",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "langfuse_configured = all(\n",
+ " [\n",
+ " os.getenv(\"LANGFUSE_PUBLIC_KEY\"),\n",
+ " os.getenv(\"LANGFUSE_SECRET_KEY\"),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "if langfuse_configured:\n",
+ " console.print(\"[green]โ[/green] Langfuse credentials found\")\n",
+ "else:\n",
+ " console.print(\"[yellow]โ [/yellow] Langfuse credentials not found โ tracing cells will be skipped\")\n",
+ " console.print(\"[dim]Set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY in .env[/dim]\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "init-tracing",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tracing_enabled = init_tracing()\n",
+ "\n",
+ "if tracing_enabled:\n",
+ " console.print(\"[green]โ[/green] Langfuse tracing initialized\")\n",
+ "else:\n",
+ " console.print(\"[yellow]โ [/yellow] Tracing not enabled (check credentials)\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "run-traced",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if tracing_enabled:\n",
+ " from langfuse import Langfuse\n",
+ "\n",
+ " langfuse = Langfuse()\n",
+ " traced_agent = KnowledgeGroundedAgent(enable_planning=True)\n",
+ " traced_question = \"What programming language was created by Guido van Rossum, and in what year?\"\n",
+ "\n",
+ " console.print(Panel(traced_question, title=\"Traced Question\", border_style=\"green\"))\n",
+ "\n",
+ " with langfuse.start_as_current_span(name=\"knowledge-agent\", input=traced_question):\n",
+ " trace_id = langfuse.get_current_trace_id()\n",
+ " traced_response = await traced_agent.answer_async(traced_question)\n",
+ " langfuse.update_current_span(output=traced_response.text)\n",
+ "\n",
+ " display_response(\n",
+ " console,\n",
+ " traced_response.text,\n",
+ " subtitle=f\"Duration: {traced_response.total_duration_ms / 1000:.1f}s\",\n",
+ " )\n",
+ "else:\n",
+ " console.print(\"[dim]Skipping (Langfuse not configured)[/dim]\")\n",
+ " trace_id = None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "flush-traces",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if tracing_enabled:\n from IPython.display import HTML, display # noqa: A004\n from langfuse import Langfuse\n from opentelemetry import trace as otel_trace\n\n provider = otel_trace.get_tracer_provider()\n if hasattr(provider, \"force_flush\"):\n provider.force_flush(timeout_millis=5000)\n console.print(\"[green]โ[/green] Traces flushed to Langfuse\")\n\n if trace_id:\n trace_url = Langfuse().get_trace_url(trace_id=trace_id)\n display(HTML(f'View trace: {trace_url}
'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s4-ui",
+ "metadata": {},
+ "source": [
+ "### 4.1 Viewing Traces in the Langfuse UI\n\nOpen your Langfuse project and navigate to **Traces**. Each run appears as a\ntree of spans. Useful things to look at:\n\n- **Span timeline** โ which steps take the most time?\n- **Tool call arguments** โ what search queries did the agent use?\n- **LLM interactions** โ what did the model reason about before calling each tool?\n- **Errors** โ red spans show where failures occurred\n\nYou can also filter by trace name, time range, or input content."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "summary",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "In this notebook you learned:\n",
+ "\n",
+ "1. **Creating the agent** โ `KnowledgeGroundedAgent(enable_planning=True)` with PlanReAct\n",
+ "2. **Running questions** โ `run_with_display` for live notebook progress; `agent.answer_async` for raw access\n",
+ "3. **The `AgentResponse`** โ plan, tool calls, sources, reasoning, and timing in one object\n",
+ "4. **Multi-turn conversations** โ linking turns with `session_id`\n",
+ "5. **Langfuse tracing** โ `init_tracing()` and the Langfuse SDK for full observability\n",
+ "\n",
+ "**Next:** In Notebook 03, we'll run a systematic evaluation using the DeepSearchQA benchmark."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "done",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "console.print(\n",
+ " Panel(\n",
+ " \"[green]โ[/green] Notebook complete!\\n\\n\"\n",
+ " \"[cyan]Next:[/cyan] Open [bold]03_evaluation.ipynb[/bold] to evaluate the agent at scale.\",\n",
+ " title=\"Done\",\n",
+ " border_style=\"green\",\n",
+ " )\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/implementations/knowledge_qa/03_evaluation.ipynb b/implementations/knowledge_qa/03_evaluation.ipynb
new file mode 100644
index 0000000..c8d6bd3
--- /dev/null
+++ b/implementations/knowledge_qa/03_evaluation.ipynb
@@ -0,0 +1,482 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "intro",
+ "metadata": {},
+ "source": [
+ "# 03: Evaluation\n",
+ "\n",
+ "In Notebook 02 we ran individual questions by hand. This notebook evaluates the agent\n",
+ "systematically: we upload a dataset subset to Langfuse, run the agent on every item, and\n",
+ "score each response with an LLM-as-judge grader using the official DeepSearchQA methodology.\n",
+ "\n",
+ "## What You'll Learn\n",
+ "\n",
+ "1. Uploading a DeepSearchQA subset to Langfuse as a persistent dataset\n",
+ "2. The LLM-as-judge grader: precision, recall, F1, and the four outcome categories\n",
+ "3. A single-sample evaluation walkthrough\n",
+ "4. Running the full experiment with `run_experiment`\n",
+ "5. Inspecting and interpreting item-level results\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "Complete Notebooks 01 and 02. You'll need all credentials in `.env`:\n",
+ "- `GOOGLE_API_KEY`\n",
+ "- `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`\n",
+ "- `OPENAI_API_KEY` (for the LLM grader)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import os\n",
+ "import tempfile\n",
+ "from pathlib import Path\n",
+ "from typing import Any\n",
+ "\n",
+ "import pandas as pd\n",
+ "from aieng.agent_evals.evaluation import run_experiment\n",
+ "from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig\n",
+ "from aieng.agent_evals.knowledge_qa import DeepSearchQADataset, KnowledgeGroundedAgent\n",
+ "from aieng.agent_evals.knowledge_qa.deepsearchqa_grader import (\n",
+ " EvaluationOutcome,\n",
+ " evaluate_deepsearchqa_async,\n",
+ ")\n",
+ "from aieng.agent_evals.knowledge_qa.notebook import display_response, run_with_display\n",
+ "from aieng.agent_evals.langfuse import upload_dataset_to_langfuse\n",
+ "from dotenv import load_dotenv\n",
+ "from IPython.display import HTML, display # noqa: A004\n",
+ "from langfuse.experiment import Evaluation\n",
+ "from rich.console import Console\n",
+ "from rich.panel import Panel\n",
+ "from rich.table import Table\n",
+ "\n",
+ "\n",
+ "if Path(\"\").absolute().name == \"eval-agents\":\n",
+ " print(f\"Working directory: {Path('').absolute()}\")\n",
+ "else:\n",
+ " os.chdir(Path(\"\").absolute().parent.parent)\n",
+ " print(f\"Working directory set to: {Path('').absolute()}\")\n",
+ "\n",
+ "load_dotenv(verbose=True)\n",
+ "console = Console(width=100)\n",
+ "\n",
+ "DATASET_NAME = \"DeepSearchQA-Subset\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-upload-intro",
+ "metadata": {},
+ "source": [
+ "## 1. Uploading the Dataset to Langfuse\n",
+ "\n",
+ "Langfuse stores our evaluation dataset so we can run multiple experiments against the same items\n",
+ "and compare results over time. Each dataset item has three fields:\n",
+ "\n",
+ "- **`input`**: the question (sent to the agent)\n",
+ "- **`expected_output`**: the ground truth answer (given to the grader, never shown to the agent)\n",
+ "- **`metadata`**: `category`, `answer_type`, `example_id`\n",
+ "\n",
+ "Items are deduplicated by a hash of their content, so running this cell again is safe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "upload",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = DeepSearchQADataset()\n",
+ "examples = dataset.get_by_category(\"Finance & Economics\")[:1]\n",
+ "\n",
+ "console.print(f\"Uploading [cyan]{len(examples)}[/cyan] examples to dataset '{DATASET_NAME}'...\")\n",
+ "\n",
+ "# Write examples to a temporary JSONL file for the upload utility\n",
+ "with tempfile.NamedTemporaryFile(mode=\"w\", suffix=\".jsonl\", delete=False, encoding=\"utf-8\") as f:\n",
+ " for ex in examples:\n",
+ " record = {\n",
+ " \"input\": ex.problem,\n",
+ " \"expected_output\": ex.answer,\n",
+ " \"metadata\": {\n",
+ " \"example_id\": ex.example_id,\n",
+ " \"category\": ex.problem_category,\n",
+ " \"answer_type\": ex.answer_type,\n",
+ " },\n",
+ " }\n",
+ " f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n",
+ " temp_path = f.name\n",
+ "\n",
+ "await upload_dataset_to_langfuse(dataset_path=temp_path, dataset_name=DATASET_NAME)\n",
+ "os.unlink(temp_path)\n",
+ "\n",
+ "console.print(f\"[green]โ[/green] Dataset '{DATASET_NAME}' ready in Langfuse\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s2-grader-intro",
+ "metadata": {},
+ "source": [
+ "## 2. The DeepSearchQA Grader\n",
+ "\n",
+ "The grader is an LLM-as-judge that evaluates answers using the official DeepSearchQA methodology\n",
+ "from Appendix A of the paper. It handles both answer types:\n",
+ "\n",
+ "- **Single Answer**: checks whether the response contains the one expected value\n",
+ "- **Set Answer**: checks which items from the ground truth set appear in the response,\n",
+ " and flags any extra items the agent included\n",
+ "\n",
+ "### Metrics\n",
+ "\n",
+ "Let **S** = predicted items, **G** = ground truth items:\n",
+ "\n",
+ "| Metric | Formula | Meaning |\n",
+ "|--------|---------|---------|\n",
+ "| **Precision** | \\|SโฉG\\| / \\|S\\| | Of what the agent said, how much was correct |\n",
+ "| **Recall** | \\|SโฉG\\| / \\|G\\| | Of the ground truth, how much did the agent find |\n",
+ "| **F1** | 2ยทPยทR / (P+R) | Harmonic mean of precision and recall |\n",
+ "\n",
+ "### Outcome Classification\n",
+ "\n",
+ "| Outcome | Condition | Interpretation |\n",
+ "|---------|-----------|----------------|\n",
+ "| `fully_correct` | S = G | Perfect answer |\n",
+ "| `correct_with_extraneous` | G โ S | All correct, but extra items included |\n",
+ "| `partially_correct` | SโฉG โ โ
| Some correct items found |\n",
+ "| `fully_incorrect` | SโฉG = โ
| No correct items |"
+ ]
+ },
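+ {
+ "cell_type": "markdown",
+ "id": "s2-worked-metrics",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of how these metrics and the outcome classification combine for a toy\n",
+ "Set Answer question. It assumes exact string matching between items; the real grader\n",
+ "delegates that matching to the LLM judge, so treat this as an illustration of the formulas\n",
+ "rather than the grader's implementation.\n",
+ "\n",
+ "```python\n",
+ "ground_truth = {\"France\", \"Germany\", \"Italy\"}  # G\n",
+ "predicted = {\"France\", \"Germany\", \"Spain\"}  # S\n",
+ "\n",
+ "# Precision, recall, and F1 as defined in the table above\n",
+ "true_positives = predicted & ground_truth\n",
+ "precision = len(true_positives) / len(predicted)  # 2/3\n",
+ "recall = len(true_positives) / len(ground_truth)  # 2/3\n",
+ "f1 = 2 * precision * recall / (precision + recall)  # 2/3\n",
+ "\n",
+ "# Outcome classification from the table above\n",
+ "if predicted == ground_truth:\n",
+ "    outcome = \"fully_correct\"\n",
+ "elif ground_truth < predicted:  # every ground-truth item found, plus extras\n",
+ "    outcome = \"correct_with_extraneous\"\n",
+ "elif true_positives:\n",
+ "    outcome = \"partially_correct\"\n",
+ "else:\n",
+ "    outcome = \"fully_incorrect\"\n",
+ "\n",
+ "print(precision, recall, f1, outcome)  # -> partially_correct for this toy example\n",
+ "```"
+ ]
+ },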
+ {
+ "cell_type": "markdown",
+ "id": "s2-single-sample",
+ "metadata": {},
+ "source": [
+ "### 2.1 Single-Sample Walkthrough\n",
+ "\n",
+ "Before running at scale, let's walk through one example end-to-end: run the agent,\n",
+ "then grade its response with the LLM judge."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "pick-example",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reproducibly select one Finance & Economics example\n",
+ "finance_examples = dataset.get_by_category(\"Finance & Economics\")\n",
+ "example = finance_examples[0]\n",
+ "\n",
+ "console.print(\n",
+ " Panel(\n",
+ " f\"[bold]ID:[/bold] {example.example_id}\\n\"\n",
+ " f\"[bold]Category:[/bold] {example.problem_category}\\n\"\n",
+ " f\"[bold]Answer Type:[/bold] {example.answer_type}\\n\\n\"\n",
+ " f\"[bold cyan]Question:[/bold cyan]\\n{example.problem}\\n\\n\"\n",
+ " f\"[bold yellow]Ground Truth:[/bold yellow]\\n{example.answer}\",\n",
+ " title=\"Evaluation Example\",\n",
+ " border_style=\"blue\",\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "run-agent",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "eval_agent = KnowledgeGroundedAgent(enable_planning=True)\n",
+ "eval_response = await run_with_display(eval_agent, example.problem)\n",
+ "\n",
+ "display_response(\n",
+ " console,\n",
+ " eval_response.text,\n",
+ " title=\"Agent Answer\",\n",
+ " subtitle=f\"Duration: {eval_response.total_duration_ms / 1000:.1f}s | Tools: {len(eval_response.tool_calls)}\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "grade-response",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "console.print(\"[dim]Grading with LLM judge...[/dim]\\n\")\n",
+ "\n",
+ "result = await evaluate_deepsearchqa_async(\n",
+ " question=example.problem,\n",
+ " answer=eval_response.text,\n",
+ " ground_truth=example.answer,\n",
+ " answer_type=example.answer_type,\n",
+ ")\n",
+ "\n",
+ "outcome_color = {\n",
+ " EvaluationOutcome.FULLY_CORRECT: \"green\",\n",
+ " EvaluationOutcome.CORRECT_WITH_EXTRANEOUS: \"yellow\",\n",
+ " EvaluationOutcome.PARTIALLY_CORRECT: \"orange1\",\n",
+ " EvaluationOutcome.FULLY_INCORRECT: \"red\",\n",
+ "}.get(result.outcome, \"white\")\n",
+ "\n",
+ "metrics_table = Table(title=\"Grader Results\")\n",
+ "metrics_table.add_column(\"Metric\", style=\"cyan\")\n",
+ "metrics_table.add_column(\"Value\", style=\"white\")\n",
+ "metrics_table.add_row(\"Outcome\", f\"[{outcome_color}]{result.outcome.value}[/{outcome_color}]\")\n",
+ "metrics_table.add_row(\"Precision\", f\"{result.precision:.3f}\")\n",
+ "metrics_table.add_row(\"Recall\", f\"{result.recall:.3f}\")\n",
+ "metrics_table.add_row(\"F1\", f\"[bold]{result.f1_score:.3f}[/bold]\")\n",
+ "console.print(metrics_table)\n",
+ "\n",
+ "if result.explanation:\n",
+ " console.print(Panel(result.explanation, title=\"Grader Explanation\", border_style=\"magenta\"))\n",
+ "\n",
+ "# Show per-item correctness for Set Answer questions\n",
+ "if result.correctness_details:\n",
+ " details_table = Table(title=\"Correctness Details\")\n",
+ " details_table.add_column(\"Expected Item\", style=\"white\")\n",
+ " details_table.add_column(\"Found\", style=\"cyan\", justify=\"center\")\n",
+ " for item, found in result.correctness_details.items():\n",
+ " icon = \"[green]โ[/green]\" if found else \"[red]โ[/red]\"\n",
+ " label = item[:60] + \"...\" if len(item) > 60 else item\n",
+ " details_table.add_row(label, icon)\n",
+ " console.print(details_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s3-experiment-intro",
+ "metadata": {},
+ "source": [
+ "## 3. Running the Evaluation Experiment\n",
+ "\n",
+ "`run_experiment` runs the agent against every item in the Langfuse dataset, scores each\n",
+ "response, and records results in Langfuse. Each call creates a new named experiment run\n",
+ "that you can compare to previous runs in the UI.\n",
+ "\n",
+ "The experiment takes two functions:\n",
+ "\n",
+ "- **`agent_task`** โ receives a dataset item, runs the agent, returns the answer string\n",
+ "- **`deepsearchqa_evaluator`** โ receives question, answer, and ground truth; returns grader scores\n",
+ "\n",
+ "> **Note:** This makes one agent call and one grader call per item. With 10 items and\n",
+ "> `max_concurrency=1`, expect 20โ40 minutes depending on model latency."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "define-task-evaluator",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "async def agent_task(*, item: Any, **kwargs: Any) -> str:\n",
+ " \"\"\"Run the Knowledge Agent on a Langfuse dataset item.\"\"\"\n",
+ " agent = KnowledgeGroundedAgent(enable_planning=True)\n",
+ " response = await agent.answer_async(item.input)\n",
+ " return response.text\n",
+ "\n",
+ "\n",
+ "async def deepsearchqa_evaluator(\n",
+ " *,\n",
+ " input: str, # noqa: A002\n",
+ " output: str,\n",
+ " expected_output: str,\n",
+ " metadata: dict[str, Any] | None = None,\n",
+ " **kwargs: Any,\n",
+ ") -> list[Evaluation]:\n",
+ " \"\"\"LLM-as-judge grader using DeepSearchQA methodology.\"\"\"\n",
+ " answer_type = (metadata or {}).get(\"answer_type\", \"Set Answer\")\n",
+ " result = await evaluate_deepsearchqa_async(\n",
+ " question=input,\n",
+ " answer=output,\n",
+ " ground_truth=expected_output,\n",
+ " answer_type=answer_type,\n",
+ " model_config=LLMRequestConfig(temperature=0.0),\n",
+ " )\n",
+ " return result.to_evaluations()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "run-experiment",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "experiment_result = run_experiment(\n",
+ " DATASET_NAME,\n",
+ " name=\"knowledge-agent-baseline\",\n",
+ " task=agent_task,\n",
+ " evaluators=[deepsearchqa_evaluator],\n",
+ " description=\"Baseline Knowledge Agent on Finance & Economics questions.\",\n",
+ " max_concurrency=1,\n",
+ ")\n",
+ "\n",
+ "console.print(\"[green]โ[/green] Experiment complete\")\n",
+ "if experiment_result.dataset_run_url:\n",
+ " display(\n",
+ " HTML(\n",
+ " f'View experiment: {experiment_result.dataset_run_url}
'\n",
+ " )\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s4-results-intro",
+ "metadata": {},
+ "source": [
+ "## 4. Inspecting Results\n",
+ "\n",
+ "The `ExperimentResult` object gives programmatic access to every item-level score.\n",
+ "Aggregate metrics are visible in the Langfuse experiment run summary in the UI."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "item-results",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rows = []\n",
+ "for item_result in experiment_result.item_results:\n",
+ " item = item_result.item\n",
+ " question = str(item.input)\n",
+ " row = {\n",
+ " \"question\": question[:55] + \"...\" if len(question) > 55 else question,\n",
+ " \"answer_type\": (item.metadata or {}).get(\"answer_type\", \"\"),\n",
+ " }\n",
+ " for evaluation in item_result.evaluations or []:\n",
+ " row[evaluation.name] = evaluation.value\n",
+ " rows.append(row)\n",
+ "\n",
+ "df = pd.DataFrame(rows)\n",
+ "print(df.to_string(index=False))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aggregate-scores",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Mean of numeric metrics\n",
+ "numeric_cols = [c for c in [\"F1\", \"Precision\", \"Recall\"] if c in df.columns]\n",
+ "if numeric_cols:\n",
+ " means_table = Table(title=\"Mean Scores\")\n",
+ " means_table.add_column(\"Metric\", style=\"cyan\")\n",
+ " means_table.add_column(\"Mean\", style=\"white\")\n",
+ " for col in numeric_cols:\n",
+ " means_table.add_row(col, f\"{df[col].mean():.3f}\")\n",
+ " console.print(means_table)\n",
+ "\n",
+ "# Outcome distribution\n",
+ "if \"Outcome\" in df.columns:\n",
+ " outcome_table = Table(title=\"Outcome Distribution\")\n",
+ " outcome_table.add_column(\"Outcome\", style=\"cyan\")\n",
+ " outcome_table.add_column(\"Count\", style=\"white\", justify=\"right\")\n",
+ " outcome_table.add_column(\"Fraction\", style=\"dim\", justify=\"right\")\n",
+ " total = len(df)\n",
+ " for outcome, count in df[\"Outcome\"].value_counts().items():\n",
+ " outcome_table.add_row(str(outcome), str(count), f\"{count / total:.0%}\")\n",
+ " console.print(outcome_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s5-iteration",
+ "metadata": {},
+ "source": [
+ "## 5. Iterating on the Agent\n",
+ "\n",
+ "The dataset in Langfuse is persistent โ you don't need to re-upload it. To evaluate a modified\n",
+ "agent, call `run_experiment` again with a new `name` argument. Langfuse will create a new\n",
+ "experiment run and you can compare runs side-by-side in the UI.\n",
+ "\n",
+ "### Levers to Explore\n",
+ "\n",
+ "- **System prompt** โ edit `SYSTEM_INSTRUCTIONS_TEMPLATE` in `system_instructions.py` to change\n",
+ " the search strategy, verification rules, or final answer format\n",
+ "- **Planning** โ toggle `enable_planning=False` to skip PlanReAct and compare quality vs. speed\n",
+ "- **Model** โ change the Gemini model in `KnowledgeGroundedAgent` for different capability/cost trade-offs\n",
+ "- **Dataset** โ change the `category` filter in Section 1 or increase `samples` to cover more examples\n",
+ "\n",
+ "### What to Look for in Langfuse\n",
+ "\n",
+ "- Items with **low F1** โ did the agent fail to fetch the source? Stop early? Misread the question?\n",
+ "- Items with **`correct_with_extraneous`** โ is the agent over-generating? Can the prompt be tightened?\n",
+ "- **Latency outliers** โ which steps are slow? Is replanning happening unnecessarily?"
+ ]
+ },
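+ {
+ "cell_type": "markdown",
+ "id": "s5-rerun-sketch",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of such a follow-up run: the same dataset and grader, but with planning\n",
+ "disabled. It reuses `DATASET_NAME` and `deepsearchqa_evaluator` from Section 3; the task\n",
+ "function, run name, and description are illustrative placeholders, not part of the library API.\n",
+ "\n",
+ "```python\n",
+ "from typing import Any\n",
+ "\n",
+ "\n",
+ "async def no_planning_task(*, item: Any, **kwargs: Any) -> str:\n",
+ "    \"\"\"Same shape as agent_task above, but with PlanReAct disabled.\"\"\"\n",
+ "    agent = KnowledgeGroundedAgent(enable_planning=False)\n",
+ "    response = await agent.answer_async(item.input)\n",
+ "    return response.text\n",
+ "\n",
+ "\n",
+ "run_experiment(\n",
+ "    DATASET_NAME,\n",
+ "    name=\"knowledge-agent-no-planning\",  # any new name creates a separate experiment run\n",
+ "    task=no_planning_task,\n",
+ "    evaluators=[deepsearchqa_evaluator],\n",
+ "    description=\"Same dataset, planning disabled.\",\n",
+ "    max_concurrency=1,\n",
+ ")\n",
+ "```"
+ ]
+ },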
+ {
+ "cell_type": "markdown",
+ "id": "summary",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "In this notebook you:\n",
+ "\n",
+ "1. **Uploaded** a DeepSearchQA subset to Langfuse as a persistent, reusable dataset\n",
+ "2. **Understood** the LLM-as-judge grader: precision, recall, F1, and the four outcome categories\n",
+ "3. **Walked through** a single-sample evaluation end-to-end\n",
+ "4. **Ran** a full experiment with `run_experiment` and inspected item-level scores\n",
+ "5. **Learned** how to iterate: re-run with a new experiment name to compare configurations in Langfuse\n",
+ "\n",
+ "The evaluation pipeline is the foundation for systematic agent improvement โ each iteration\n",
+ "produces a new experiment run that you can compare to the baseline in the Langfuse UI."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "done",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "console.print(Panel(\"[green]โ[/green] Notebook complete!\", title=\"Done\", border_style=\"green\"))\n",
+ "if experiment_result.dataset_run_url:\n",
+ " display(\n",
+ " HTML(\n",
+ " f'View experiment results: {experiment_result.dataset_run_url}
'\n",
+ " )\n",
+ " )"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/implementations/knowledge_qa/03_multi_turn.ipynb b/implementations/knowledge_qa/03_multi_turn.ipynb
deleted file mode 100644
index a0e3909..0000000
--- a/implementations/knowledge_qa/03_multi_turn.ipynb
+++ /dev/null
@@ -1,312 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "# 03: Multi-Turn Conversations & Evaluation\n\nThis notebook demonstrates multi-turn conversation capabilities and\nhow to evaluate the agent on the DeepSearchQA benchmark.\n\n## Learning Objectives\n\n- Understand how ADK manages multi-turn conversations via sessions\n- Use the `DeepSearchQAEvaluator` for systematic evaluation\n- Analyze evaluation results with rich visualizations\n- Understand evaluation metrics for research agents"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Setup: Load environment and configure rich console\n",
- "import uuid\n",
- "\n",
- "from aieng.agent_evals import (\n",
- " create_console,\n",
- " display_evaluation_result,\n",
- " display_metrics_table,\n",
- " display_success,\n",
- ")\n",
- "from aieng.agent_evals.knowledge_qa import (\n",
- " DeepSearchQADataset,\n",
- " DeepSearchQAEvaluator,\n",
- " KnowledgeGroundedAgent,\n",
- ")\n",
- "from dotenv import load_dotenv\n",
- "from rich.panel import Panel\n",
- "from rich.table import Table\n",
- "\n",
- "\n",
- "console = create_console()\n",
- "load_dotenv(verbose=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## 1. Multi-Turn Conversations with ADK\n\nThe `KnowledgeGroundedAgent` uses Google ADK's built-in session management via `InMemorySessionService`.\nWhen you pass a `session_id` to `answer_async()`, ADK maintains conversation history automatically.\n\nKey points:\n- Each unique `session_id` creates a separate conversation thread\n- ADK tracks all messages, tool calls, and context within that session\n- No manual history tracking needed - ADK handles it internally"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create agent and demonstrate multi-turn conversation\n",
- "agent = KnowledgeGroundedAgent()\n",
- "\n",
- "# Create a session ID for multi-turn conversation\n",
- "session_id = str(uuid.uuid4())\n",
- "\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[cyan]Session ID:[/cyan] {session_id}\\n\\nADK will track conversation history for this session automatically.\",\n",
- " title=\"๐จ๏ธ New Session Created\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# First turn - ask a question\n",
- "response1 = await agent.answer_async(\"What is the capital of France?\", session_id=session_id)\n",
- "console.print(Panel(response1.text, title=\"Turn 1: Capital of France\", border_style=\"blue\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Second turn - follow-up question (ADK remembers the context)\n",
- "response2 = await agent.answer_async(\"What is its population?\", session_id=session_id)\n",
- "console.print(Panel(response2.text, title=\"Turn 2: Population (follow-up)\", border_style=\"blue\"))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## 2. Session Management in Applications\n\nFor web applications (like Gradio), you can store a session ID in the app's state:\n\n```python\n# In a Gradio app handler:\nif \"session_id\" not in session_state:\n session_state[\"session_id\"] = str(uuid.uuid4())\n\nresponse = await agent.answer_async(query, session_id=session_state[\"session_id\"])\n```\n\nSee `gradio_app.py` for a complete example."
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# For more details on ADK sessions, see:\n",
- "# https://google.github.io/adk-docs/sessions/\n",
- "\n",
- "display_success(\"Multi-turn conversation demo complete!\", console=console)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Running DeepSearchQA Evaluation\n",
- "\n",
- "The `DeepSearchQAEvaluator` provides a systematic way to evaluate the agent."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create evaluator using the existing agent\n",
- "evaluator = DeepSearchQAEvaluator(agent)\n",
- "\n",
- "display_success(f\"Dataset size: {len(evaluator.dataset)} examples\", console=console)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Evaluate a small sample\n",
- "console.print(\"[bold]๐ฌ Running evaluation on 3 examples...[/bold]\\n\")\n",
- "\n",
- "console.print(\"[dim]Evaluating...[/dim]\")\n",
- "results = await evaluator.evaluate_sample_async(n=3, random_state=42)\n",
- "\n",
- "display_success(f\"Completed {len(results)} evaluations\", console=console)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# View results using the display utility\n",
- "console.print(\"\\n[bold]๐ Evaluation Results[/bold]\\n\")\n",
- "\n",
- "for result in results:\n",
- " contains_answer = result.ground_truth.lower() in result.prediction.lower()\n",
- " display_evaluation_result(\n",
- " example_id=result.example_id,\n",
- " problem=result.problem,\n",
- " ground_truth=result.ground_truth,\n",
- " prediction=result.prediction,\n",
- " sources_used=result.sources_used,\n",
- " search_queries=result.search_queries,\n",
- " contains_answer=contains_answer,\n",
- " console=console,\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 4. Analyzing Evaluation Results"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Convert to DataFrame for analysis\n",
- "df = evaluator.results_to_dataframe(results)\n",
- "\n",
- "# Calculate metrics\n",
- "containment_correct = sum(1 for r in results if r.ground_truth.lower() in r.prediction.lower())\n",
- "containment_accuracy = containment_correct / len(results) * 100\n",
- "\n",
- "metrics = {\n",
- " \"Total Examples\": len(results),\n",
- " \"Containment Accuracy\": f\"{containment_accuracy:.1f}%\",\n",
- " \"Avg Sources Used\": df[\"sources_used\"].mean(),\n",
- " \"Avg Search Queries\": df[\"search_queries\"].apply(len).mean(),\n",
- "}\n",
- "\n",
- "display_metrics_table(metrics, title=\"Evaluation Metrics\", console=console)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 5. Understanding Evaluation Metrics\n",
- "\n",
- "For research agents, we care about:\n",
- "\n",
- "1. **Answer Correctness**: Does the prediction match the ground truth?\n",
- "2. **Source Quality**: Are the sources relevant and authoritative?\n",
- "3. **Comprehensiveness**: Did the agent find all necessary information?\n",
- "4. **Search Efficiency**: How many searches were needed?\n",
- "\n",
- "DeepSearchQA specifically measures:\n",
- "- **Precision**: Quality of the answer\n",
- "- **Recall**: Completeness of the answer (for list-type questions)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Manual correctness check with better display\n",
- "def check_answer_contains_ground_truth(prediction: str, ground_truth: str) -> bool:\n",
- " \"\"\"Check if prediction contains the ground truth answer.\"\"\"\n",
- " return ground_truth.lower() in prediction.lower()\n",
- "\n",
- "\n",
- "# Check our results\n",
- "console.print(\"\\n[bold]๐ Correctness Check[/bold]\\n\")\n",
- "\n",
- "result_table = Table(show_header=True, header_style=\"bold cyan\")\n",
- "result_table.add_column(\"Example\", style=\"cyan\")\n",
- "result_table.add_column(\"Status\", style=\"white\")\n",
- "result_table.add_column(\"Expected\", style=\"dim\")\n",
- "\n",
- "for result in results:\n",
- " contains = check_answer_contains_ground_truth(result.prediction, result.ground_truth)\n",
- " status = \"[green]โ MATCH[/green]\" if contains else \"[yellow]โ NO MATCH[/yellow]\"\n",
- " result_table.add_row(\n",
- " str(result.example_id),\n",
- " status,\n",
- " result.ground_truth[:40] + \"...\" if len(result.ground_truth) > 40 else result.ground_truth,\n",
- " )\n",
- "\n",
- "console.print(result_table)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 6. Exploring Categories"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get examples from a specific category\n",
- "dataset = DeepSearchQADataset()\n",
- "categories = dataset.get_categories()\n",
- "\n",
- "cat_table = Table(title=\"๐ Available Categories\", show_header=True, header_style=\"bold green\")\n",
- "cat_table.add_column(\"Category\", style=\"white\")\n",
- "cat_table.add_column(\"Count\", style=\"cyan\", justify=\"right\")\n",
- "\n",
- "for cat in sorted(categories):\n",
- " count = len(dataset.get_by_category(cat))\n",
- " cat_table.add_row(cat, str(count))\n",
- "\n",
- "console.print(cat_table)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## Summary\n\nIn this notebook, you learned:\n\n1. How ADK manages multi-turn conversations via `InMemorySessionService`\n2. How to use `session_id` for conversation continuity\n3. How to run systematic evaluations with `DeepSearchQAEvaluator`\n4. How to analyze evaluation results with rich visualizations\n5. Key metrics for evaluating research agents\n\n## Next Steps\n\n- Run the Gradio app for interactive testing\n- Experiment with different models (gemini-2.5-pro vs flash)\n- Try the async evaluator for larger-scale evaluation\n- Implement LLM-as-judge evaluation for more nuanced correctness checking"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "console.print(\n",
- " Panel(\n",
- " \"[green]โ[/green] Notebook complete!\\n\\n\"\n",
- " \"[cyan]Next:[/cyan] Run [bold]gradio_app.py[/bold] for interactive testing.\",\n",
- " title=\"๐ Done\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/implementations/knowledge_qa/README.md b/implementations/knowledge_qa/README.md
index d796e37..d0617bd 100644
--- a/implementations/knowledge_qa/README.md
+++ b/implementations/knowledge_qa/README.md
@@ -1,19 +1,18 @@
# Knowledge-Grounded QA Agent
-This implementation demonstrates a knowledge-grounded question answering agent using **Google ADK** with explicit **Google Search tool calls**, evaluated on the **DeepSearchQA** benchmark.
+This implementation demonstrates a knowledge-grounded question answering agent using **Google ADK** with a **PlanReAct** architecture, evaluated on the **DeepSearchQA** benchmark.
## Overview
-The knowledge agent uses a ReAct (Reasoning + Acting) architecture powered by Google ADK. It explicitly calls Google Search as a tool, making the reasoning process transparent through observable Thought โ Action โ Observation cycles. This approach searches the live web to find relevant information for questions requiring real-time data.
+The agent combines two patterns: **PlanReAct** (creates an explicit numbered research plan before executing) and a **ReAct loop** within each step (Thought → Tool Call → Observation). It searches the live web to find and verify facts rather than relying on training data.
## Features
-- **ReAct Architecture**: Explicit tool calls with traceable reasoning (Thought โ Action โ Observation)
-- **Google Search Tool**: Uses ADK's `GoogleSearchTool` for real-time web search
-- **Source Citation**: Automatically extracts and includes source URLs from search results
-- **DeepSearchQA Evaluation**: Built-in evaluation on the DeepSearchQA benchmark (900 research tasks)
+- **PlanReAct Architecture**: Explicit research plan with step statuses, revised mid-run if needed
+- **Five Tools**: `google_search`, `web_fetch`, `fetch_file`, `grep_file`, `read_file`
+- **Source Citation**: Extracts and cites source URLs from search results
+- **DeepSearchQA Evaluation**: LLM-as-judge evaluation on the DeepSearchQA benchmark (896 questions)
- **Multi-turn Conversations**: Session management via ADK's `InMemorySessionService`
-- **Gradio Interface**: Interactive chat UI for testing
## Setup
@@ -36,14 +35,6 @@ uv sync
## Usage
-### Interactive Chat
-
-Run the Gradio app:
-
-```bash
-uv run --env-file .env gradio implementations/knowledge_qa/gradio_app.py
-```
-
### Programmatic Usage
```python
@@ -64,43 +55,66 @@ print(f"Tool calls: {response.tool_calls}")
### Evaluation on DeepSearchQA
-```python
-from aieng.agent_evals.knowledge_qa import (
- KnowledgeGroundedAgent,
- DeepSearchQAEvaluator,
-)
+Use the main evaluation script to run comprehensive evaluations:
-agent = KnowledgeGroundedAgent()
-evaluator = DeepSearchQAEvaluator(agent)
+```bash
+# Run evaluation on 3 samples
+python implementations/knowledge_qa/evaluate.py --samples 3
-# Evaluate a sample (use await in Jupyter)
-results = await evaluator.evaluate_sample_async(n=10, random_state=42)
+# Run with specific example IDs
+python implementations/knowledge_qa/evaluate.py --ids 123 456 789
-# Convert to DataFrame for analysis
-df = evaluator.results_to_dataframe(results)
-print(df[["example_id", "ground_truth", "prediction", "sources_used"]])
+# Enable trace groundedness evaluation
+ENABLE_TRACE_GROUNDEDNESS=true python implementations/knowledge_qa/evaluate.py
+```
+
+Or use the CLI:
+
+```bash
+# Run evaluation via CLI
+uv run --env-file .env knowledge-qa eval --samples 3
+uv run --env-file .env knowledge-qa eval --ids 123 456 --show-plan
+```
+
+## Run with ADK Web UI
+
+The module exposes a top-level `root_agent` for ADK discovery, so you can inspect the agent interactively:
+
+```bash
+uv run adk web --port 8000 --reload --reload_agents implementations/
```
## Notebooks
-1. **01_grounding_basics.ipynb**: Introduction to the knowledge agent and Google Search tool
-2. **02_agent_basics.ipynb**: Creating agents with custom instructions
-3. **03_multi_turn.ipynb**: Multi-turn conversations and DeepSearchQA evaluation
+1. **01_dataset_and_tools.ipynb**: The DeepSearchQA dataset and the agent's five tools
+2. **02_running_the_agent.ipynb**: PlanReAct architecture, live progress display, multi-turn conversations, and Langfuse tracing
+3. **03_evaluation.ipynb**: Systematic evaluation with `run_experiment`, LLM-as-judge grading, and result inspection
## Architecture
```
aieng.agent_evals.knowledge_qa/
-├── config.py             # Configuration (Pydantic settings)
-├── grounding_tool.py     # GoogleSearchTool wrapper and response models
-├── agent.py              # KnowledgeGroundedAgent (ADK Agent + Runner)
-├── session.py            # Conversation session management
-└── evaluation.py         # DeepSearchQA dataset and evaluator
+├── agent.py                 # KnowledgeGroundedAgent (ADK Agent + Runner)
+├── data/                    # DeepSearchQA dataset loader
+├── deepsearchqa_grader.py   # LLM-as-judge evaluation
+├── planner.py               # Research planning
+├── token_tracker.py         # Token usage tracking
+└── cli.py                   # Rich CLI interface
+
+aieng.agent_evals/
+├── configs.py               # Configuration (Pydantic settings)
+├── evaluation/              # Evaluation harness
+│   ├── experiment.py        # Langfuse experiment runner
+│   └── graders/             # Evaluators (trace groundedness, etc.)
+└── tools/                   # Shared tools
+    ├── search.py            # GoogleSearchTool wrapper
+    ├── web.py               # web_fetch for HTML/PDF
+    └── file.py              # fetch_file, grep_file, read_file
```
## DeepSearchQA Dataset
-The [DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) benchmark consists of 900 "causal chain" research tasks across 17 categories. These questions require:
+The [DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) benchmark consists of 896 "causal chain" research tasks across 17 categories. These questions require:
- Multi-source lookups
- Statistical comparisons
diff --git a/implementations/knowledge_qa/agent.py b/implementations/knowledge_qa/agent.py
new file mode 100644
index 0000000..d386b0c
--- /dev/null
+++ b/implementations/knowledge_qa/agent.py
@@ -0,0 +1,21 @@
+"""ADK discovery entrypoint for the Knowledge QA agent.
+
+Exposes a module-level ``root_agent`` so ``adk web`` can discover it.
+
+Examples
+--------
+Run with ``adk web``:
+ uv run adk web --port 8000 --reload --reload_agents implementations/
+"""
+
+import logging
+
+from aieng.agent_evals.knowledge_qa.agent import KnowledgeGroundedAgent
+
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+
+# ADK discovery expects a module-level `root_agent`
+root_agent = KnowledgeGroundedAgent().adk_agent