diff --git a/aieng-eval-agents/aieng/agent_evals/configs.py b/aieng-eval-agents/aieng/agent_evals/configs.py
index 390847b..61f7542 100644
--- a/aieng-eval-agents/aieng/agent_evals/configs.py
+++ b/aieng-eval-agents/aieng/agent_evals/configs.py
@@ -96,11 +96,6 @@ class Configs(BaseSettings):
validation_alias=AliasChoices("OPENAI_API_KEY", "GEMINI_API_KEY", "GOOGLE_API_KEY"),
description="API key for OpenAI-compatible API (accepts OPENAI_API_KEY, GEMINI_API_KEY, or GOOGLE_API_KEY).",
)
- gemini_api_key: SecretStr = Field(
- default=SecretStr("default-gemini-api-key"), # setting a default so some implementations can run without it
- validation_alias=AliasChoices("GEMINI_API_KEY", "GOOGLE_API_KEY"),
- description="API key for Google/Gemini API (accepts GEMINI_API_KEY, or GOOGLE_API_KEY).",
- )
default_planner_model: str = Field(
default="gemini-2.5-pro",
description="Model name for planning/complex reasoning tasks.",
diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py
index 0c02325..bc9ba87 100644
--- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py
+++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py
@@ -7,6 +7,7 @@
import asyncio
import logging
+import os
import time
import uuid
import warnings
@@ -235,6 +236,10 @@ def __init__(
if thinking_budget > 0 and self._supports_thinking(self.model):
thinking_config = types.ThinkingConfig(thinking_budget=thinking_budget)
+ # Google ADK reads GOOGLE_API_KEY from the environment directly.
+ # Bridge from OPENAI_API_KEY (or GEMINI_API_KEY) if not already set.
+ os.environ.setdefault("GOOGLE_API_KEY", config.openai_api_key.get_secret_value())
+
self._agent = Agent(
name="knowledge_qa",
model=self.model,
@@ -345,6 +350,11 @@ def reset(self) -> None:
)
logger.debug("Agent state reset for new question")
+ @property
+ def adk_agent(self) -> Agent:
+ """Return the underlying ADK agent, e.g. for use with ``adk web``."""
+ return self._agent
+
@property
def current_plan(self) -> ResearchPlan | None:
"""Get the current research plan if one exists."""
diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py
new file mode 100644
index 0000000..627f173
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/notebook.py
@@ -0,0 +1,339 @@
+"""Notebook display utilities for the Knowledge Agent.
+
+Provides live progress display for Jupyter notebooks, showing plan status
+and tool calls while the agent works, and formatted rendering of agent responses.
+
+Example
+-------
+>>> from aieng.agent_evals.knowledge_qa import KnowledgeGroundedAgent
+>>> from aieng.agent_evals.knowledge_qa.notebook import (
+... display_response,
+... run_with_display,
+... )
+>>> from rich.console import Console
+>>> agent = KnowledgeGroundedAgent(enable_planning=True)
+>>> console = Console()
+>>> response = await run_with_display(agent, "What is quantum computing?")
+>>> display_response(console, response.text)
+"""
+
+import asyncio
+import logging
+import re
+from typing import TYPE_CHECKING
+
+from IPython.display import HTML, clear_output, display
+from rich.console import Console
+from rich.markdown import Markdown
+from rich.panel import Panel
+
+from .plan_parsing import StepStatus
+
+
+if TYPE_CHECKING:
+ from .agent import AgentResponse, KnowledgeGroundedAgent
+ from .plan_parsing import ResearchPlan
+
+
+class ToolCallCapture(logging.Handler):
+ """Captures tool calls from agent logs for display."""
+
+ def __init__(self):
+ super().__init__()
+ self.tool_calls: list[dict] = []
+
+ def emit(self, record):
+ """Capture tool call and response log messages."""
+ msg = record.getMessage()
+ if "Tool call:" in msg:
+ try:
+ parts = msg.split("Tool call: ", 1)[1]
+ paren_idx = parts.find("(")
+ if paren_idx > 0:
+ tool_name = parts[:paren_idx]
+ args_str = parts[paren_idx + 1 : -1]
+ if len(args_str) > 60:
+ args_str = args_str[:57] + "..."
+ self.tool_calls.append({"name": tool_name, "args": args_str, "completed": False})
+ except Exception:
+ pass
+ elif "Tool response:" in msg:
+ try:
+ parts = msg.split("Tool response: ", 1)[1]
+ tool_name = parts.split(" ")[0]
+ for tc in reversed(self.tool_calls):
+ if tc["name"] == tool_name and not tc["completed"]:
+ tc["completed"] = True
+ break
+ except Exception:
+ pass
+
+
+def _format_plan_html(plan: "ResearchPlan") -> str:
+ """Format the research plan as HTML."""
+    lines = ['<div style="font-family: sans-serif; margin: 8px 0;">']
+    lines.append('<div style="font-weight: bold; margin-bottom: 4px;">📋 Research Plan</div>')
+
+ for step in plan.steps:
+ if step.status == StepStatus.COMPLETED:
+            icon, color = "✓", "#28a745"
+        elif step.status == StepStatus.FAILED:
+            icon, color = "✗", "#dc3545"
+        elif step.status == StepStatus.IN_PROGRESS:
+            icon, color = "▶", "#ffc107"
+        elif step.status == StepStatus.SKIPPED:
+            icon, color = "⊘", "#6c757d"
+        else:
+            icon, color = "○", "#adb5bd"
+
+        lines.append(f'<div style="color: {color};">{icon} {step.step_id}. {step.description}</div>')
+
+    lines.append("</div>")
+ return "\n".join(lines)
+
+
+def _format_tools_html(tool_calls: list[dict]) -> str:
+ """Format tool calls as HTML."""
+ if not tool_calls:
+        return '<div style="color: #6c757d;">Waiting for tool calls...</div>'
+
+    lines = [
+        '<div style="font-family: sans-serif; margin: 8px 0;">'
+    ]
+    lines.append(f'<div style="font-weight: bold; margin-bottom: 4px;">🔧 Tool Calls ({len(tool_calls)})</div>')
+
+    # Show last 8 tool calls
+    display_calls = tool_calls[-8:]
+    if len(tool_calls) > 8:
+        lines.append(f'<div style="color: #6c757d;">... ({len(tool_calls) - 8} earlier calls)</div>')
+
+ tool_icons = {
+ "google_search": "๐",
+ "google_search_agent": "๐",
+ "fetch_url": "๐",
+ "web_fetch": "๐",
+ "read_pdf": "๐",
+ "grep_file": "๐",
+ "read_file": "๐",
+ }
+
+ for tc in display_calls:
+ name = tc["name"]
+ if name == "google_search_agent":
+ name = "google_search"
+        icon = tool_icons.get(name, "🔧")
+        status_icon = "✓" if tc.get("completed") else "…"
+ status_color = "#28a745" if tc.get("completed") else "#ffc107"
+
+ lines.append(
+            f'<div style="margin: 2px 0;">'
+            f'<span style="color: {status_color};">{status_icon}</span> '
+            f"{icon} <b>{name}</b> "
+            f'<span style="color: #6c757d;">{tc["args"]}</span>'
+            f"</div>"
+ )
+
+ lines.append("
")
+ return "\n".join(lines)
+
+
+def _format_display_html(plan: "ResearchPlan | None", tool_calls: list[dict], question: str) -> str:
+ """Create the full HTML display."""
+    html = ['<div style="font-family: sans-serif;">']
+
+ # Question
+ html.append(
+        f'<div style="margin-bottom: 8px;">'
+        f"<b>Question:</b> {question}</div>"
+ )
+
+ # Plan
+ if plan and plan.steps:
+ html.append(_format_plan_html(plan))
+
+ # Tools
+ html.append(_format_tools_html(tool_calls))
+
+ html.append("
")
+ return "\n".join(html)
+
+
+def _parse_response_sections(text: str) -> tuple[str, list[str], str]:
+ """Extract answer, sources, and reasoning from structured agent response text.
+
+ The agent formats its final response as::
+
+        ANSWER: <answer text>
+        SOURCES: <one or more URLs>
+        REASONING: <short explanation>
+
+ Parameters
+ ----------
+ text : str
+ Raw response text from the agent.
+
+ Returns
+ -------
+ tuple[str, list[str], str]
+ ``(answer, sources, reasoning)`` where *sources* is a list of URLs.
+ If the text does not contain the expected sections, the full text is
+ returned as the answer with empty sources and reasoning.
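+
+    Example
+    -------
+    A minimal case with only an ``ANSWER`` section (sources and reasoning
+    default to empty):
+
+    >>> _parse_response_sections("ANSWER: 42")
+    ('42', [], '')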
+ """
+ answer_match = re.search(r"ANSWER:\s*(.*?)(?=\n\s*SOURCES:|\n\s*REASONING:|$)", text, re.DOTALL | re.IGNORECASE)
+ sources_match = re.search(r"SOURCES:\s*(.*?)(?=\n\s*ANSWER:|\n\s*REASONING:|$)", text, re.DOTALL | re.IGNORECASE)
+ reasoning_match = re.search(r"REASONING:\s*(.*?)(?=\n\s*ANSWER:|\n\s*SOURCES:|$)", text, re.DOTALL | re.IGNORECASE)
+
+ answer = answer_match.group(1).strip() if answer_match else text
+ sources_raw = sources_match.group(1).strip() if sources_match else ""
+ reasoning = reasoning_match.group(1).strip() if reasoning_match else ""
+
+ # Sources may be newline- or comma-separated URLs
+ sources = [s.strip() for s in re.split(r"[\n,]+", sources_raw) if s.strip().startswith("http")]
+
+ return answer, sources, reasoning
+
+
+def display_response(
+ console: Console,
+ text: str,
+ title: str = "Answer",
+ subtitle: str | None = None,
+) -> None:
+ """Display a structured agent response with separated, styled sections.
+
+ Parses the ``ANSWER`` / ``SOURCES`` / ``REASONING`` structure from the
+ agent's final response text and renders each section with appropriate Rich
+ styling: the answer in a cyan panel, sources in a dimmed panel, and
+ reasoning in a muted panel.
+
+ Parameters
+ ----------
+ console : Console
+ Rich console to render to.
+ text : str
+ Raw response text from the agent.
+ title : str, optional
+ Panel title for the answer section (default ``"Answer"``).
+ subtitle : str, optional
+ Panel subtitle, e.g. duration and tool-call count.
+
+ Example
+ -------
+ >>> duration = f"{response.total_duration_ms / 1000:.1f}s"
+ >>> display_response(console, response.text, subtitle=duration)
+ """
+ answer, sources, reasoning = _parse_response_sections(text)
+
+ console.print(Panel(Markdown(answer), title=title, border_style="cyan", subtitle=subtitle))
+
+ if sources:
+ src_lines = "\n".join(f" [blue]{src}[/blue]" for src in sources[:6])
+ console.print(Panel(src_lines, title="Sources", border_style="dim", padding=(0, 1)))
+
+ if reasoning:
+ console.print(Panel(Markdown(reasoning), title="[dim]Reasoning[/dim]", border_style="dim", padding=(0, 1)))
+
+
+async def run_with_display(
+ agent: "KnowledgeGroundedAgent",
+ question: str,
+ refresh_rate: float = 0.5,
+) -> "AgentResponse":
+ """Run the agent with live progress display in a Jupyter notebook.
+
+ Shows the research plan checklist and tool calls while the agent works,
+ updating the display periodically.
+
+ Parameters
+ ----------
+ agent : KnowledgeGroundedAgent
+ The agent to run.
+ question : str
+ The question to answer.
+ refresh_rate : float
+ How often to update the display in seconds (default 0.5).
+
+ Returns
+ -------
+ AgentResponse
+ The agent's response.
+
+ Example
+ -------
+ >>> agent = KnowledgeGroundedAgent(enable_planning=True)
+ >>> response = await run_with_display(agent, "What is quantum computing?")
+ >>> print(response.text)
+ """
+ # Suppress verbose logging from external libraries (same as CLI)
+ verbose_loggers = ["google.adk", "google.genai", "httpx", "httpcore"]
+ original_levels = {}
+ for name in verbose_loggers:
+ _logger = logging.getLogger(name)
+ original_levels[name] = _logger.level
+ _logger.setLevel(logging.ERROR)
+ _logger.propagate = False
+
+ # Set up tool call capture on the agent logger (same as CLI)
+ tool_capture = ToolCallCapture()
+ tool_capture.setLevel(logging.INFO)
+ agent_logger = logging.getLogger("aieng.agent_evals.knowledge_qa.agent")
+ original_agent_level = agent_logger.level
+ original_handlers = agent_logger.handlers.copy()
+ agent_logger.handlers.clear()
+ agent_logger.addHandler(tool_capture)
+ agent_logger.setLevel(logging.INFO)
+ agent_logger.propagate = False
+
+ try:
+ # Create the plan first if planning is enabled
+ if agent.enable_planning and hasattr(agent, "create_plan_async"):
+ clear_output(wait=True)
+            display(HTML('<div style="color: #6c757d;">Creating research plan...</div>'))
+ await agent.create_plan_async(question)
+
+ # Start the agent task
+ task = asyncio.create_task(agent.answer_async(question))
+
+ # Update display while agent works
+ while not task.done():
+ clear_output(wait=True)
+ display(
+ HTML(
+ _format_display_html(
+ plan=agent.current_plan if hasattr(agent, "current_plan") else None,
+ tool_calls=tool_capture.tool_calls,
+ question=question,
+ )
+ )
+ )
+ await asyncio.sleep(refresh_rate)
+
+ # Get the result
+ response = await task
+
+ # Final display with completion status
+ clear_output(wait=True)
+ display(
+ HTML(
+ _format_display_html(
+ plan=agent.current_plan if hasattr(agent, "current_plan") else None,
+ tool_calls=tool_capture.tool_calls,
+ question=question,
+ )
+                + f'<div style="color: #28a745; font-weight: bold; margin-top: 8px;">'
+                f"✓ Complete in {response.total_duration_ms / 1000:.1f}s | "
+                f"{len(response.tool_calls)} tool calls | "
+                f"{len(response.sources)} sources</div>"
+ )
+ )
+
+ return response
+
+ finally:
+ # Clean up logging - restore original state
+ agent_logger.removeHandler(tool_capture)
+ agent_logger.handlers = original_handlers
+ agent_logger.setLevel(original_agent_level)
+ agent_logger.propagate = True
+
+ # Restore verbose logger levels
+ for name, level in original_levels.items():
+ logging.getLogger(name).setLevel(level)
diff --git a/aieng-eval-agents/aieng/agent_evals/logging_config.py b/aieng-eval-agents/aieng/agent_evals/logging_config.py
new file mode 100644
index 0000000..b9bb75a
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/logging_config.py
@@ -0,0 +1,87 @@
+"""Logging configuration with colors and clean output.
+
+This module provides a clean, colored logging setup for agent evaluations
+using the rich library. It reuses the console infrastructure from display.py
+for consistent styling across the codebase.
+"""
+
+import logging
+
+from rich.logging import RichHandler
+
+from .display import create_console
+
+
+def setup_logging(
+ level: int = logging.INFO,
+ show_time: bool = True,
+ show_path: bool = False,
+) -> None:
+ """Configure colored logging with rich.
+
+ Uses the same console theme as display.py for consistent styling.
+
+ Parameters
+ ----------
+ level : int, optional
+ Logging level, by default logging.INFO.
+ show_time : bool, optional
+ Whether to show timestamps, by default True.
+ show_path : bool, optional
+ Whether to show file path in logs, by default False.
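+
+    Example
+    -------
+    A typical CLI entry point might call this once at startup:
+
+    >>> import logging
+    >>> from aieng.agent_evals.logging_config import setup_logging
+    >>> setup_logging(level=logging.DEBUG, show_path=True)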
+ """
+ # Reuse display console with force_jupyter=False for CLI
+ console = create_console(force_jupyter=False)
+
+ # Configure rich handler with clean formatting
+ rich_handler = RichHandler(
+ console=console,
+ show_time=show_time,
+ show_path=show_path,
+ markup=True,
+ rich_tracebacks=True,
+ tracebacks_show_locals=False,
+ omit_repeated_times=False,
+ )
+
+ # Simple format - rich handles styling
+ rich_handler.setFormatter(logging.Formatter("%(message)s", datefmt="[%X]"))
+
+ # Configure root logger
+ logging.basicConfig(
+ level=level,
+ format="%(message)s",
+ datefmt="[%X]",
+ handlers=[rich_handler],
+ force=True,
+ )
+
+ # Silence noisy third-party libraries
+ _silence_third_party_loggers()
+
+
+def _silence_third_party_loggers() -> None:
+ """Reduce noise from third-party libraries.
+
+ Sets logging levels for common noisy libraries to WARNING or ERROR
+ to keep evaluation output clean and focused on agent behavior.
+ """
+ # Google SDK libraries - only warnings and above
+ for logger_name in [
+ "google_adk",
+ "google_genai",
+ "google.adk",
+ "google.genai",
+ ]:
+ logging.getLogger(logger_name).setLevel(logging.WARNING)
+
+ # Tracing/observability - only warnings
+ logging.getLogger("langfuse").setLevel(logging.WARNING)
+
+ # HTTP/network libraries - errors only
+ for logger_name in ["httpx", "httpcore", "urllib3"]:
+ logging.getLogger(logger_name).setLevel(logging.ERROR)
+
+ # System libraries
+ logging.getLogger("asyncio").setLevel(logging.WARNING)
+ logging.getLogger("py.warnings").setLevel(logging.ERROR)
diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py
index ba33840..2c0c1cc 100644
--- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py
+++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py
@@ -351,9 +351,9 @@ class TestKnowledgeGroundedAgent:
def mock_config(self):
"""Create a mock config for testing."""
config = MagicMock()
- config.gemini_api_key = "test-api-key"
config.default_worker_model = "gemini-2.5-flash"
config.default_temperature = 0.0
+ config.openai_api_key.get_secret_value.return_value = "test-api-key"
return config
@patch("aieng.agent_evals.knowledge_qa.agent.PlanReActPlanner")
@@ -489,6 +489,7 @@ def test_lazy_initialization(self, *_mocks):
mock_config = MagicMock()
mock_config.default_worker_model = "gemini-2.5-flash"
mock_config.default_temperature = 0.0
+ mock_config.openai_api_key.get_secret_value.return_value = "test-api-key"
mock_config_class.return_value = mock_config
manager = KnowledgeAgentManager(enable_caching=False, enable_compaction=False)
@@ -517,6 +518,7 @@ def test_close(self, *_mocks):
mock_config = MagicMock()
mock_config.default_worker_model = "gemini-2.5-flash"
mock_config.default_temperature = 0.0
+ mock_config.openai_api_key.get_secret_value.return_value = "test-api-key"
mock_config_class.return_value = mock_config
manager = KnowledgeAgentManager(enable_caching=False, enable_compaction=False)
diff --git a/implementations/knowledge_qa/01_dataset_and_tools.ipynb b/implementations/knowledge_qa/01_dataset_and_tools.ipynb
new file mode 100644
index 0000000..d16941c
--- /dev/null
+++ b/implementations/knowledge_qa/01_dataset_and_tools.ipynb
@@ -0,0 +1,318 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "intro",
+ "metadata": {},
+ "source": [
+ "# 01: The DeepSearchQA Dataset & Agent Tools\n",
+ "\n",
+ "This notebook introduces the two foundational components of the Knowledge QA system:\n",
+ "\n",
+ "- **DeepSearchQA** โ the benchmark dataset used to evaluate the agent\n",
+ "- **Agent tools** โ the five capabilities the agent uses to research and verify answers\n",
+ "\n",
+ "## What You'll Learn\n",
+ "\n",
+ "1. What the DeepSearchQA dataset contains and how to explore it\n",
+ "2. The five tools the agent has access to, and how it's instructed to use them\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "- `GOOGLE_API_KEY` set in your `.env` file\n",
+ "- Dependencies installed with `uv sync`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from aieng.agent_evals.knowledge_qa import DeepSearchQADataset\n",
+ "from aieng.agent_evals.knowledge_qa.system_instructions import build_system_instructions\n",
+ "from dotenv import load_dotenv\n",
+ "from rich.console import Console\n",
+ "from rich.markdown import Markdown\n",
+ "from rich.panel import Panel\n",
+ "from rich.table import Table\n",
+ "\n",
+ "\n",
+ "# Set working directory to the repository root\n",
+ "if Path(\"\").absolute().name == \"eval-agents\":\n",
+ " print(f\"Working directory: {Path('').absolute()}\")\n",
+ "else:\n",
+ " os.chdir(Path(\"\").absolute().parent.parent)\n",
+ " print(f\"Working directory set to: {Path('').absolute()}\")\n",
+ "\n",
+ "load_dotenv(verbose=True)\n",
+ "console = Console(width=100)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-intro",
+ "metadata": {},
+ "source": [
+ "## 1. The DeepSearchQA Dataset\n",
+ "\n",
+ "[DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) is a benchmark from Google DeepMind\n",
+ "for evaluating deep research agents. It contains 896 research questions requiring multi-step web search\n",
+ "and reasoning to answer correctly.\n",
+ "\n",
+ "Each question is a **causal chain task**: the agent must follow a chain of searches, fetch real sources,\n",
+ "and verify facts before answering โ not recall from training data.\n",
+ "\n",
+ "### Answer Types\n",
+ "\n",
+ "| Type | Description | Example |\n",
+ "|------|-------------|---------|\n",
+ "| **Single Answer** | One specific value | A date, a number, a proper name |\n",
+ "| **Set Answer** | Multiple required items | A list of countries, a set of policy changes |\n",
+ "\n",
+ "Evaluation uses an LLM-as-judge that computes **precision, recall, and F1** by comparing the agent's\n",
+ "answer to the ground truth item-by-item."
+ ]
+ },
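+  {
+   "cell_type": "markdown",
+   "id": "s1-scoring-sketch",
+   "metadata": {},
+   "source": [
+    "The judge decides which expected items an answer actually covers; precision, recall, and F1 then\n",
+    "follow from simple counting. The sketch below is an illustration only (hypothetical answer sets,\n",
+    "not the real LLM-as-judge), showing how the three scores relate for a Set Answer question."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "scoring-sketch",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustration only: the benchmark uses an LLM-as-judge to decide item-level matches.\n",
+    "# Here the matched items are assumed, so we can focus on the arithmetic.\n",
+    "predicted = {\"Canada\", \"Norway\", \"Chile\"}\n",
+    "expected = {\"Canada\", \"Norway\", \"Japan\", \"Iceland\"}\n",
+    "\n",
+    "true_positives = len(predicted & expected)\n",
+    "precision = true_positives / len(predicted)\n",
+    "recall = true_positives / len(expected)\n",
+    "f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0\n",
+    "\n",
+    "console.print(f\"Precision: {precision:.2f}  Recall: {recall:.2f}  F1: {f1:.2f}\")"
+   ]
+  },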
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "load-dataset",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = DeepSearchQADataset()\n",
+ "\n",
+ "console.print(f\"Total examples: [cyan]{len(dataset)}[/cyan]\")\n",
+ "console.print(f\"Categories: [cyan]{len(dataset.get_categories())}[/cyan]\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-structure",
+ "metadata": {},
+ "source": [
+ "### 1.1 Dataset Structure\n",
+ "\n",
+ "Each example is a `DSQAExample` with five fields. Let's look at one."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-structure",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "example = dataset[0]\n",
+ "\n",
+ "console.print(\n",
+ " Panel(\n",
+ " f\"[bold]example_id:[/bold] {example.example_id}\\n\"\n",
+ " f\"[bold]problem_category:[/bold] {example.problem_category}\\n\"\n",
+ " f\"[bold]answer_type:[/bold] {example.answer_type}\\n\\n\"\n",
+ " f\"[bold cyan]problem:[/bold cyan]\\n{example.problem}\\n\\n\"\n",
+ " f\"[bold yellow]answer:[/bold yellow]\\n{example.answer}\",\n",
+ " title=\"DSQAExample\",\n",
+ " border_style=\"blue\",\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-categories",
+ "metadata": {},
+ "source": [
+ "### 1.2 Categories\n",
+ "\n",
+ "The dataset spans 17 domains. Let's see how examples are distributed across them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "categories",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "categories = dataset.get_categories()\n",
+ "\n",
+ "cat_table = Table(title=\"Dataset by Category\")\n",
+ "cat_table.add_column(\"Category\", style=\"cyan\")\n",
+ "cat_table.add_column(\"Total\", style=\"white\", justify=\"right\")\n",
+ "cat_table.add_column(\"Single Answer\", style=\"dim\", justify=\"right\")\n",
+ "cat_table.add_column(\"Set Answer\", style=\"dim\", justify=\"right\")\n",
+ "\n",
+ "for cat in sorted(categories):\n",
+ " examples = dataset.get_by_category(cat)\n",
+ " single = sum(1 for e in examples if e.answer_type == \"Single Answer\")\n",
+ " set_ans = len(examples) - single\n",
+ " cat_table.add_row(cat, str(len(examples)), str(single), str(set_ans))\n",
+ "\n",
+ "console.print(cat_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-answer-types",
+ "metadata": {},
+ "source": [
+ "### 1.3 Answer Types in Practice\n",
+ "\n",
+ "The answer type matters for evaluation โ the grader treats \"Single Answer\" and \"Set Answer\"\n",
+ "differently when computing correctness."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "answer-types",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "single_ex = next(e for e in dataset.examples if e.answer_type == \"Single Answer\")\n",
+ "set_ex = next(e for e in dataset.examples if e.answer_type == \"Set Answer\")\n",
+ "\n",
+ "for label, ex, style in [\n",
+ " (\"Single Answer\", single_ex, \"green\"),\n",
+ " (\"Set Answer\", set_ex, \"yellow\"),\n",
+ "]:\n",
+ " console.print(\n",
+ " Panel(\n",
+ " f\"[bold cyan]Question:[/bold cyan]\\n{ex.problem}\\n\\n[bold yellow]Answer:[/bold yellow]\\n{ex.answer}\",\n",
+ " title=f\"{label} โ {ex.problem_category}\",\n",
+ " border_style=style,\n",
+ " )\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-browse",
+ "metadata": {},
+ "source": [
+ "### 1.4 Browsing Examples\n",
+ "\n",
+ "You can retrieve examples by category or by ID."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "browse",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Examples by category\n",
+ "finance_examples = dataset.get_by_category(\"Finance & Economics\")\n",
+ "console.print(f\"Finance & Economics: [cyan]{len(finance_examples)}[/cyan] examples\\n\")\n",
+ "\n",
+ "# Display a preview table\n",
+ "browse_table = Table(title=\"Finance & Economics โ First 5 Examples\")\n",
+ "browse_table.add_column(\"ID\", style=\"dim\", width=6)\n",
+ "browse_table.add_column(\"Answer Type\", style=\"cyan\", width=15)\n",
+ "browse_table.add_column(\"Question\", style=\"white\")\n",
+ "\n",
+ "for ex in finance_examples[:5]:\n",
+ " q = ex.problem[:75] + \"...\" if len(ex.problem) > 75 else ex.problem\n",
+ " browse_table.add_row(str(ex.example_id), ex.answer_type, q)\n",
+ "\n",
+ "console.print(browse_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s2-intro",
+ "metadata": {},
+ "source": [
+ "## 2. The Agent's Tools\n",
+ "\n",
+ "The `KnowledgeGroundedAgent` has five tools that form a natural research workflow:\n",
+ "\n",
+ "| Tool | Purpose | When the Agent Uses It |\n",
+ "|------|---------|----------------------|\n",
+ "| `google_search` | Find relevant URLs | First step for any sub-question |\n",
+ "| `web_fetch` | Read web pages and PDFs | To verify facts from the actual source |\n",
+ "| `fetch_file` | Download CSV, XLSX, JSON files | When the answer is in structured data |\n",
+ "| `grep_file` | Search within a downloaded file | To locate a specific value in a large file |\n",
+ "| `read_file` | Read sections of a downloaded file | To inspect a specific part of a downloaded file |\n",
+ "\n",
+ "**Why not answer from search snippets?** Snippets are brief and may be outdated or misleading.\n",
+ "The system instructions enforce a strict causal chain: **Search โ Fetch โ Verify โ Answer**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "system-instructions",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "instructions = build_system_instructions()\n",
+ "\n",
+ "console.print(\n",
+ " Panel(\n",
+ " Markdown(instructions),\n",
+ " title=\"Agent System Instructions\",\n",
+ " border_style=\"blue\",\n",
+ " padding=(1, 2),\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "summary",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "In this notebook you saw:\n",
+ "\n",
+ "1. **The DeepSearchQA dataset** โ 896 research questions across 17 categories, evaluated with\n",
+ " precision/recall/F1 using an LLM-as-judge\n",
+ "2. **The five agent tools** โ search, web fetch, file download, grep, and file read\n",
+ "3. **The system instructions** โ how the agent is guided to use its tools, including the\n",
+ " critical search โ fetch โ verify โ answer chain\n",
+ "\n",
+ "**Next:** In Notebook 02, we'll create the agent, run it on questions, and observe how it uses these tools."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "done",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "console.print(\n Panel(\n \"[green]โ[/green] Notebook complete!\\n\\n\"\n \"[cyan]Next:[/cyan] Open [bold]02_running_the_agent.ipynb[/bold] to run the agent.\",\n title=\"Done\",\n border_style=\"green\",\n )\n)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/implementations/knowledge_qa/01_grounding_basics.ipynb b/implementations/knowledge_qa/01_grounding_basics.ipynb
deleted file mode 100644
index 0401bb2..0000000
--- a/implementations/knowledge_qa/01_grounding_basics.ipynb
+++ /dev/null
@@ -1,200 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 01: Google Search Grounding with ADK\n",
- "\n",
- "This notebook introduces Google Search grounding using the Agent Development Kit (ADK),\n",
- "which provides explicit, traceable tool calls for web search.\n",
- "\n",
- "## Learning Objectives\n",
- "\n",
- "- Understand how Google Search grounding works with ADK\n",
- "- Use the `KnowledgeGroundedAgent` to make grounded queries\n",
- "- See explicit tool calls in the agent's reasoning\n",
- "- Compare grounded vs non-grounded responses"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_comparison,\n display_response,\n display_source_table,\n)\nfrom aieng.agent_evals.knowledge_qa import KnowledgeAgentConfig, KnowledgeGroundedAgent\nfrom dotenv import load_dotenv\nfrom google import genai\nfrom rich.panel import Panel\n\n\nconsole = create_console()\nload_dotenv(verbose=True)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Understanding Google Search Grounding with ADK\n",
- "\n",
- "The Agent Development Kit (ADK) provides a `GoogleSearchTool` that enables:\n",
- "\n",
- "1. **Explicit Tool Calls**: The agent decides when to search and you can see each call\n",
- "2. **ReAct Pattern**: Thought โ Action โ Observation loop is visible\n",
- "3. **Traceable**: Every search query and result is logged\n",
- "4. **Real-time Information**: Access to current web data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Initialize the knowledge agent (uses ADK with GoogleSearchTool internally)\nagent = KnowledgeGroundedAgent()\n\nconsole.print(\n Panel(\n f\"[green]โ[/green] Knowledge Agent initialized\\n[cyan]Model:[/cyan] {agent.model}\",\n title=\"๐ง Setup Complete\",\n border_style=\"green\",\n )\n)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Making Your First Grounded Query"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Ask a question that requires current information\n",
- "query = \"What is the current population of Tokyo?\"\n",
- "\n",
- "console.print(f\"\\n[cyan]๐ Query:[/cyan] {query}\\n\")\n",
- "\n",
- "console.print(\"[dim]Searching...[/dim]\")\n",
- "response = await agent.answer_async(query)\n",
- "\n",
- "display_response(response, console=console, title=\"Tokyo Population\")\n",
- "\n",
- "# Show the tool calls made by the agent\n",
- "if response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls Made:[/bold cyan]\")\n",
- " for tc in response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Display sources in a detailed table format\n",
- "display_source_table(response, console=console)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Comparing Grounded vs Non-Grounded Responses\n",
- "\n",
- "This is where grounding truly shines. We'll ask about Toronto's record single-day snowfall.\n",
- "\n",
- "**Why this example works:**\n",
- "- The record was set on **January 25, 2026** - after the model's training data cutoff\n",
- "- Without grounding, the model can only guess based on historical data it was trained on\n",
- "- With grounding, the model searches the web and finds the recent news about this event\n",
- "\n",
- "This clearly demonstrates that grounding enables access to information the model couldn't possibly know from training alone."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "config = KnowledgeAgentConfig()\nclient = genai.Client(api_key=config.gemini_api_key)\n\n# This question requires very recent information (Jan 2026)\n# The non-grounded model will fail since its training data doesn't include this event\nquestion = \"Which day had the highest recorded snowfall in a single day in Toronto?\"\nexpected_answer = \"January 25, 2026\"\n\nconsole.print(f\"\\n[bold]Question:[/bold] {question}\")\nconsole.print(f\"[dim]Expected Answer: {expected_answer}[/dim]\\n\")\n\n# Without grounding - model relies on training data (cutoff before Jan 2026)\nconsole.print(\"[dim]Generating without grounding...[/dim]\")\nresponse_no_grounding = client.models.generate_content(\n model=config.default_worker_model,\n contents=question,\n)\n\n# With grounding - agent uses Google Search tool\nconsole.print(\"[dim]Generating with grounding (ADK agent)...[/dim]\")\nresponse_grounded = await agent.answer_async(question)\n\n# Side-by-side comparison using our display utility\ndisplay_comparison(response_no_grounding.text, response_grounded, console=console)\n\n# Show tool calls from the grounded response\nif response_grounded.tool_calls:\n console.print(\"\\n[bold cyan]๐ง Tool Calls (Grounded):[/bold cyan]\")\n for tc in response_grounded.tool_calls:\n console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")\n\n# Check if the grounded response contains the correct answer\nif expected_answer.lower() in response_grounded.text.lower() or \"january 25\" in response_grounded.text.lower():\n console.print(\"\\n[green]โ Grounded response contains the correct answer![/green]\")\nelse:\n console.print(\"\\n[yellow]โ Check the grounded response for accuracy[/yellow]\")"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 4. Exercise: Try Your Own Queries\n",
- "\n",
- "Try asking questions that:\n",
- "- Require recent information (news, events, statistics)\n",
- "- Need multiple facts combined\n",
- "- Are about specific domains (sports, science, politics)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Try your own query\n",
- "my_query = \"What are the latest developments in fusion energy?\"\n",
- "\n",
- "console.print(f\"[bold cyan]๐ Query:[/bold cyan] {my_query}\\n\")\n",
- "\n",
- "console.print(\"[dim]Searching the web...[/dim]\")\n",
- "my_response = await agent.answer_async(my_query)\n",
- "\n",
- "display_response(my_response, console=console, title=\"Fusion Energy Developments\")\n",
- "\n",
- "# Show the tool calls\n",
- "if my_response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls:[/bold cyan]\")\n",
- " for tc in my_response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "In this notebook, you learned:\n",
- "\n",
- "1. How Google Search grounding works with ADK's `GoogleSearchTool`\n",
- "2. How to use the `KnowledgeGroundedAgent` for grounded queries\n",
- "3. How to see explicit tool calls in the agent's response\n",
- "4. The difference between grounded and non-grounded responses\n",
- "\n",
- "**Next**: In the next notebook, we'll explore the agent's system instructions and the evaluation dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "console.print(\n",
- " Panel(\n",
- " \"[green]โ[/green] Notebook complete!\\n\\n\"\n",
- " \"[cyan]Next:[/cyan] Open [bold]02_agent_basics.ipynb[/bold] to learn about the Knowledge Agent.\",\n",
- " title=\"๐ Done\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/implementations/knowledge_qa/02_agent_basics.ipynb b/implementations/knowledge_qa/02_agent_basics.ipynb
deleted file mode 100644
index 7f9d088..0000000
--- a/implementations/knowledge_qa/02_agent_basics.ipynb
+++ /dev/null
@@ -1,323 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 02: Knowledge-Grounded Agent Basics\n",
- "\n",
- "This notebook introduces the `KnowledgeGroundedAgent` class, which wraps Gemini\n",
- "with Google Search grounding into a full-featured QA agent.\n",
- "\n",
- "## Learning Objectives\n",
- "\n",
- "- Create and configure a `KnowledgeGroundedAgent`\n",
- "- Understand the agent's system instructions\n",
- "- Use the agent for single-turn Q&A\n",
- "- Explore the DeepSearchQA evaluation dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_example,\n display_info,\n display_response,\n display_success,\n)\nfrom aieng.agent_evals.knowledge_qa import (\n DeepSearchQADataset,\n KnowledgeAgentManager,\n KnowledgeGroundedAgent,\n)\nfrom aieng.agent_evals.knowledge_qa.agent import SYSTEM_INSTRUCTIONS\nfrom dotenv import load_dotenv\nfrom rich.markdown import Markdown\nfrom rich.panel import Panel\nfrom rich.table import Table\n\n\nconsole = create_console()\nload_dotenv(verbose=True)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Creating a Knowledge Agent"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Create the agent\nagent = KnowledgeGroundedAgent()\n\n# Display configuration\nconfig_table = Table(title=\"๐ค Agent Configuration\", show_header=True, header_style=\"bold cyan\")\nconfig_table.add_column(\"Setting\", style=\"cyan\")\nconfig_table.add_column(\"Value\", style=\"white\")\nconfig_table.add_row(\"Model\", agent.model)\nconfig_table.add_row(\"Planner Model\", agent.config.default_planner_model)\nconfig_table.add_row(\"Worker Model\", agent.config.default_worker_model)\n\nconsole.print(config_table)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Understanding System Instructions\n",
- "\n",
- "The agent uses carefully crafted system instructions to guide its behavior."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "console.print(\n Panel(\n Markdown(SYSTEM_INSTRUCTIONS),\n title=\"๐ System Instructions\",\n border_style=\"blue\",\n subtitle=\"[dim]Guides agent behavior[/dim]\",\n )\n)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Single-Turn Q&A\n",
- "\n",
- "Let's use the agent to answer some questions."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Example 1: Current events\n",
- "question = \"What are the most significant AI developments in January 2026?\"\n",
- "\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[bold green]{question}[/bold green]\",\n",
- " title=\"โ Question\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")\n",
- "\n",
- "console.print(\"[dim]๐ Agent is researching...[/dim]\")\n",
- "response = await agent.answer_async(question)\n",
- "\n",
- "display_response(response, console=console, title=\"AI Developments January 2026\")\n",
- "\n",
- "# Show tool calls made during reasoning\n",
- "if response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls (ReAct Trace):[/bold cyan]\")\n",
- " for tc in response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Example 2: Factual question\n",
- "question = \"What countries have successfully landed spacecraft on the Moon?\"\n",
- "\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[bold green]{question}[/bold green]\",\n",
- " title=\"โ Question\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")\n",
- "\n",
- "console.print(\"[dim]๐ Agent is researching...[/dim]\")\n",
- "response = await agent.answer_async(question)\n",
- "\n",
- "display_response(response, console=console, title=\"Moon Landing Countries\")\n",
- "\n",
- "# Show tool calls\n",
- "if response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls:[/bold cyan]\")\n",
- " for tc in response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## 4. Using the KnowledgeAgentManager\n\nFor applications that need to manage agent lifecycle, use `KnowledgeAgentManager`."
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Create a manager (lazy initialization)\nmanager = KnowledgeAgentManager()\n\ndisplay_info(f\"Initialized: {manager.is_initialized()}\", console=console)\n\n# Access the agent (triggers initialization)\nmanaged_agent = manager.agent\ndisplay_info(f\"After access: {manager.is_initialized()}\", console=console)\n\n# Use the agent\nconsole.print(\"[dim]Querying...[/dim]\")\nresponse = await managed_agent.answer_async(\"What is the speed of light?\")\n\ndisplay_response(response, console=console, title=\"Quick Answer\", show_queries=False)\n\n# Cleanup\nmanager.close()\ndisplay_success(\"Manager closed\", console=console)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 5. Exploring the DeepSearchQA Dataset\n",
- "\n",
- "The DeepSearchQA dataset contains 896 research questions for evaluating knowledge agents."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Load the dataset\nwith console.status(\"[cyan]Loading DeepSearchQA dataset...[/cyan]\", spinner=\"dots\"):\n dataset = DeepSearchQADataset()\n\n# Display dataset info\ninfo_table = Table(title=\"๐ DeepSearchQA Dataset\", show_header=True, header_style=\"bold cyan\")\ninfo_table.add_column(\"Metric\", style=\"cyan\")\ninfo_table.add_column(\"Value\", style=\"white\")\ninfo_table.add_row(\"Total Examples\", str(len(dataset)))\ninfo_table.add_row(\"Categories\", str(len(dataset.get_categories())))\n\nconsole.print(info_table)"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Display categories\n",
- "categories = dataset.get_categories()\n",
- "\n",
- "cat_table = Table(title=\"๐ Problem Categories\", show_header=True, header_style=\"bold green\")\n",
- "cat_table.add_column(\"Category\", style=\"white\")\n",
- "cat_table.add_column(\"Count\", style=\"cyan\", justify=\"right\")\n",
- "\n",
- "for cat in sorted(categories):\n",
- " count = len(dataset.get_by_category(cat))\n",
- " cat_table.add_row(cat, str(count))\n",
- "\n",
- "console.print(cat_table)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Show a sample example using the shared display utility\n",
- "example = dataset[0]\n",
- "\n",
- "display_example(\n",
- " example_id=example.example_id,\n",
- " problem=example.problem,\n",
- " category=example.problem_category,\n",
- " answer=example.answer,\n",
- " answer_type=example.answer_type,\n",
- " console=console,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get random samples\n",
- "samples = dataset.sample(n=3, random_state=42)\n",
- "\n",
- "console.print(\"[bold]๐ Random Samples from Dataset[/bold]\\n\")\n",
- "\n",
- "for ex in samples:\n",
- " display_example(\n",
- " example_id=ex.example_id,\n",
- " problem=ex.problem[:300] + \"...\" if len(ex.problem) > 300 else ex.problem,\n",
- " category=ex.problem_category,\n",
- " answer=ex.answer,\n",
- " console=console,\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 6. Testing the Agent on DeepSearchQA\n",
- "\n",
- "Let's test the agent on a sample question from the dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Pick an example\n",
- "test_example = samples[0]\n",
- "\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[bold]Testing on Example {test_example.example_id}[/bold]\\n\\n\"\n",
- " f\"[cyan]Category:[/cyan] {test_example.problem_category}\\n\"\n",
- " f\"[cyan]Expected Answer:[/cyan] {test_example.answer}\",\n",
- " title=\"๐งช Test Setup\",\n",
- " border_style=\"yellow\",\n",
- " )\n",
- ")\n",
- "\n",
- "# Ask the agent\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[bold green]{test_example.problem}[/bold green]\",\n",
- " title=\"โ Question\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")\n",
- "\n",
- "console.print(\"[dim]๐ Agent is researching...[/dim]\")\n",
- "response = await agent.answer_async(test_example.problem)\n",
- "\n",
- "display_response(response, console=console, title=\"Agent Response\")\n",
- "\n",
- "# Show tool calls\n",
- "if response.tool_calls:\n",
- " console.print(\"\\n[bold cyan]๐ง Tool Calls:[/bold cyan]\")\n",
- " for tc in response.tool_calls:\n",
- " console.print(f\" โข {tc.get('name', 'unknown')}: {tc.get('args', {})}\")\n",
- "\n",
- "# Compare\n",
- "contains_answer = test_example.answer.lower() in response.text.lower()\n",
- "if contains_answer:\n",
- " console.print(\"\\n[green]โ CONTAINS EXPECTED ANSWER[/green]\")\n",
- "else:\n",
- " console.print(\"\\n[yellow]โ Answer may differ[/yellow]\")\n",
- "console.print(f\"[dim]Expected: {test_example.answer}[/dim]\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "In this notebook, you learned:\n",
- "\n",
- "1. How to create and configure a `KnowledgeGroundedAgent`\n",
- "2. The system instructions that guide agent behavior\n",
- "3. How to use the agent for single-turn Q&A\n",
- "4. How to explore the DeepSearchQA evaluation dataset\n",
- "\n",
- "**Next**: In the next notebook, we'll explore multi-turn conversations and run systematic evaluations."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "console.print(\n",
- " Panel(\n",
- " \"[green]โ[/green] Notebook complete!\\n\\n\"\n",
- " \"[cyan]Next:[/cyan] Open [bold]03_multi_turn.ipynb[/bold] to learn about multi-turn conversations and evaluation.\",\n",
- " title=\"๐ Done\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/implementations/knowledge_qa/02_running_the_agent.ipynb b/implementations/knowledge_qa/02_running_the_agent.ipynb
new file mode 100644
index 0000000..834080a
--- /dev/null
+++ b/implementations/knowledge_qa/02_running_the_agent.ipynb
@@ -0,0 +1,426 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "intro",
+ "metadata": {},
+ "source": [
+ "# 02: Running the Agent\n",
+ "\n",
+ "In Notebook 01 we explored the dataset and tools. This notebook shows how to run the\n",
+ "`KnowledgeGroundedAgent` in practice.\n",
+ "\n",
+ "## What You'll Learn\n",
+ "\n",
+ "1. The agent's PlanReAct architecture and the `AgentResponse` data structure\n",
+ "2. Running a question with live progress display\n",
+ "3. Inspecting the response: plan, tool calls, sources, and reasoning\n",
+ "4. Multi-turn conversations using session state\n",
+ "5. Observability with Langfuse tracing\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "Complete Notebook 01. You'll need `GOOGLE_API_KEY` in your `.env` file.\n",
+ "For tracing (Section 4): `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are also required."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\nimport uuid\nfrom pathlib import Path\n\nfrom aieng.agent_evals.knowledge_qa import KnowledgeGroundedAgent\nfrom aieng.agent_evals.knowledge_qa.notebook import display_response, run_with_display\nfrom aieng.agent_evals.langfuse import init_tracing\nfrom dotenv import load_dotenv\nfrom rich.console import Console\nfrom rich.panel import Panel\nfrom rich.table import Table\n\n\nif Path(\"\").absolute().name == \"eval-agents\":\n print(f\"Working directory: {Path('').absolute()}\")\nelse:\n os.chdir(Path(\"\").absolute().parent.parent)\n print(f\"Working directory set to: {Path('').absolute()}\")\n\nload_dotenv(verbose=True)\nconsole = Console(width=100)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-arch",
+ "metadata": {},
+ "source": [
+ "## 1. Agent Architecture\n",
+ "\n",
+ "The `KnowledgeGroundedAgent` is built on Google ADK and combines two patterns:\n",
+ "\n",
+ "**PlanReAct** โ Before executing, the agent produces an explicit research plan with numbered\n",
+ "steps. Each step has a type (`SEARCH`, `FETCH`, `ANALYZE`) and a status that transitions from\n",
+ "`pending` โ `in_progress` โ `completed` (or `failed`/`skipped`). The plan can be revised\n",
+ "mid-run if the agent encounters unexpected results.\n",
+ "\n",
+ "**ReAct loop** โ Within each step, the agent alternates between reasoning (Thought), acting\n",
+ "(tool call), and observing (tool response).\n",
+ "\n",
+ "### The `AgentResponse` Object\n",
+ "\n",
+ "After running, `agent.answer_async(question)` returns an `AgentResponse`:\n",
+ "\n",
+ "| Field | Type | Description |\n",
+ "|-------|------|-------------|\n",
+ "| `text` | `str` | The final answer |\n",
+ "| `plan` | `ResearchPlan` | Numbered steps with statuses |\n",
+ "| `tool_calls` | `list[dict]` | Every tool invocation during execution |\n",
+ "| `sources` | `list[GroundingChunk]` | URLs used as evidence |\n",
+ "| `reasoning_chain` | `list[str]` | The model's intermediate reasoning |\n",
+ "| `total_duration_ms` | `int` | Wall-clock execution time |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "create-agent",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agent = KnowledgeGroundedAgent(enable_planning=True)\n",
+ "\n",
+ "config_table = Table(title=\"Agent Configuration\", show_header=False)\n",
+ "config_table.add_column(\"Setting\", style=\"cyan\")\n",
+ "config_table.add_column(\"Value\", style=\"white\")\n",
+ "config_table.add_row(\"Model\", agent.model)\n",
+ "config_table.add_row(\"Planning\", \"PlanReAct (enabled)\")\n",
+ "config_table.add_row(\"Session Service\", \"InMemorySessionService\")\n",
+ "config_table.add_row(\"Tools\", \"google_search, web_fetch, fetch_file, grep_file, read_file\")\n",
+ "\n",
+ "console.print(config_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s2-running",
+ "metadata": {},
+ "source": [
+ "## 2. Running a Question\n",
+ "\n",
+ "`run_with_display` executes the agent in a Jupyter notebook with a live progress display showing:\n",
+ "\n",
+ "- The research plan with step statuses (updating in real time)\n",
+ "- Tool calls as they fire\n",
+ "\n",
+ "We'll use a question that requires web search โ the agent must find and verify a specific fact,\n",
+ "not recall it from training data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "run-agent",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "question = \"When was the highest single-day snowfall recorded in Toronto, and how much snow fell?\"\n",
+ "\n",
+ "response = await run_with_display(agent, question)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-answer",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_response(\n",
+ " console,\n",
+ " response.text,\n",
+ " subtitle=(\n",
+ " f\"Duration: {response.total_duration_ms / 1000:.1f}s | \"\n",
+ " f\"Tool calls: {len(response.tool_calls)} | \"\n",
+ " f\"Sources: {len(response.sources)}\"\n",
+ " ),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s2-inspect",
+ "metadata": {},
+ "source": [
+ "### 2.1 Inspecting the Response\n",
+ "\n",
+ "The `AgentResponse` object contains the full execution trace."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-plan",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plan = response.plan\n\nplan_table = Table(title=\"Research Plan\")\nplan_table.add_column(\"#\", style=\"cyan\", width=3)\nplan_table.add_column(\"Step\", style=\"white\")\nplan_table.add_column(\"Type\", style=\"dim\", width=12)\nplan_table.add_column(\"Status\", style=\"green\")\n\nfor step in plan.steps:\n icon = {\"completed\": \"โ\", \"failed\": \"โ\", \"skipped\": \"โ\"}.get(step.status.value, \"ยท\")\n desc = step.description[:70] + \"...\" if len(step.description) > 70 else step.description\n plan_table.add_row(str(step.step_id), desc, step.step_type, f\"{icon} {step.status.value}\")\n\nconsole.print(plan_table)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-tools",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if response.tool_calls:\n",
+ " tools_table = Table(title=\"Tool Calls\")\n",
+ " tools_table.add_column(\"#\", style=\"dim\", width=3)\n",
+ " tools_table.add_column(\"Tool\", style=\"cyan\", width=16)\n",
+ " tools_table.add_column(\"Arguments (truncated)\", style=\"white\")\n",
+ "\n",
+ " for i, tc in enumerate(response.tool_calls[:15], 1):\n",
+ " name = tc.get(\"name\", \"unknown\")\n",
+ " args = str(tc.get(\"args\", {}))\n",
+ " args = args[:70] + \"...\" if len(args) > 70 else args\n",
+ " tools_table.add_row(str(i), name, args)\n",
+ "\n",
+ " if len(response.tool_calls) > 15:\n",
+ " tools_table.add_row(\"...\", f\"({len(response.tool_calls) - 15} more)\", \"\")\n",
+ "\n",
+ " console.print(tools_table)\n",
+ "else:\n",
+ " console.print(\"[dim]No tool calls recorded[/dim]\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "show-sources",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if response.sources:\n",
+ " seen: set[str] = set()\n",
+ " sources_table = Table(title=\"Sources\")\n",
+ " sources_table.add_column(\"#\", style=\"dim\", width=3)\n",
+ " sources_table.add_column(\"URL\", style=\"blue\")\n",
+ "\n",
+ " for src in response.sources:\n",
+ " if src.uri and src.uri not in seen:\n",
+ " seen.add(src.uri)\n",
+ " url = src.uri[:85] + \"...\" if len(src.uri) > 85 else src.uri\n",
+ " sources_table.add_row(str(len(seen)), url)\n",
+ " if len(seen) >= 10:\n",
+ " break\n",
+ "\n",
+ " console.print(sources_table)\n",
+ "else:\n",
+ " console.print(\"[dim]No sources recorded[/dim]\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s3-multiturn",
+ "metadata": {},
+ "source": [
+ "## 3. Multi-Turn Conversations\n",
+ "\n",
+ "The agent uses an `InMemorySessionService` to maintain conversation context across turns.\n",
+ "Pass the same `session_id` to link questions together โ the agent will use prior context\n",
+ "when answering follow-up questions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "multiturn",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "session_id = str(uuid.uuid4())\n",
+ "console.print(f\"Session ID: [dim]{session_id}[/dim]\\n\")\n",
+ "\n",
+ "# First turn: establish a subject\n",
+ "response1 = await agent.answer_async(\n",
+ " \"What is the capital of France?\",\n",
+ " session_id=session_id,\n",
+ ")\n",
+ "display_response(console, response1.text, title=\"Turn 1\")\n",
+ "\n",
+ "# Second turn: follow-up that references the prior context\n",
+ "response2 = await agent.answer_async(\n",
+ " \"What is the official language spoken there?\",\n",
+ " session_id=session_id,\n",
+ ")\n",
+ "display_response(console, response2.text, title=\"Turn 2 (follow-up)\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s4-tracing",
+ "metadata": {},
+ "source": [
+ "## 4. Observability with Langfuse\n",
+ "\n",
+ "Langfuse captures a full trace of every agent run using OpenTelemetry, giving you visibility into:\n",
+ "\n",
+ "- Every tool call and its arguments\n",
+ "- Every LLM call with prompts and completions\n",
+ "- Timing for each span\n",
+ "- The full agent execution tree\n",
+ "\n",
+ "This is essential for debugging failures, measuring latency, and comparing configurations.\n",
+ "\n",
+ "### Trace Structure\n",
+ "\n",
+ "```\n",
+ "Trace: agent run\n",
+ "โโโ Span: planning (PlanReAct)\n",
+ "โ โโโ LLM Call: create_plan\n",
+ "โโโ Span: step-1-execution\n",
+ "โ โโโ Tool Call: google_search\n",
+ "โ โโโ Tool Call: web_fetch\n",
+ "โ โโโ LLM Call: step_summary\n",
+ "โโโ Span: step-2-execution\n",
+ "โ โโโ ...\n",
+ "โโโ Span: synthesis\n",
+ " โโโ LLM Call: final_answer\n",
+ "```\n",
+ "\n",
+ "### Prerequisites\n",
+ "\n",
+ "Set these in your `.env` file:\n",
+ "- `LANGFUSE_PUBLIC_KEY`\n",
+ "- `LANGFUSE_SECRET_KEY`\n",
+ "- `LANGFUSE_HOST` (optional, defaults to `https://cloud.langfuse.com`)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "check-creds",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "langfuse_configured = all(\n",
+ " [\n",
+ " os.getenv(\"LANGFUSE_PUBLIC_KEY\"),\n",
+ " os.getenv(\"LANGFUSE_SECRET_KEY\"),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "if langfuse_configured:\n",
+ " console.print(\"[green]โ[/green] Langfuse credentials found\")\n",
+ "else:\n",
+ " console.print(\"[yellow]โ [/yellow] Langfuse credentials not found โ tracing cells will be skipped\")\n",
+ " console.print(\"[dim]Set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY in .env[/dim]\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "init-tracing",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tracing_enabled = init_tracing()\n",
+ "\n",
+ "if tracing_enabled:\n",
+ " console.print(\"[green]โ[/green] Langfuse tracing initialized\")\n",
+ "else:\n",
+ " console.print(\"[yellow]โ [/yellow] Tracing not enabled (check credentials)\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "run-traced",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if tracing_enabled:\n",
+ " from langfuse import Langfuse\n",
+ "\n",
+ " langfuse = Langfuse()\n",
+ " traced_agent = KnowledgeGroundedAgent(enable_planning=True)\n",
+ " traced_question = \"What programming language was created by Guido van Rossum, and in what year?\"\n",
+ "\n",
+ " console.print(Panel(traced_question, title=\"Traced Question\", border_style=\"green\"))\n",
+ "\n",
+ " with langfuse.start_as_current_span(name=\"knowledge-agent\", input=traced_question):\n",
+ " trace_id = langfuse.get_current_trace_id()\n",
+ " traced_response = await traced_agent.answer_async(traced_question)\n",
+ " langfuse.update_current_span(output=traced_response.text)\n",
+ "\n",
+ " display_response(\n",
+ " console,\n",
+ " traced_response.text,\n",
+ " subtitle=f\"Duration: {traced_response.total_duration_ms / 1000:.1f}s\",\n",
+ " )\n",
+ "else:\n",
+ " console.print(\"[dim]Skipping (Langfuse not configured)[/dim]\")\n",
+ " trace_id = None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "flush-traces",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if tracing_enabled:\n from IPython.display import HTML, display # noqa: A004\n from langfuse import Langfuse\n from opentelemetry import trace as otel_trace\n\n provider = otel_trace.get_tracer_provider()\n if hasattr(provider, \"force_flush\"):\n provider.force_flush(timeout_millis=5000)\n console.print(\"[green]โ[/green] Traces flushed to Langfuse\")\n\n if trace_id:\n trace_url = Langfuse().get_trace_url(trace_id=trace_id)\n display(HTML(f'View trace: {trace_url}
'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s4-ui",
+ "metadata": {},
+ "source": [
+ "### 4.1 Viewing Traces in the Langfuse UI\n\nOpen your Langfuse project and navigate to **Traces**. Each run appears as a\ntree of spans. Useful things to look at:\n\n- **Span timeline** โ which steps take the most time?\n- **Tool call arguments** โ what search queries did the agent use?\n- **LLM interactions** โ what did the model reason about before calling each tool?\n- **Errors** โ red spans show where failures occurred\n\nYou can also filter by trace name, time range, or input content."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "summary",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "In this notebook you learned:\n",
+ "\n",
+ "1. **Creating the agent** โ `KnowledgeGroundedAgent(enable_planning=True)` with PlanReAct\n",
+ "2. **Running questions** โ `run_with_display` for live notebook progress; `agent.answer_async` for raw access\n",
+ "3. **The `AgentResponse`** โ plan, tool calls, sources, reasoning, and timing in one object\n",
+ "4. **Multi-turn conversations** โ linking turns with `session_id`\n",
+ "5. **Langfuse tracing** โ `init_tracing()` and the Langfuse SDK for full observability\n",
+ "\n",
+ "**Next:** In Notebook 03, we'll run a systematic evaluation using the DeepSearchQA benchmark."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "done",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "console.print(\n",
+ " Panel(\n",
+ " \"[green]โ[/green] Notebook complete!\\n\\n\"\n",
+ " \"[cyan]Next:[/cyan] Open [bold]03_evaluation.ipynb[/bold] to evaluate the agent at scale.\",\n",
+ " title=\"Done\",\n",
+ " border_style=\"green\",\n",
+ " )\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/implementations/knowledge_qa/03_evaluation.ipynb b/implementations/knowledge_qa/03_evaluation.ipynb
new file mode 100644
index 0000000..c8d6bd3
--- /dev/null
+++ b/implementations/knowledge_qa/03_evaluation.ipynb
@@ -0,0 +1,482 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "intro",
+ "metadata": {},
+ "source": [
+ "# 03: Evaluation\n",
+ "\n",
+ "In Notebook 02 we ran individual questions by hand. This notebook evaluates the agent\n",
+ "systematically: we upload a dataset subset to Langfuse, run the agent on every item, and\n",
+ "score each response with an LLM-as-judge grader using the official DeepSearchQA methodology.\n",
+ "\n",
+ "## What You'll Learn\n",
+ "\n",
+ "1. Uploading a DeepSearchQA subset to Langfuse as a persistent dataset\n",
+ "2. The LLM-as-judge grader: precision, recall, F1, and the four outcome categories\n",
+ "3. A single-sample evaluation walkthrough\n",
+ "4. Running the full experiment with `run_experiment`\n",
+ "5. Inspecting and interpreting item-level results\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "Complete Notebooks 01 and 02. You'll need all credentials in `.env`:\n",
+ "- `GOOGLE_API_KEY`\n",
+ "- `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`\n",
+ "- `OPENAI_API_KEY` (for the LLM grader)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import os\n",
+ "import tempfile\n",
+ "from pathlib import Path\n",
+ "from typing import Any\n",
+ "\n",
+ "import pandas as pd\n",
+ "from aieng.agent_evals.evaluation import run_experiment\n",
+ "from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig\n",
+ "from aieng.agent_evals.knowledge_qa import DeepSearchQADataset, KnowledgeGroundedAgent\n",
+ "from aieng.agent_evals.knowledge_qa.deepsearchqa_grader import (\n",
+ " EvaluationOutcome,\n",
+ " evaluate_deepsearchqa_async,\n",
+ ")\n",
+ "from aieng.agent_evals.knowledge_qa.notebook import display_response, run_with_display\n",
+ "from aieng.agent_evals.langfuse import upload_dataset_to_langfuse\n",
+ "from dotenv import load_dotenv\n",
+ "from IPython.display import HTML, display # noqa: A004\n",
+ "from langfuse.experiment import Evaluation\n",
+ "from rich.console import Console\n",
+ "from rich.panel import Panel\n",
+ "from rich.table import Table\n",
+ "\n",
+ "\n",
+ "if Path(\"\").absolute().name == \"eval-agents\":\n",
+ " print(f\"Working directory: {Path('').absolute()}\")\n",
+ "else:\n",
+ " os.chdir(Path(\"\").absolute().parent.parent)\n",
+ " print(f\"Working directory set to: {Path('').absolute()}\")\n",
+ "\n",
+ "load_dotenv(verbose=True)\n",
+ "console = Console(width=100)\n",
+ "\n",
+ "DATASET_NAME = \"DeepSearchQA-Subset\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s1-upload-intro",
+ "metadata": {},
+ "source": [
+ "## 1. Uploading the Dataset to Langfuse\n",
+ "\n",
+ "Langfuse stores our evaluation dataset so we can run multiple experiments against the same items\n",
+ "and compare results over time. Each dataset item has three fields:\n",
+ "\n",
+ "- **`input`**: the question (sent to the agent)\n",
+ "- **`expected_output`**: the ground truth answer (given to the grader, never shown to the agent)\n",
+ "- **`metadata`**: `category`, `answer_type`, `example_id`\n",
+ "\n",
+ "Items are deduplicated by a hash of their content, so running this cell again is safe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "upload",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = DeepSearchQADataset()\n",
+ "examples = dataset.get_by_category(\"Finance & Economics\")[:1]\n",
+ "\n",
+ "console.print(f\"Uploading [cyan]{len(examples)}[/cyan] examples to dataset '{DATASET_NAME}'...\")\n",
+ "\n",
+ "# Write examples to a temporary JSONL file for the upload utility\n",
+ "with tempfile.NamedTemporaryFile(mode=\"w\", suffix=\".jsonl\", delete=False, encoding=\"utf-8\") as f:\n",
+ " for ex in examples:\n",
+ " record = {\n",
+ " \"input\": ex.problem,\n",
+ " \"expected_output\": ex.answer,\n",
+ " \"metadata\": {\n",
+ " \"example_id\": ex.example_id,\n",
+ " \"category\": ex.problem_category,\n",
+ " \"answer_type\": ex.answer_type,\n",
+ " },\n",
+ " }\n",
+ " f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n",
+ " temp_path = f.name\n",
+ "\n",
+ "await upload_dataset_to_langfuse(dataset_path=temp_path, dataset_name=DATASET_NAME)\n",
+ "os.unlink(temp_path)\n",
+ "\n",
+ "console.print(f\"[green]โ[/green] Dataset '{DATASET_NAME}' ready in Langfuse\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s2-grader-intro",
+ "metadata": {},
+ "source": [
+ "## 2. The DeepSearchQA Grader\n",
+ "\n",
+ "The grader is an LLM-as-judge that evaluates answers using the official DeepSearchQA methodology\n",
+ "from Appendix A of the paper. It handles both answer types:\n",
+ "\n",
+ "- **Single Answer**: checks whether the response contains the one expected value\n",
+ "- **Set Answer**: checks which items from the ground truth set appear in the response,\n",
+ " and flags any extra items the agent included\n",
+ "\n",
+ "### Metrics\n",
+ "\n",
+ "Let **S** = predicted items, **G** = ground truth items:\n",
+ "\n",
+ "| Metric | Formula | Meaning |\n",
+ "|--------|---------|---------|\n",
+ "| **Precision** | \\|SโฉG\\| / \\|S\\| | Of what the agent said, how much was correct |\n",
+ "| **Recall** | \\|SโฉG\\| / \\|G\\| | Of the ground truth, how much did the agent find |\n",
+ "| **F1** | 2ยทPยทR / (P+R) | Harmonic mean of precision and recall |\n",
+ "\n",
+ "### Outcome Classification\n",
+ "\n",
+ "| Outcome | Condition | Interpretation |\n",
+ "|---------|-----------|----------------|\n",
+ "| `fully_correct` | S = G | Perfect answer |\n",
+ "| `correct_with_extraneous` | G โ S | All correct, but extra items included |\n",
+ "| `partially_correct` | SโฉG โ โ
| Some correct items found |\n",
+ "| `fully_incorrect` | SโฉG = โ
| No correct items |"
+ ]
+ },
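+ {
+ "cell_type": "markdown",
+ "id": "s2-worked-metrics",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of how these metrics and the outcome classification combine for a toy\n",
+ "Set Answer question. It assumes exact string matching between items; the real grader\n",
+ "delegates that matching to the LLM judge, so treat this as an illustration of the formulas\n",
+ "rather than the grader's implementation.\n",
+ "\n",
+ "```python\n",
+ "ground_truth = {\"France\", \"Germany\", \"Italy\"}  # G\n",
+ "predicted = {\"France\", \"Germany\", \"Spain\"}  # S\n",
+ "\n",
+ "# Precision, recall, and F1 as defined in the table above\n",
+ "true_positives = predicted & ground_truth\n",
+ "precision = len(true_positives) / len(predicted)  # 2/3\n",
+ "recall = len(true_positives) / len(ground_truth)  # 2/3\n",
+ "f1 = 2 * precision * recall / (precision + recall)  # 2/3\n",
+ "\n",
+ "# Outcome classification from the table above\n",
+ "if predicted == ground_truth:\n",
+ "    outcome = \"fully_correct\"\n",
+ "elif ground_truth < predicted:  # every ground-truth item found, plus extras\n",
+ "    outcome = \"correct_with_extraneous\"\n",
+ "elif true_positives:\n",
+ "    outcome = \"partially_correct\"\n",
+ "else:\n",
+ "    outcome = \"fully_incorrect\"\n",
+ "\n",
+ "print(precision, recall, f1, outcome)  # -> partially_correct for this toy example\n",
+ "```"
+ ]
+ },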
+ {
+ "cell_type": "markdown",
+ "id": "s2-single-sample",
+ "metadata": {},
+ "source": [
+ "### 2.1 Single-Sample Walkthrough\n",
+ "\n",
+ "Before running at scale, let's walk through one example end-to-end: run the agent,\n",
+ "then grade its response with the LLM judge."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "pick-example",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reproducibly select one Finance & Economics example\n",
+ "finance_examples = dataset.get_by_category(\"Finance & Economics\")\n",
+ "example = finance_examples[0]\n",
+ "\n",
+ "console.print(\n",
+ " Panel(\n",
+ " f\"[bold]ID:[/bold] {example.example_id}\\n\"\n",
+ " f\"[bold]Category:[/bold] {example.problem_category}\\n\"\n",
+ " f\"[bold]Answer Type:[/bold] {example.answer_type}\\n\\n\"\n",
+ " f\"[bold cyan]Question:[/bold cyan]\\n{example.problem}\\n\\n\"\n",
+ " f\"[bold yellow]Ground Truth:[/bold yellow]\\n{example.answer}\",\n",
+ " title=\"Evaluation Example\",\n",
+ " border_style=\"blue\",\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "run-agent",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "eval_agent = KnowledgeGroundedAgent(enable_planning=True)\n",
+ "eval_response = await run_with_display(eval_agent, example.problem)\n",
+ "\n",
+ "display_response(\n",
+ " console,\n",
+ " eval_response.text,\n",
+ " title=\"Agent Answer\",\n",
+ " subtitle=f\"Duration: {eval_response.total_duration_ms / 1000:.1f}s | Tools: {len(eval_response.tool_calls)}\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "grade-response",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "console.print(\"[dim]Grading with LLM judge...[/dim]\\n\")\n",
+ "\n",
+ "result = await evaluate_deepsearchqa_async(\n",
+ " question=example.problem,\n",
+ " answer=eval_response.text,\n",
+ " ground_truth=example.answer,\n",
+ " answer_type=example.answer_type,\n",
+ ")\n",
+ "\n",
+ "outcome_color = {\n",
+ " EvaluationOutcome.FULLY_CORRECT: \"green\",\n",
+ " EvaluationOutcome.CORRECT_WITH_EXTRANEOUS: \"yellow\",\n",
+ " EvaluationOutcome.PARTIALLY_CORRECT: \"orange1\",\n",
+ " EvaluationOutcome.FULLY_INCORRECT: \"red\",\n",
+ "}.get(result.outcome, \"white\")\n",
+ "\n",
+ "metrics_table = Table(title=\"Grader Results\")\n",
+ "metrics_table.add_column(\"Metric\", style=\"cyan\")\n",
+ "metrics_table.add_column(\"Value\", style=\"white\")\n",
+ "metrics_table.add_row(\"Outcome\", f\"[{outcome_color}]{result.outcome.value}[/{outcome_color}]\")\n",
+ "metrics_table.add_row(\"Precision\", f\"{result.precision:.3f}\")\n",
+ "metrics_table.add_row(\"Recall\", f\"{result.recall:.3f}\")\n",
+ "metrics_table.add_row(\"F1\", f\"[bold]{result.f1_score:.3f}[/bold]\")\n",
+ "console.print(metrics_table)\n",
+ "\n",
+ "if result.explanation:\n",
+ " console.print(Panel(result.explanation, title=\"Grader Explanation\", border_style=\"magenta\"))\n",
+ "\n",
+ "# Show per-item correctness for Set Answer questions\n",
+ "if result.correctness_details:\n",
+ " details_table = Table(title=\"Correctness Details\")\n",
+ " details_table.add_column(\"Expected Item\", style=\"white\")\n",
+ " details_table.add_column(\"Found\", style=\"cyan\", justify=\"center\")\n",
+ " for item, found in result.correctness_details.items():\n",
+ " icon = \"[green]โ[/green]\" if found else \"[red]โ[/red]\"\n",
+ " label = item[:60] + \"...\" if len(item) > 60 else item\n",
+ " details_table.add_row(label, icon)\n",
+ " console.print(details_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s3-experiment-intro",
+ "metadata": {},
+ "source": [
+ "## 3. Running the Evaluation Experiment\n",
+ "\n",
+ "`run_experiment` runs the agent against every item in the Langfuse dataset, scores each\n",
+ "response, and records results in Langfuse. Each call creates a new named experiment run\n",
+ "that you can compare to previous runs in the UI.\n",
+ "\n",
+ "The experiment takes two functions:\n",
+ "\n",
+ "- **`agent_task`** โ receives a dataset item, runs the agent, returns the answer string\n",
+ "- **`deepsearchqa_evaluator`** โ receives question, answer, and ground truth; returns grader scores\n",
+ "\n",
+ "> **Note:** This makes one agent call and one grader call per item. With 10 items and\n",
+ "> `max_concurrency=1`, expect 20โ40 minutes depending on model latency."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "define-task-evaluator",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "async def agent_task(*, item: Any, **kwargs: Any) -> str:\n",
+ " \"\"\"Run the Knowledge Agent on a Langfuse dataset item.\"\"\"\n",
+ " agent = KnowledgeGroundedAgent(enable_planning=True)\n",
+ " response = await agent.answer_async(item.input)\n",
+ " return response.text\n",
+ "\n",
+ "\n",
+ "async def deepsearchqa_evaluator(\n",
+ " *,\n",
+ " input: str, # noqa: A002\n",
+ " output: str,\n",
+ " expected_output: str,\n",
+ " metadata: dict[str, Any] | None = None,\n",
+ " **kwargs: Any,\n",
+ ") -> list[Evaluation]:\n",
+ " \"\"\"LLM-as-judge grader using DeepSearchQA methodology.\"\"\"\n",
+ " answer_type = (metadata or {}).get(\"answer_type\", \"Set Answer\")\n",
+ " result = await evaluate_deepsearchqa_async(\n",
+ " question=input,\n",
+ " answer=output,\n",
+ " ground_truth=expected_output,\n",
+ " answer_type=answer_type,\n",
+ " model_config=LLMRequestConfig(temperature=0.0),\n",
+ " )\n",
+ " return result.to_evaluations()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "run-experiment",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "experiment_result = run_experiment(\n",
+ " DATASET_NAME,\n",
+ " name=\"knowledge-agent-baseline\",\n",
+ " task=agent_task,\n",
+ " evaluators=[deepsearchqa_evaluator],\n",
+ " description=\"Baseline Knowledge Agent on Finance & Economics questions.\",\n",
+ " max_concurrency=1,\n",
+ ")\n",
+ "\n",
+ "console.print(\"[green]โ[/green] Experiment complete\")\n",
+ "if experiment_result.dataset_run_url:\n",
+ " display(\n",
+ " HTML(\n",
+ " f'View experiment: {experiment_result.dataset_run_url}
'\n",
+ " )\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s4-results-intro",
+ "metadata": {},
+ "source": [
+ "## 4. Inspecting Results\n",
+ "\n",
+ "The `ExperimentResult` object gives programmatic access to every item-level score.\n",
+ "Aggregate metrics are visible in the Langfuse experiment run summary in the UI."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "item-results",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rows = []\n",
+ "for item_result in experiment_result.item_results:\n",
+ " item = item_result.item\n",
+ " question = str(item.input)\n",
+ " row = {\n",
+ " \"question\": question[:55] + \"...\" if len(question) > 55 else question,\n",
+ " \"answer_type\": (item.metadata or {}).get(\"answer_type\", \"\"),\n",
+ " }\n",
+ " for evaluation in item_result.evaluations or []:\n",
+ " row[evaluation.name] = evaluation.value\n",
+ " rows.append(row)\n",
+ "\n",
+ "df = pd.DataFrame(rows)\n",
+ "print(df.to_string(index=False))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aggregate-scores",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Mean of numeric metrics\n",
+ "numeric_cols = [c for c in [\"F1\", \"Precision\", \"Recall\"] if c in df.columns]\n",
+ "if numeric_cols:\n",
+ " means_table = Table(title=\"Mean Scores\")\n",
+ " means_table.add_column(\"Metric\", style=\"cyan\")\n",
+ " means_table.add_column(\"Mean\", style=\"white\")\n",
+ " for col in numeric_cols:\n",
+ " means_table.add_row(col, f\"{df[col].mean():.3f}\")\n",
+ " console.print(means_table)\n",
+ "\n",
+ "# Outcome distribution\n",
+ "if \"Outcome\" in df.columns:\n",
+ " outcome_table = Table(title=\"Outcome Distribution\")\n",
+ " outcome_table.add_column(\"Outcome\", style=\"cyan\")\n",
+ " outcome_table.add_column(\"Count\", style=\"white\", justify=\"right\")\n",
+ " outcome_table.add_column(\"Fraction\", style=\"dim\", justify=\"right\")\n",
+ " total = len(df)\n",
+ " for outcome, count in df[\"Outcome\"].value_counts().items():\n",
+ " outcome_table.add_row(str(outcome), str(count), f\"{count / total:.0%}\")\n",
+ " console.print(outcome_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "s5-iteration",
+ "metadata": {},
+ "source": [
+ "## 5. Iterating on the Agent\n",
+ "\n",
+ "The dataset in Langfuse is persistent โ you don't need to re-upload it. To evaluate a modified\n",
+ "agent, call `run_experiment` again with a new `name` argument. Langfuse will create a new\n",
+ "experiment run and you can compare runs side-by-side in the UI.\n",
+ "\n",
+ "### Levers to Explore\n",
+ "\n",
+ "- **System prompt** โ edit `SYSTEM_INSTRUCTIONS_TEMPLATE` in `system_instructions.py` to change\n",
+ " the search strategy, verification rules, or final answer format\n",
+ "- **Planning** โ toggle `enable_planning=False` to skip PlanReAct and compare quality vs. speed\n",
+ "- **Model** โ change the Gemini model in `KnowledgeGroundedAgent` for different capability/cost trade-offs\n",
+ "- **Dataset** โ change the `category` filter in Section 1 or increase `samples` to cover more examples\n",
+ "\n",
+ "### What to Look for in Langfuse\n",
+ "\n",
+ "- Items with **low F1** โ did the agent fail to fetch the source? Stop early? Misread the question?\n",
+ "- Items with **`correct_with_extraneous`** โ is the agent over-generating? Can the prompt be tightened?\n",
+ "- **Latency outliers** โ which steps are slow? Is replanning happening unnecessarily?"
+ ]
+ },
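+ {
+ "cell_type": "markdown",
+ "id": "s5-rerun-sketch",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of such a follow-up run: the same dataset and grader, but with planning\n",
+ "disabled. It reuses `DATASET_NAME` and `deepsearchqa_evaluator` from Section 3; the task\n",
+ "function, run name, and description are illustrative placeholders, not part of the library API.\n",
+ "\n",
+ "```python\n",
+ "from typing import Any\n",
+ "\n",
+ "\n",
+ "async def no_planning_task(*, item: Any, **kwargs: Any) -> str:\n",
+ "    \"\"\"Same shape as agent_task above, but with PlanReAct disabled.\"\"\"\n",
+ "    agent = KnowledgeGroundedAgent(enable_planning=False)\n",
+ "    response = await agent.answer_async(item.input)\n",
+ "    return response.text\n",
+ "\n",
+ "\n",
+ "run_experiment(\n",
+ "    DATASET_NAME,\n",
+ "    name=\"knowledge-agent-no-planning\",  # any new name creates a separate experiment run\n",
+ "    task=no_planning_task,\n",
+ "    evaluators=[deepsearchqa_evaluator],\n",
+ "    description=\"Same dataset, planning disabled.\",\n",
+ "    max_concurrency=1,\n",
+ ")\n",
+ "```"
+ ]
+ },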
+ {
+ "cell_type": "markdown",
+ "id": "summary",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "In this notebook you:\n",
+ "\n",
+ "1. **Uploaded** a DeepSearchQA subset to Langfuse as a persistent, reusable dataset\n",
+ "2. **Understood** the LLM-as-judge grader: precision, recall, F1, and the four outcome categories\n",
+ "3. **Walked through** a single-sample evaluation end-to-end\n",
+ "4. **Ran** a full experiment with `run_experiment` and inspected item-level scores\n",
+ "5. **Learned** how to iterate: re-run with a new experiment name to compare configurations in Langfuse\n",
+ "\n",
+ "The evaluation pipeline is the foundation for systematic agent improvement โ each iteration\n",
+ "produces a new experiment run that you can compare to the baseline in the Langfuse UI."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "done",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "console.print(Panel(\"[green]โ[/green] Notebook complete!\", title=\"Done\", border_style=\"green\"))\n",
+ "if experiment_result.dataset_run_url:\n",
+ " display(\n",
+ " HTML(\n",
+ " f'View experiment results: {experiment_result.dataset_run_url}
'\n",
+ " )\n",
+ " )"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/implementations/knowledge_qa/03_multi_turn.ipynb b/implementations/knowledge_qa/03_multi_turn.ipynb
deleted file mode 100644
index a0e3909..0000000
--- a/implementations/knowledge_qa/03_multi_turn.ipynb
+++ /dev/null
@@ -1,312 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "# 03: Multi-Turn Conversations & Evaluation\n\nThis notebook demonstrates multi-turn conversation capabilities and\nhow to evaluate the agent on the DeepSearchQA benchmark.\n\n## Learning Objectives\n\n- Understand how ADK manages multi-turn conversations via sessions\n- Use the `DeepSearchQAEvaluator` for systematic evaluation\n- Analyze evaluation results with rich visualizations\n- Understand evaluation metrics for research agents"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Setup: Load environment and configure rich console\n",
- "import uuid\n",
- "\n",
- "from aieng.agent_evals import (\n",
- " create_console,\n",
- " display_evaluation_result,\n",
- " display_metrics_table,\n",
- " display_success,\n",
- ")\n",
- "from aieng.agent_evals.knowledge_qa import (\n",
- " DeepSearchQADataset,\n",
- " DeepSearchQAEvaluator,\n",
- " KnowledgeGroundedAgent,\n",
- ")\n",
- "from dotenv import load_dotenv\n",
- "from rich.panel import Panel\n",
- "from rich.table import Table\n",
- "\n",
- "\n",
- "console = create_console()\n",
- "load_dotenv(verbose=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## 1. Multi-Turn Conversations with ADK\n\nThe `KnowledgeGroundedAgent` uses Google ADK's built-in session management via `InMemorySessionService`.\nWhen you pass a `session_id` to `answer_async()`, ADK maintains conversation history automatically.\n\nKey points:\n- Each unique `session_id` creates a separate conversation thread\n- ADK tracks all messages, tool calls, and context within that session\n- No manual history tracking needed - ADK handles it internally"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create agent and demonstrate multi-turn conversation\n",
- "agent = KnowledgeGroundedAgent()\n",
- "\n",
- "# Create a session ID for multi-turn conversation\n",
- "session_id = str(uuid.uuid4())\n",
- "\n",
- "console.print(\n",
- " Panel(\n",
- " f\"[cyan]Session ID:[/cyan] {session_id}\\n\\nADK will track conversation history for this session automatically.\",\n",
- " title=\"๐จ๏ธ New Session Created\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# First turn - ask a question\n",
- "response1 = await agent.answer_async(\"What is the capital of France?\", session_id=session_id)\n",
- "console.print(Panel(response1.text, title=\"Turn 1: Capital of France\", border_style=\"blue\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Second turn - follow-up question (ADK remembers the context)\n",
- "response2 = await agent.answer_async(\"What is its population?\", session_id=session_id)\n",
- "console.print(Panel(response2.text, title=\"Turn 2: Population (follow-up)\", border_style=\"blue\"))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## 2. Session Management in Applications\n\nFor web applications (like Gradio), you can store a session ID in the app's state:\n\n```python\n# In a Gradio app handler:\nif \"session_id\" not in session_state:\n session_state[\"session_id\"] = str(uuid.uuid4())\n\nresponse = await agent.answer_async(query, session_id=session_state[\"session_id\"])\n```\n\nSee `gradio_app.py` for a complete example."
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# For more details on ADK sessions, see:\n",
- "# https://google.github.io/adk-docs/sessions/\n",
- "\n",
- "display_success(\"Multi-turn conversation demo complete!\", console=console)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Running DeepSearchQA Evaluation\n",
- "\n",
- "The `DeepSearchQAEvaluator` provides a systematic way to evaluate the agent."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create evaluator using the existing agent\n",
- "evaluator = DeepSearchQAEvaluator(agent)\n",
- "\n",
- "display_success(f\"Dataset size: {len(evaluator.dataset)} examples\", console=console)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Evaluate a small sample\n",
- "console.print(\"[bold]๐ฌ Running evaluation on 3 examples...[/bold]\\n\")\n",
- "\n",
- "console.print(\"[dim]Evaluating...[/dim]\")\n",
- "results = await evaluator.evaluate_sample_async(n=3, random_state=42)\n",
- "\n",
- "display_success(f\"Completed {len(results)} evaluations\", console=console)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# View results using the display utility\n",
- "console.print(\"\\n[bold]๐ Evaluation Results[/bold]\\n\")\n",
- "\n",
- "for result in results:\n",
- " contains_answer = result.ground_truth.lower() in result.prediction.lower()\n",
- " display_evaluation_result(\n",
- " example_id=result.example_id,\n",
- " problem=result.problem,\n",
- " ground_truth=result.ground_truth,\n",
- " prediction=result.prediction,\n",
- " sources_used=result.sources_used,\n",
- " search_queries=result.search_queries,\n",
- " contains_answer=contains_answer,\n",
- " console=console,\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 4. Analyzing Evaluation Results"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Convert to DataFrame for analysis\n",
- "df = evaluator.results_to_dataframe(results)\n",
- "\n",
- "# Calculate metrics\n",
- "containment_correct = sum(1 for r in results if r.ground_truth.lower() in r.prediction.lower())\n",
- "containment_accuracy = containment_correct / len(results) * 100\n",
- "\n",
- "metrics = {\n",
- " \"Total Examples\": len(results),\n",
- " \"Containment Accuracy\": f\"{containment_accuracy:.1f}%\",\n",
- " \"Avg Sources Used\": df[\"sources_used\"].mean(),\n",
- " \"Avg Search Queries\": df[\"search_queries\"].apply(len).mean(),\n",
- "}\n",
- "\n",
- "display_metrics_table(metrics, title=\"Evaluation Metrics\", console=console)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 5. Understanding Evaluation Metrics\n",
- "\n",
- "For research agents, we care about:\n",
- "\n",
- "1. **Answer Correctness**: Does the prediction match the ground truth?\n",
- "2. **Source Quality**: Are the sources relevant and authoritative?\n",
- "3. **Comprehensiveness**: Did the agent find all necessary information?\n",
- "4. **Search Efficiency**: How many searches were needed?\n",
- "\n",
- "DeepSearchQA specifically measures:\n",
- "- **Precision**: Quality of the answer\n",
- "- **Recall**: Completeness of the answer (for list-type questions)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Manual correctness check with better display\n",
- "def check_answer_contains_ground_truth(prediction: str, ground_truth: str) -> bool:\n",
- " \"\"\"Check if prediction contains the ground truth answer.\"\"\"\n",
- " return ground_truth.lower() in prediction.lower()\n",
- "\n",
- "\n",
- "# Check our results\n",
- "console.print(\"\\n[bold]๐ Correctness Check[/bold]\\n\")\n",
- "\n",
- "result_table = Table(show_header=True, header_style=\"bold cyan\")\n",
- "result_table.add_column(\"Example\", style=\"cyan\")\n",
- "result_table.add_column(\"Status\", style=\"white\")\n",
- "result_table.add_column(\"Expected\", style=\"dim\")\n",
- "\n",
- "for result in results:\n",
- " contains = check_answer_contains_ground_truth(result.prediction, result.ground_truth)\n",
- " status = \"[green]โ MATCH[/green]\" if contains else \"[yellow]โ NO MATCH[/yellow]\"\n",
- " result_table.add_row(\n",
- " str(result.example_id),\n",
- " status,\n",
- " result.ground_truth[:40] + \"...\" if len(result.ground_truth) > 40 else result.ground_truth,\n",
- " )\n",
- "\n",
- "console.print(result_table)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 6. Exploring Categories"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get examples from a specific category\n",
- "dataset = DeepSearchQADataset()\n",
- "categories = dataset.get_categories()\n",
- "\n",
- "cat_table = Table(title=\"๐ Available Categories\", show_header=True, header_style=\"bold green\")\n",
- "cat_table.add_column(\"Category\", style=\"white\")\n",
- "cat_table.add_column(\"Count\", style=\"cyan\", justify=\"right\")\n",
- "\n",
- "for cat in sorted(categories):\n",
- " count = len(dataset.get_by_category(cat))\n",
- " cat_table.add_row(cat, str(count))\n",
- "\n",
- "console.print(cat_table)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## Summary\n\nIn this notebook, you learned:\n\n1. How ADK manages multi-turn conversations via `InMemorySessionService`\n2. How to use `session_id` for conversation continuity\n3. How to run systematic evaluations with `DeepSearchQAEvaluator`\n4. How to analyze evaluation results with rich visualizations\n5. Key metrics for evaluating research agents\n\n## Next Steps\n\n- Run the Gradio app for interactive testing\n- Experiment with different models (gemini-2.5-pro vs flash)\n- Try the async evaluator for larger-scale evaluation\n- Implement LLM-as-judge evaluation for more nuanced correctness checking"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "console.print(\n",
- " Panel(\n",
- " \"[green]โ[/green] Notebook complete!\\n\\n\"\n",
- " \"[cyan]Next:[/cyan] Run [bold]gradio_app.py[/bold] for interactive testing.\",\n",
- " title=\"๐ Done\",\n",
- " border_style=\"green\",\n",
- " )\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/implementations/knowledge_qa/README.md b/implementations/knowledge_qa/README.md
index d796e37..d0617bd 100644
--- a/implementations/knowledge_qa/README.md
+++ b/implementations/knowledge_qa/README.md
@@ -1,19 +1,18 @@
# Knowledge-Grounded QA Agent
-This implementation demonstrates a knowledge-grounded question answering agent using **Google ADK** with explicit **Google Search tool calls**, evaluated on the **DeepSearchQA** benchmark.
+This implementation demonstrates a knowledge-grounded question answering agent using **Google ADK** with a **PlanReAct** architecture, evaluated on the **DeepSearchQA** benchmark.
## Overview
-The knowledge agent uses a ReAct (Reasoning + Acting) architecture powered by Google ADK. It explicitly calls Google Search as a tool, making the reasoning process transparent through observable Thought โ Action โ Observation cycles. This approach searches the live web to find relevant information for questions requiring real-time data.
+The agent combines two patterns: **PlanReAct** (creates an explicit numbered research plan before executing) and a **ReAct loop** within each step (Thought → Tool Call → Observation). It searches the live web to find and verify facts rather than relying on training data.
## Features
-- **ReAct Architecture**: Explicit tool calls with traceable reasoning (Thought โ Action โ Observation)
-- **Google Search Tool**: Uses ADK's `GoogleSearchTool` for real-time web search
-- **Source Citation**: Automatically extracts and includes source URLs from search results
-- **DeepSearchQA Evaluation**: Built-in evaluation on the DeepSearchQA benchmark (900 research tasks)
+- **PlanReAct Architecture**: Explicit research plan with step statuses, revised mid-run if needed
+- **Five Tools**: `google_search`, `web_fetch`, `fetch_file`, `grep_file`, `read_file`
+- **Source Citation**: Extracts and cites source URLs from search results
+- **DeepSearchQA Evaluation**: LLM-as-judge evaluation on the DeepSearchQA benchmark (896 questions)
- **Multi-turn Conversations**: Session management via ADK's `InMemorySessionService`
-- **Gradio Interface**: Interactive chat UI for testing
## Setup
@@ -36,14 +35,6 @@ uv sync
## Usage
-### Interactive Chat
-
-Run the Gradio app:
-
-```bash
-uv run --env-file .env gradio implementations/knowledge_qa/gradio_app.py
-```
-
### Programmatic Usage
```python
@@ -64,43 +55,66 @@ print(f"Tool calls: {response.tool_calls}")
### Evaluation on DeepSearchQA
-```python
-from aieng.agent_evals.knowledge_qa import (
- KnowledgeGroundedAgent,
- DeepSearchQAEvaluator,
-)
+Use the main evaluation script to run comprehensive evaluations:
-agent = KnowledgeGroundedAgent()
-evaluator = DeepSearchQAEvaluator(agent)
+```bash
+# Run evaluation on 3 samples
+python implementations/knowledge_qa/evaluate.py --samples 3
-# Evaluate a sample (use await in Jupyter)
-results = await evaluator.evaluate_sample_async(n=10, random_state=42)
+# Run with specific example IDs
+python implementations/knowledge_qa/evaluate.py --ids 123 456 789
-# Convert to DataFrame for analysis
-df = evaluator.results_to_dataframe(results)
-print(df[["example_id", "ground_truth", "prediction", "sources_used"]])
+# Enable trace groundedness evaluation
+ENABLE_TRACE_GROUNDEDNESS=true python implementations/knowledge_qa/evaluate.py
+```
+
+Or use the CLI:
+
+```bash
+# Run evaluation via CLI
+uv run --env-file .env knowledge-qa eval --samples 3
+uv run --env-file .env knowledge-qa eval --ids 123 456 --show-plan
+```
+
+## Run with ADK Web UI
+
+The module exposes a top-level `root_agent` for ADK discovery, so you can inspect the agent interactively:
+
+```bash
+uv run adk web --port 8000 --reload --reload_agents implementations/
```
## Notebooks
-1. **01_grounding_basics.ipynb**: Introduction to the knowledge agent and Google Search tool
-2. **02_agent_basics.ipynb**: Creating agents with custom instructions
-3. **03_multi_turn.ipynb**: Multi-turn conversations and DeepSearchQA evaluation
+1. **01_dataset_and_tools.ipynb**: The DeepSearchQA dataset and the agent's five tools
+2. **02_running_the_agent.ipynb**: PlanReAct architecture, live progress display, multi-turn conversations, and Langfuse tracing
+3. **03_evaluation.ipynb**: Systematic evaluation with `run_experiment`, LLM-as-judge grading, and result inspection
## Architecture
```
aieng.agent_evals.knowledge_qa/
-├── config.py             # Configuration (Pydantic settings)
-├── grounding_tool.py     # GoogleSearchTool wrapper and response models
-├── agent.py              # KnowledgeGroundedAgent (ADK Agent + Runner)
-├── session.py            # Conversation session management
-└── evaluation.py         # DeepSearchQA dataset and evaluator
+├── agent.py                 # KnowledgeGroundedAgent (ADK Agent + Runner)
+├── data/                    # DeepSearchQA dataset loader
+├── deepsearchqa_grader.py   # LLM-as-judge evaluation
+├── planner.py               # Research planning
+├── token_tracker.py         # Token usage tracking
+└── cli.py                   # Rich CLI interface
+
+aieng.agent_evals/
+├── configs.py               # Configuration (Pydantic settings)
+├── evaluation/              # Evaluation harness
+│   ├── experiment.py        # Langfuse experiment runner
+│   └── graders/             # Evaluators (trace groundedness, etc.)
+└── tools/                   # Shared tools
+    ├── search.py            # GoogleSearchTool wrapper
+    ├── web.py               # web_fetch for HTML/PDF
+    └── file.py              # fetch_file, grep_file, read_file
```
## DeepSearchQA Dataset
-The [DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) benchmark consists of 900 "causal chain" research tasks across 17 categories. These questions require:
+The [DeepSearchQA](https://www.kaggle.com/datasets/deepmind/deepsearchqa) benchmark consists of 896 "causal chain" research tasks across 17 categories. These questions require:
- Multi-source lookups
- Statistical comparisons
diff --git a/implementations/knowledge_qa/agent.py b/implementations/knowledge_qa/agent.py
new file mode 100644
index 0000000..d386b0c
--- /dev/null
+++ b/implementations/knowledge_qa/agent.py
@@ -0,0 +1,21 @@
+"""ADK discovery entrypoint for the Knowledge QA agent.
+
+Exposes a module-level ``root_agent`` so ``adk web`` can discover it.
+
+Examples
+--------
+Run with ``adk web``:
+ uv run adk web --port 8000 --reload --reload_agents implementations/
+"""
+
+import logging
+
+from aieng.agent_evals.knowledge_qa.agent import KnowledgeGroundedAgent
+
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+
+# ADK discovery expects a module-level `root_agent`
+root_agent = KnowledgeGroundedAgent().adk_agent