diff --git a/examples/evaluation/callbacks/.env b/examples/evaluation/callbacks/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/callbacks/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/callbacks/README.md b/examples/evaluation/callbacks/README.md new file mode 100644 index 0000000..91d725c --- /dev/null +++ b/examples/evaluation/callbacks/README.md @@ -0,0 +1,22 @@ +# 回调 Callbacks 示例 + +在评测中注册 **Callbacks**:在推理集/用例推理、打分集/用例打分的 8 个生命周期节点挂载钩子,用于打点、日志、采样或上报。 + +## 目录结构 + +- `callbacks/`:示例根目录 +- `agent/`:内含 `agent.py`、`callbacks_example.evalset.json`、`test_config.json`、`config.py` +- `test_callbacks.py`:调用 `AgentEvaluator.evaluate(..., callbacks=callbacks)`,注册全部 8 个生命周期回调(`before_inference_set`、`after_inference_set`、`before_inference_case`、`after_inference_case`、`before_evaluate_set`、`after_evaluate_set`、`before_evaluate_case`、`after_evaluate_case`)并打日志,最后断言回调的触发顺序 + +## 环境要求 + +Python 3.10+。需配置 `TRPC_AGENT_API_KEY` 等环境变量(同 quickstart)。 + +## 运行 + +```bash +cd examples/evaluation/callbacks +pytest test_callbacks.py -v --tb=short -s +``` + +`-s` 可看到回调中的 print 输出。 diff --git a/examples/evaluation/callbacks/agent/__init__.py b/examples/evaluation/callbacks/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/callbacks/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/callbacks/agent/agent.py b/examples/evaluation/callbacks/agent/agent.py new file mode 100644 index 0000000..5bff805 --- /dev/null +++ b/examples/evaluation/callbacks/agent/agent.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent for callbacks example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def 
get_weather(city: str) -> Dict[str, Any]: + """查询指定城市当前天气。""" + data = { + "上海": {"temperature": 18, "condition": "多云"}, + "北京": {"temperature": 15, "condition": "晴"}, + } + result = data.get(city, {"temperature": 20, "condition": "未知"}) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + api_key, url, model_name = get_model_config() + return LlmAgent( + name="weather_agent", + description="天气查询助手", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=url), + instruction="你是天气助手,用 get_weather 查询城市天气并简要回答。", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/callbacks/agent/callbacks_example.evalset.json b/examples/evaluation/callbacks/agent/callbacks_example.evalset.json new file mode 100644 index 0000000..47515f6 --- /dev/null +++ b/examples/evaluation/callbacks/agent/callbacks_example.evalset.json @@ -0,0 +1,37 @@ +{ + "eval_set_id": "callbacks_example", + "name": "回调示例", + "description": "演示在评测中注册 Callbacks,在推理/打分生命周期打日志或计数", + "eval_cases": [ + { + "eval_id": "callbacks_001", + "conversation": [ + { + "invocation_id": "e-1", + "user_content": { + "parts": [{"text": "上海天气怎么样"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "18"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/callbacks/agent/config.py b/examples/evaluation/callbacks/agent/config.py new file mode 100644 index 0000000..428fb8d --- /dev/null +++ b/examples/evaluation/callbacks/agent/config.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) 
+ url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/callbacks/agent/test_config.json b/examples/evaluation/callbacks/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/callbacks/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/callbacks/test_callbacks.py b/examples/evaluation/callbacks/test_callbacks.py new file mode 100644 index 0000000..8a026e8 --- /dev/null +++ b/examples/evaluation/callbacks/test_callbacks.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""回调示例:在评测中注册 8 个生命周期 Callbacks,打日志并演示 context 传递。""" + +import os +import pytest +from trpc_agent_sdk.evaluation import ( + AgentEvaluator, + Callbacks, + Callback, + CallbackResult, + BeforeInferenceSetArgs, + AfterInferenceSetArgs, + BeforeInferenceCaseArgs, + AfterInferenceCaseArgs, + BeforeEvaluateSetArgs, + AfterEvaluateSetArgs, + BeforeEvaluateCaseArgs, + AfterEvaluateCaseArgs, +) + +triggered: list[str] = [] + + +def before_inference_set(ctx, args: BeforeInferenceSetArgs): + triggered.append("before_inference_set") + print("[callback] 推理集开始", args.request.eval_set_id, flush=True) + return None + + +def after_inference_set(ctx, 
args: AfterInferenceSetArgs): + triggered.append("after_inference_set") + n = len(args.results) if args.results else 0 + print("[callback] 推理集结束,共", n, "个用例", flush=True) + return None + + +def before_inference_case(ctx, args: BeforeInferenceCaseArgs): + triggered.append("before_inference_case") + print("[callback] 用例推理开始", args.eval_case_id, flush=True) + return None + + +def after_inference_case(ctx, args: AfterInferenceCaseArgs): + triggered.append("after_inference_case") + print("[callback] 用例推理结束", args.result.eval_case_id, flush=True) + return None + + +def before_evaluate_set(ctx, args: BeforeEvaluateSetArgs): + triggered.append("before_evaluate_set") + n = len(args.request.inference_results) + print("[callback] 打分集开始 cases=", n, flush=True) + return CallbackResult(context={"phase": "evaluate"}) + + +def after_evaluate_set(ctx, args: AfterEvaluateSetArgs): + triggered.append("after_evaluate_set") + n = len(args.result.eval_case_results) if args.result else 0 + phase = (ctx.get("context") or {}).get("phase", "?") + print("[callback] 打分集结束,共", n, "个用例,ctx.phase=", phase, flush=True) + return None + + +def before_evaluate_case(ctx, args: BeforeEvaluateCaseArgs): + triggered.append("before_evaluate_case") + print("[callback] 用例打分开始", args.eval_case_id, flush=True) + return None + + +def after_evaluate_case(ctx, args: AfterEvaluateCaseArgs): + triggered.append("after_evaluate_case") + print("[callback] 用例打分结束", args.result.eval_id, flush=True) + return None + + +@pytest.mark.asyncio +async def test_with_callbacks(): + triggered.clear() + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "callbacks_example.evalset.json") + + callbacks = Callbacks() + callbacks.register( + "demo", + Callback( + before_inference_set=before_inference_set, + after_inference_set=after_inference_set, + before_inference_case=before_inference_case, + after_inference_case=after_inference_case, + before_evaluate_set=before_evaluate_set, 
+ after_evaluate_set=after_evaluate_set, + before_evaluate_case=before_evaluate_case, + after_evaluate_case=after_evaluate_case, + ), + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + callbacks=callbacks, + ) + + expected = [ + "before_inference_set", + "before_inference_case", + "after_inference_case", + "after_inference_set", + "before_evaluate_set", + "before_evaluate_case", + "after_evaluate_case", + "after_evaluate_set", + ] + assert triggered == expected, triggered diff --git a/examples/evaluation/context_messages/.env b/examples/evaluation/context_messages/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/context_messages/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/context_messages/README.md b/examples/evaluation/context_messages/README.md new file mode 100644 index 0000000..76bd75c --- /dev/null +++ b/examples/evaluation/context_messages/README.md @@ -0,0 +1,20 @@ +# 上下文注入示例 + +在评测用例上配置 **context_messages**:评估服务在每轮推理前会将其中内容注入会话上下文,用于传递系统说明、领域知识或格式约束。 + +## 目录结构 + +- `context_messages/`:示例根目录 +- `agent/`:内含 `agent.py`、`context_example.evalset.json`(含带 context_messages 的用例)、`test_config.json`、`config.py` +- `test_context_messages.py`:调用 `AgentEvaluator.evaluate` 跑评测 + +## 环境要求 + +Python 3.10+。需配置 `TRPC_AGENT_API_KEY` 等环境变量(同 quickstart)。 + +## 运行 + +```bash +cd examples/evaluation/context_messages +pytest test_context_messages.py -v --tb=short -s +``` diff --git a/examples/evaluation/context_messages/agent/__init__.py b/examples/evaluation/context_messages/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/context_messages/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git 
a/examples/evaluation/context_messages/agent/agent.py b/examples/evaluation/context_messages/agent/agent.py new file mode 100644 index 0000000..1539966 --- /dev/null +++ b/examples/evaluation/context_messages/agent/agent.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent for context_messages example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def get_weather(city: str) -> Dict[str, Any]: + """查询指定城市当前天气。""" + data = { + "上海": {"temperature": 18, "condition": "多云"}, + "北京": {"temperature": 15, "condition": "晴"}, + } + result = data.get(city, {"temperature": 20, "condition": "未知"}) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + api_key, url, model_name = get_model_config() + return LlmAgent( + name="weather_agent", + description="天气查询助手", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=url), + instruction="你是天气助手,用 get_weather 查询城市天气并简要回答。", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/context_messages/agent/config.py b/examples/evaluation/context_messages/agent/config.py new file mode 100644 index 0000000..428fb8d --- /dev/null +++ b/examples/evaluation/context_messages/agent/config.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, 
model_name diff --git a/examples/evaluation/context_messages/agent/context_example.evalset.json b/examples/evaluation/context_messages/agent/context_example.evalset.json new file mode 100644 index 0000000..612200b --- /dev/null +++ b/examples/evaluation/context_messages/agent/context_example.evalset.json @@ -0,0 +1,47 @@ +{ + "eval_set_id": "context_messages_example", + "name": "上下文注入示例", + "description": "用例中配置 context_messages,在每轮推理前注入到会话", + "eval_cases": [ + { + "eval_id": "with_context_001", + "context_messages": [ + { + "parts": [ + { + "text": "你是一个天气助手,只回答与天气相关的问题,回答要简短。" + } + ], + "role": "user" + } + ], + "conversation": [ + { + "invocation_id": "e-1", + "user_content": { + "parts": [{"text": "上海天气怎么样"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "18"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/context_messages/agent/test_config.json b/examples/evaluation/context_messages/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/context_messages/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/context_messages/test_context_messages.py b/examples/evaluation/context_messages/test_context_messages.py new file mode 100644 index 
0000000..ee32161 --- /dev/null +++ b/examples/evaluation/context_messages/test_context_messages.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""上下文注入示例:在用例中配置 context_messages,每轮推理前注入到会话。""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_context_messages(): + """用例带 context_messages,推理前会注入到会话上下文。""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "context_example.evalset.json" + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/custom_runner/.env b/examples/evaluation/custom_runner/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/custom_runner/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/custom_runner/README.md b/examples/evaluation/custom_runner/README.md new file mode 100644 index 0000000..64bede7 --- /dev/null +++ b/examples/evaluation/custom_runner/README.md @@ -0,0 +1,39 @@ +# 自定义 Runner 示例 + +本示例演示如何在评测中**传入自定义 Runner**:推理由你提供的 Runner(含 Agent、SessionService 等)执行,打分逻辑仍由评测框架完成。 + +## 适用场景 + +- 复用已有会话服务(如 Redis、SQL) +- 与线上/本地部署使用同一 Runner 形态,统一鉴权、日志等 +- 希望评测与真实运行环境一致 + +## 目录结构 + +``` +custom_runner/ +├── agent/ +│ ├── __init__.py +│ ├── agent.py # 天气 Agent(root_agent) +│ ├── config.py # 模型配置 +│ ├── test_config.json # 评测指标配置 +│ └── custom_runner_example.evalset.json # 评测集 +├── test_custom_runner.py # 构造 Runner 并调用 evaluate(..., runner=runner) +└── README.md +``` + +## 运行方式 + +在 **本目录** 下执行(需已配置 `TRPC_AGENT_API_KEY` 等环境变量): + +```bash +pytest test_custom_runner.py -v -s +``` + +## 要点 + +1. **构造 Runner**:使用 `Runner(app_name=..., agent=..., session_service=...)`,本示例使用 `InMemorySessionService()`,可按需替换为其他 SessionService。 +2. 
**传入评测**:在 `AgentEvaluator.evaluate(..., runner=runner)` 或 `get_executer(..., runner=runner)` 中传入你的 Runner。 +3. **session_input**:若评测用例中配置了 `session_input`,框架会按需在该 Runner 的会话中创建/更新会话。 + +更多说明见文档 [run_eval_pytest.md](../../../docs/evaluation/run_eval_pytest.md) 中的「自定义 Runner」小节。 diff --git a/examples/evaluation/custom_runner/agent/__init__.py b/examples/evaluation/custom_runner/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/custom_runner/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/custom_runner/agent/agent.py b/examples/evaluation/custom_runner/agent/agent.py new file mode 100644 index 0000000..3be5b7e --- /dev/null +++ b/examples/evaluation/custom_runner/agent/agent.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent for custom runner example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def get_weather(city: str) -> Dict[str, Any]: + """查询指定城市当前天气。""" + data = { + "上海": {"temperature": 18, "condition": "多云"}, + "北京": {"temperature": 15, "condition": "晴"}, + } + result = data.get(city, {"temperature": 20, "condition": "未知"}) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + api_key, url, model_name = get_model_config() + return LlmAgent( + name="weather_agent", + description="天气查询助手", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=url), + instruction="你是天气助手,用 get_weather 查询城市天气并简要回答。", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/custom_runner/agent/config.py b/examples/evaluation/custom_runner/agent/config.py new file mode 100644 index 
0000000..428fb8d --- /dev/null +++ b/examples/evaluation/custom_runner/agent/config.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/custom_runner/agent/custom_runner_example.evalset.json b/examples/evaluation/custom_runner/agent/custom_runner_example.evalset.json new file mode 100644 index 0000000..8a1d80e --- /dev/null +++ b/examples/evaluation/custom_runner/agent/custom_runner_example.evalset.json @@ -0,0 +1,37 @@ +{ + "eval_set_id": "custom_runner_example", + "name": "自定义 Runner 示例", + "description": "使用自建 Runner(含 agent 与 session_service)跑评测,打分仍由框架完成", + "eval_cases": [ + { + "eval_id": "custom_runner_001", + "conversation": [ + { + "invocation_id": "e-1", + "user_content": { + "parts": [{"text": "上海天气怎么样"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "18"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/custom_runner/agent/test_config.json b/examples/evaluation/custom_runner/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/custom_runner/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": 
{ + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/custom_runner/test_custom_runner.py b/examples/evaluation/custom_runner/test_custom_runner.py new file mode 100644 index 0000000..e243d11 --- /dev/null +++ b/examples/evaluation/custom_runner/test_custom_runner.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""自定义 Runner 示例:使用自建 Runner(agent + session_service)跑评测。""" + +import os +import pytest + +from trpc_agent_sdk.evaluation import AgentEvaluator +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService + +# 从当前示例的 agent 包加载 root_agent +from agent import root_agent + + +@pytest.mark.asyncio +async def test_evaluate_with_custom_runner(): + """使用自定义 Runner 执行评测:自建 Runner 负责推理,打分由框架完成。""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "custom_runner_example.evalset.json" + ) + + # 自建会话服务(可替换为 Redis/SQL 等) + session_service = InMemorySessionService() + + # 构造 Runner:与线上/本地部署使用同一 Runner 形态,便于复用环境 + runner = Runner( + app_name="weather_agent", + agent=root_agent, + session_service=session_service, + ) + + # 传入 runner 后,推理由该 Runner 执行,打分仍由评测框架完成 + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + runner=runner, + ) diff --git a/examples/evaluation/llm_final_response/.env b/examples/evaluation/llm_final_response/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/llm_final_response/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff 
--git a/examples/evaluation/llm_final_response/README.md b/examples/evaluation/llm_final_response/README.md new file mode 100644 index 0000000..34e6cd9 --- /dev/null +++ b/examples/evaluation/llm_final_response/README.md @@ -0,0 +1,23 @@ +# llm_final_response 评估器示例 + +使用 **llm_final_response** 指标:由裁判模型对比 Agent 实际最终回答与评测集中的参考答案,给出是否一致的判定。 + +## 目录结构 + +- `agent/`:Agent 模块(`agent.py`、`config.py`)、评测集 `llm_final_response.evalset.json`、`test_config.json` +- `test_llm_final_response.py`:pytest 入口 + +## 环境变量 + +- `TRPC_AGENT_API_KEY` 或 `API_KEY`(必填,Agent 与裁判模型共用) +- `TRPC_AGENT_BASE_URL`(可选) +- `TRPC_AGENT_MODEL_NAME`(可选,默认 glm-4-flash) + +## 运行 + +```bash +cd examples/evaluation/llm_final_response +pytest test_llm_final_response.py -v --tb=short -s +``` + +评测集用例中需提供预期的 `final_response`,裁判模型将实际回答与该参考对比后输出 valid/invalid。 diff --git a/examples/evaluation/llm_final_response/agent/__init__.py b/examples/evaluation/llm_final_response/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/llm_final_response/agent/agent.py b/examples/evaluation/llm_final_response/agent/agent.py new file mode 100644 index 0000000..e42f996 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/agent.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Simple QA agent for llm_final_response evaluator demo.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def get_weather(city: str) 
-> Dict[str, Any]: + """查询指定城市当前天气。""" + weather_data = { + "北京": {"temperature": 15, "condition": "晴"}, + "上海": {"temperature": 18, "condition": "多云"}, + "深圳": {"temperature": 25, "condition": "晴"}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未知"} + ) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + """Create the agent for llm_final_response demo.""" + return LlmAgent( + name="llm_final_response_agent", + description="简单问答助手,可查天气", + model=_create_model(), + instruction=( + "你是问答助手。用户问天气时用 get_weather 查询后简洁回答," + "例如只回答温度与天气状况。" + ), + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/llm_final_response/agent/config.py b/examples/evaluation/llm_final_response/agent/config.py new file mode 100644 index 0000000..5f1c9c4 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4-flash") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/llm_final_response/agent/llm_final_response.evalset.json b/examples/evaluation/llm_final_response/agent/llm_final_response.evalset.json new file mode 100644 index 0000000..3c78d36 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/llm_final_response.evalset.json @@ -0,0 +1,31 @@ +{ + "eval_set_id": "llm_final_response", + "name": "LLM 最终响应评估示例", + "description": "使用 llm_final_response 评估器对比实际回答与参考答案", + 
"eval_cases": [ + { + "eval_id": "case_001", + "conversation": [ + { + "invocation_id": "inv-001", + "user_content": { + "parts": [{"text": "上海天气怎么样"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "上海当前天气多云,气温 18°C。"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [{"id": "t1", "name": "get_weather", "args": {"city": "上海"}}] + } + } + ], + "session_input": { + "app_name": "llm_final_response_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/llm_final_response/agent/test_config.json b/examples/evaluation/llm_final_response/agent/test_config.json new file mode 100644 index 0000000..f7b5ad1 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/test_config.json @@ -0,0 +1,19 @@ +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "num_samples": 1, + "generation_config": {"max_tokens": 2000, "temperature": 0.2} + } + } + } + } + ] +} diff --git a/examples/evaluation/llm_final_response/test_llm_final_response.py b/examples/evaluation/llm_final_response/test_llm_final_response.py new file mode 100644 index 0000000..af85d64 --- /dev/null +++ b/examples/evaluation/llm_final_response/test_llm_final_response.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""llm_final_response 评估器示例测试""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_llm_final_response_demo(): + """使用 llm_final_response 指标评测 Agent:裁判模型对比实际回答与参考答案。""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "llm_final_response.evalset.json" + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="llm_final_response_agent", + 
eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/llm_judge_tools/.env b/examples/evaluation/llm_judge_tools/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/llm_judge_tools/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/llm_judge_tools/README.md b/examples/evaluation/llm_judge_tools/README.md new file mode 100644 index 0000000..843f3ca --- /dev/null +++ b/examples/evaluation/llm_judge_tools/README.md @@ -0,0 +1,63 @@ +# 为裁判 Agent 注册工具示例(Rubric 指标) + +通过 **LLM_EVALUATOR_REGISTRY** 为 LLM Judge 的裁判 Agent 注册工具,并在 **rubric 指标**(`llm_rubric_response`)的细则中**规定工具的调用时机和用法**,使裁判在打分时必须先调用工具获取判定标准,再按标准条款判定。 + +## 原理 + +- 本示例使用 **llm_rubric_response** 指标(而非 llm_final_response),通过多条 Rubric 约束裁判行为。 +- 在 `test_config.json` 的 **rubrics** 中明确写出: + - **调用时机**:裁判在打分前必须先调用 `get_eval_policy` 获取本用例的判定标准。 + - **用法**:仅根据该工具返回的标准条款进行判定,不得自行增加或减少条款;后续细则要求「根据 get_eval_policy 返回的标准第 N 条判定」。 +- 裁判由框架内的 **LlmAgent** 担任;通过 `LLM_EVALUATOR_REGISTRY.register_judge_tools("llm_rubric_response", [FunctionTool(get_eval_policy)])` 为裁判注入 **get_eval_policy** 工具。裁判模型在推理时会看到 rubrics,因此会先调用工具再按条款打分,工具效果明显。 + +## 目录结构 + +- `agent/`:被评测的 Agent、评测集 `judge_tools.evalset.json`、`test_config.json`(含 llm_rubric_response 与规定工具用法的 rubrics) +- `test_llm_judge_tools.py`:注册 judge 工具 `get_eval_policy` 并执行评测 + +## 示例代码 + +在运行评测前为 **llm_rubric_response** 注册工具: + +```python +from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY +from trpc_agent_sdk.tools import FunctionTool + +def get_eval_policy() -> str: + """裁判在打分前必须调用:返回本用例的判定标准。""" + return ( + "本用例判定标准(共 3 条):\n" + "1. 最终回答须包含明确的温度数值。\n" + "2. 最终回答须包含天气状况描述。\n" + "3. 
回答须与用户问题直接相关。" + ) + +LLM_EVALUATOR_REGISTRY.register_judge_tools( + "llm_rubric_response", + [FunctionTool(get_eval_policy)], +) +``` + +在 `test_config.json` 的 rubrics 中规定调用时机与用法(见 `agent/test_config.json`): + +- 第一条 rubric:裁判在打分前必须先调用 `get_eval_policy`,再仅根据返回的条款判定。 +- 其余 rubrics:根据 `get_eval_policy` 返回的标准第 1、2、3 条逐条判定。 + +取消注册: + +```python +LLM_EVALUATOR_REGISTRY.unregister_judge_tools("llm_rubric_response") +``` + +## 环境变量 + +- `TRPC_AGENT_API_KEY` 或 `API_KEY`(必填,Agent 与裁判模型共用) +- `TRPC_AGENT_BASE_URL`(可选) +- `TRPC_AGENT_MODEL_NAME`(可选) + +## 运行 + +```bash +cd examples/evaluation/llm_judge_tools +pytest test_llm_judge_tools.py -v --tb=short -s +``` diff --git a/examples/evaluation/llm_judge_tools/agent/__init__.py b/examples/evaluation/llm_judge_tools/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/llm_judge_tools/agent/agent.py b/examples/evaluation/llm_judge_tools/agent/agent.py new file mode 100644 index 0000000..f39b45e --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/agent.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Simple QA agent for llm_judge_tools example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def get_weather(city: str) -> Dict[str, Any]: + """查询指定城市当前天气。""" + weather_data = { + "北京": {"temperature": 15, "condition": "晴"}, + "上海": {"temperature": 18, "condition": "多云"}, + "深圳": {"temperature": 25, "condition": 
"晴"}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未知"} + ) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + """Create the agent for llm_judge_tools demo.""" + return LlmAgent( + name="llm_judge_tools_agent", + description="简单问答助手,可查天气", + model=_create_model(), + instruction=( + "你是问答助手。用户问天气时用 get_weather 查询后简洁回答," + "例如只回答温度与天气状况。" + ), + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/llm_judge_tools/agent/config.py b/examples/evaluation/llm_judge_tools/agent/config.py new file mode 100644 index 0000000..5f1c9c4 --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4-flash") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/llm_judge_tools/agent/judge_tools.evalset.json b/examples/evaluation/llm_judge_tools/agent/judge_tools.evalset.json new file mode 100644 index 0000000..46a9f7f --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/judge_tools.evalset.json @@ -0,0 +1,31 @@ +{ + "eval_set_id": "llm_judge_tools", + "name": "Judge 注册工具示例", + "description": "使用 llm_rubric_response,在 rubric 中规定裁判须先调用 get_eval_policy 再按返回条款判定", + "eval_cases": [ + { + "eval_id": "case_001", + "conversation": [ + { + "invocation_id": "inv-001", + "user_content": { + "parts": [{"text": "上海天气怎么样"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": 
"上海当前天气多云,气温 18°C。"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [{"id": "t1", "name": "get_weather", "args": {"city": "上海"}}] + } + } + ], + "session_input": { + "app_name": "llm_judge_tools_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/llm_judge_tools/agent/test_config.json b/examples/evaluation/llm_judge_tools/agent/test_config.json new file mode 100644 index 0000000..2b6bc7b --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/test_config.json @@ -0,0 +1,53 @@ +{ + "metrics": [ + { + "metric_name": "llm_rubric_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "num_samples": 1, + "generation_config": {"max_tokens": 2000, "temperature": 0.2} + }, + "rubrics": [ + { + "id": "call_tool_first", + "content": { + "text": "裁判在打分前必须先调用 get_eval_policy 获取本用例的判定标准,再仅根据该工具返回的标准条款进行判定,不得自行增加或减少条款。" + }, + "description": "判定前须调用工具获取标准", + "type": "JUDGE_TOOL_USAGE" + }, + { + "id": "clause_1", + "content": { + "text": "根据 get_eval_policy 返回的标准第 1 条判定:最终回答是否满足该条。" + }, + "description": "按政策第 1 条判定", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "clause_2", + "content": { + "text": "根据 get_eval_policy 返回的标准第 2 条判定:最终回答是否满足该条。" + }, + "description": "按政策第 2 条判定", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "clause_3", + "content": { + "text": "根据 get_eval_policy 返回的标准第 3 条判定:最终回答是否满足该条。" + }, + "description": "按政策第 3 条判定", + "type": "RELEVANCE" + } + ] + } + } + } + ] +} diff --git a/examples/evaluation/llm_judge_tools/test_llm_judge_tools.py b/examples/evaluation/llm_judge_tools/test_llm_judge_tools.py new file mode 100644 index 0000000..33ca41d --- /dev/null +++ b/examples/evaluation/llm_judge_tools/test_llm_judge_tools.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""为裁判 Agent 注册工具的示例:使用 
llm_rubric_response,在 rubric 中规定工具的调用时机与用法。""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator, LLM_EVALUATOR_REGISTRY +from trpc_agent_sdk.tools import FunctionTool + + +def get_eval_policy() -> str: + """裁判在打分前必须调用的工具:返回本用例的判定标准。裁判须先调用本工具获取标准,再仅按返回的条款逐条判定。""" + return ( + "本用例判定标准(共 3 条):\n" + "1. 最终回答须包含明确的温度数值(如 18、18°C)。\n" + "2. 最终回答须包含天气状况描述(如晴、多云、阴)。\n" + "3. 回答须与用户问题直接相关,不得答非所问。" + ) + + +# 为 llm_rubric_response 的 judge agent 注册工具;rubric 中已规定「必须先调用 get_eval_policy 再按返回条款判定」 +LLM_EVALUATOR_REGISTRY.register_judge_tools( + "llm_rubric_response", + [FunctionTool(get_eval_policy)], +) + + +@pytest.mark.asyncio +async def test_llm_judge_with_tools(): + """使用 llm_rubric_response:裁判按 rubric 须先调用 get_eval_policy 获取判定标准,再按标准条款打分。""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "judge_tools.evalset.json") + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="llm_judge_tools_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/llm_rubric_knowledge_recall/.env b/examples/evaluation/llm_rubric_knowledge_recall/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/llm_rubric_knowledge_recall/README.md b/examples/evaluation/llm_rubric_knowledge_recall/README.md new file mode 100644 index 0000000..6a9d960 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/README.md @@ -0,0 +1,23 @@ +# llm_rubric_knowledge_recall 评估器示例 + +使用 **llm_rubric_knowledge_recall** 指标:从 Agent 实际轨迹中提取**知识检索类工具**(默认 `knowledge_search`)的调用结果,由裁判模型根据 **rubrics** 判定检索内容是否足以支撑问题或细则,适用于 RAG 召回质量评估。 + +## 目录结构 + +- `agent/`:Agent 带 `knowledge_search` 工具、评测集 `llm_rubric_knowledge_recall.evalset.json`、`test_config.json`(含 `rubrics` 
与可选 `knowledge_tool_names`) +- `test_llm_rubric_knowledge_recall.py`:pytest 入口 + +## 环境变量 + +- `TRPC_AGENT_API_KEY` 或 `API_KEY`(必填) +- `TRPC_AGENT_BASE_URL`(可选) +- `TRPC_AGENT_MODEL_NAME`(可选,默认 glm-4-flash) + +## 运行 + +```bash +cd examples/evaluation/llm_rubric_knowledge_recall +pytest test_llm_rubric_knowledge_recall.py -v --tb=short -s +``` + +Agent 必须在实际运行中调用 `knowledge_search`(或你在 `knowledge_tool_names` 中配置的工具名),否则轨迹中无检索结果,裁判无法稳定打分。本示例中 `knowledge_search` 返回模拟文档,裁判据此与 rubrics 判定。 diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/__init__.py b/examples/evaluation/llm_rubric_knowledge_recall/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/agent.py b/examples/evaluation/llm_rubric_knowledge_recall/agent/agent.py new file mode 100644 index 0000000..e1655de --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/agent.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent with knowledge_search for llm_rubric_knowledge_recall evaluator demo.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def knowledge_search(query: str, top_k: int = 3) -> Dict[str, Any]: + """知识检索:根据 query 返回模拟的检索结果(示例用)。""" + # 模拟检索结果,裁判将根据这些内容与 rubrics 判定召回质量 + mock_docs = [ + {"title": "产品A", "content": "产品A 适用于企业协作,支持文档与任务管理。"}, + {"title": "产品B", "content": "产品B 提供 API 与 SDK,便于集成。"}, + {"title": 
"产品C", "content": "产品C 面向个人用户,提供笔记与待办。"}, + ] + return { + "query": query, + "results": mock_docs[:top_k], + } + + +def create_agent() -> LlmAgent: + """Create the agent for llm_rubric_knowledge_recall demo.""" + return LlmAgent( + name="llm_rubric_knowledge_recall_agent", + description="带知识检索的问答助手", + model=_create_model(), + instruction=( + "你是知识问答助手。用户提问时先调用 knowledge_search 检索相关知识," + "再根据检索结果组织回答。必须调用 knowledge_search 后再回答。" + ), + tools=[FunctionTool(knowledge_search)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/config.py b/examples/evaluation/llm_rubric_knowledge_recall/agent/config.py new file mode 100644 index 0000000..5f1c9c4 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4-flash") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/llm_rubric_knowledge_recall.evalset.json b/examples/evaluation/llm_rubric_knowledge_recall/agent/llm_rubric_knowledge_recall.evalset.json new file mode 100644 index 0000000..dfb97a6 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/llm_rubric_knowledge_recall.evalset.json @@ -0,0 +1,33 @@ +{ + "eval_set_id": "llm_rubric_knowledge_recall", + "name": "LLM 细则知识库召回评估示例", + "description": "使用 llm_rubric_knowledge_recall 评估器判定检索到的知识是否支撑 rubrics", + "eval_cases": [ + { + "eval_id": "case_001", + 
"conversation": [ + { + "invocation_id": "inv-001", + "user_content": { + "parts": [{"text": "公司有哪些产品?请简单介绍"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "根据检索结果,公司有产品A(企业协作)、产品B(API/SDK)、产品C(个人笔记与待办)。"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + {"id": "t1", "name": "knowledge_search", "args": {"query": "公司产品", "top_k": 3}} + ] + } + } + ], + "session_input": { + "app_name": "llm_rubric_knowledge_recall_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/test_config.json b/examples/evaluation/llm_rubric_knowledge_recall/agent/test_config.json new file mode 100644 index 0000000..c2f831d --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/test_config.json @@ -0,0 +1,32 @@ +{ + "metrics": [ + { + "metric_name": "llm_rubric_knowledge_recall", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}" + }, + "rubrics": [ + { + "id": "coverage", + "content": {"text": "检索到的内容须覆盖问题中的关键信息(如产品、功能)"}, + "description": "召回覆盖", + "type": "KNOWLEDGE_COVERAGE" + }, + { + "id": "relevance", + "content": {"text": "检索结果须与用户问题相关"}, + "description": "召回相关性", + "type": "KNOWLEDGE_RELEVANCE" + } + ], + "knowledge_tool_names": ["knowledge_search"] + } + } + } + ] +} diff --git a/examples/evaluation/llm_rubric_knowledge_recall/test_llm_rubric_knowledge_recall.py b/examples/evaluation/llm_rubric_knowledge_recall/test_llm_rubric_knowledge_recall.py new file mode 100644 index 0000000..92fde20 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/test_llm_rubric_knowledge_recall.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""llm_rubric_knowledge_recall 评估器示例测试""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio 
+async def test_llm_rubric_knowledge_recall_demo(): + """使用 llm_rubric_knowledge_recall 指标:裁判根据轨迹中的知识检索结果与 rubrics 判定召回质量。""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "llm_rubric_knowledge_recall.evalset.json" + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="llm_rubric_knowledge_recall_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/llm_rubric_response/.env b/examples/evaluation/llm_rubric_response/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/llm_rubric_response/README.md b/examples/evaluation/llm_rubric_response/README.md new file mode 100644 index 0000000..ad9304c --- /dev/null +++ b/examples/evaluation/llm_rubric_response/README.md @@ -0,0 +1,23 @@ +# llm_rubric_response 评估器示例 + +使用 **llm_rubric_response** 指标:裁判模型根据配置的 **rubrics**(评估细则)逐条判定 Agent 最终回答是否满足,单轮分数为各细则得分平均值。 + +## 目录结构 + +- `agent/`:Agent 模块、评测集 `llm_rubric_response.evalset.json`、`test_config.json`(含 `rubrics`) +- `test_llm_rubric_response.py`:pytest 入口 + +## 环境变量 + +- `TRPC_AGENT_API_KEY` 或 `API_KEY`(必填) +- `TRPC_AGENT_BASE_URL`(可选) +- `TRPC_AGENT_MODEL_NAME`(可选,默认 glm-4-flash) + +## 运行 + +```bash +cd examples/evaluation/llm_rubric_response +pytest test_llm_rubric_response.py -v --tb=short -s +``` + +`test_config.json` 中需配置 `criterion.llm_judge.judge_model` 与 `criterion.llm_judge.rubrics`,每条 rubric 的 `content.text` 会展示给裁判模型用于判定。 diff --git a/examples/evaluation/llm_rubric_response/agent/__init__.py b/examples/evaluation/llm_rubric_response/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com 
+from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/llm_rubric_response/agent/agent.py b/examples/evaluation/llm_rubric_response/agent/agent.py new file mode 100644 index 0000000..50c923a --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/agent.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Simple agent for llm_rubric_response evaluator demo.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def get_weather(city: str) -> Dict[str, Any]: + """查询指定城市当前天气。""" + weather_data = { + "北京": {"temperature": 15, "condition": "晴"}, + "上海": {"temperature": 18, "condition": "多云"}, + "深圳": {"temperature": 25, "condition": "晴"}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未知"} + ) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + """Create the agent for llm_rubric_response demo.""" + return LlmAgent( + name="llm_rubric_response_agent", + description="简单问答助手,可查天气", + model=_create_model(), + instruction=( + "你是问答助手。用户问天气时用 get_weather 查询后回答。" + "回答须包含明确结论(如温度、天气状况),且与用户问题直接相关。" + ), + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/llm_rubric_response/agent/config.py b/examples/evaluation/llm_rubric_response/agent/config.py new file mode 100644 index 0000000..5f1c9c4 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment 
variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4-flash") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/llm_rubric_response/agent/llm_rubric_response.evalset.json b/examples/evaluation/llm_rubric_response/agent/llm_rubric_response.evalset.json new file mode 100644 index 0000000..8b4cacd --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/llm_rubric_response.evalset.json @@ -0,0 +1,31 @@ +{ + "eval_set_id": "llm_rubric_response", + "name": "LLM 细则响应评估示例", + "description": "使用 llm_rubric_response 评估器按 rubrics 判定最终回答质量", + "eval_cases": [ + { + "eval_id": "case_001", + "conversation": [ + { + "invocation_id": "inv-001", + "user_content": { + "parts": [{"text": "上海今天天气如何"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "上海当前多云,18°C。"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [{"id": "t1", "name": "get_weather", "args": {"city": "上海"}}] + } + } + ], + "session_input": { + "app_name": "llm_rubric_response_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/llm_rubric_response/agent/test_config.json b/examples/evaluation/llm_rubric_response/agent/test_config.json new file mode 100644 index 0000000..995a9e6 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/test_config.json @@ -0,0 +1,31 @@ +{ + "metrics": [ + { + "metric_name": "llm_rubric_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}" + }, + "rubrics": [ + { + "id": "conclusion", + "content": {"text": 
"回答须包含明确的结论或数值(如温度、天气状况)"}, + "description": "结论明确", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "relevance", + "content": {"text": "回答须与用户问题直接相关"}, + "description": "切题", + "type": "RELEVANCE" + } + ] + } + } + } + ] +} diff --git a/examples/evaluation/llm_rubric_response/test_llm_rubric_response.py b/examples/evaluation/llm_rubric_response/test_llm_rubric_response.py new file mode 100644 index 0000000..c8fd094 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/test_llm_rubric_response.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""llm_rubric_response 评估器示例测试""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_llm_rubric_response_demo(): + """使用 llm_rubric_response 指标评测 Agent:裁判按 rubrics 逐条判定最终回答。""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "llm_rubric_response.evalset.json" + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="llm_rubric_response_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/pass_at_k/.env b/examples/evaluation/pass_at_k/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/pass_at_k/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/pass_at_k/README.md b/examples/evaluation/pass_at_k/README.md new file mode 100644 index 0000000..71b2bb6 --- /dev/null +++ b/examples/evaluation/pass_at_k/README.md @@ -0,0 +1,22 @@ +# pass@k / pass^k 示例 + +多轮运行后解析 (n, c),计算 pass@1、pass@5、pass^2。目录与用法仿照 [quickstart](../quickstart/)。 + +## 目录结构 + +- `pass_at_k/`:示例根目录 +- `agent/`:内含 `agent.py`、`weather_agent.evalset.json`、`config.py`、`test_config.json`(其中 **num_runs: 5**) +- `test_pass_at_k.py`:使用 get_executer、evaluate、get_result、parse_pass_nc、pass_at_k、pass_hat_k + +## 
环境要求 + +Python 3.10+。环境变量同 quickstart(`TRPC_AGENT_API_KEY` 等)。 + +## 运行 + +```bash +cd examples/evaluation/pass_at_k +pytest test_pass_at_k.py -v --tb=short -s +``` + +终端会打印各评测集的 n、c 以及 pass@1、pass@5、pass^2。 diff --git a/examples/evaluation/pass_at_k/agent/__init__.py b/examples/evaluation/pass_at_k/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/pass_at_k/agent/agent.py b/examples/evaluation/pass_at_k/agent/agent.py new file mode 100644 index 0000000..7f4c6a7 --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/agent.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent for pass@k example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def get_weather(city: str) -> Dict[str, Any]: + """查询指定城市当前天气。""" + weather_data = { + "北京": {"temperature": 15, "condition": "晴"}, + "上海": {"temperature": 18, "condition": "多云"}, + "深圳": {"temperature": 25, "condition": "晴"}, + "杭州": {"temperature": 20, "condition": "小雨"}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未知"} + ) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + """Create the weather agent.""" + return LlmAgent( + name="weather_agent", + description="天气查询助手", + model=_create_model(), + instruction="你是天气助手,用 get_weather 查询城市天气并简要回答。", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git 
a/examples/evaluation/pass_at_k/agent/config.py b/examples/evaluation/pass_at_k/agent/config.py new file mode 100644 index 0000000..fc6bea3 --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/pass_at_k/agent/test_config.json b/examples/evaluation/pass_at_k/agent/test_config.json new file mode 100644 index 0000000..447464b --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/test_config.json @@ -0,0 +1,28 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ], + "num_runs": 5 +} diff --git a/examples/evaluation/pass_at_k/agent/weather_agent.evalset.json b/examples/evaluation/pass_at_k/agent/weather_agent.evalset.json new file mode 100644 index 0000000..80f84b5 --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/weather_agent.evalset.json @@ -0,0 +1,37 @@ +{ + "eval_set_id": "weather_agent_pass_at_k", + "name": "pass@k 示例评测集", + "description": "单用例多轮运行,用于计算 pass@k / pass^k", + 
"eval_cases": [ + { + "eval_id": "simple_weather_001", + "conversation": [ + { + "invocation_id": "e-quick-001", + "user_content": { + "parts": [{"text": "上海天气怎么样"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "18"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/pass_at_k/test_pass_at_k.py b/examples/evaluation/pass_at_k/test_pass_at_k.py new file mode 100644 index 0000000..d8987cd --- /dev/null +++ b/examples/evaluation/pass_at_k/test_pass_at_k.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""pass@k / pass^k 示例:多轮运行后计算 pass@1、pass@5、pass^2。""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_pass_at_k(): + """多轮运行后解析 (n, c),计算 pass@k 与 pass^k。""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "weather_agent.evalset.json") + + # test_config.json 中已配置 num_runs: 5,会跑 5 轮 + executer = AgentEvaluator.get_executer( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) + try: + await executer.evaluate() + finally: + result = executer.get_result() + if result is not None: + nc_by_set = AgentEvaluator.parse_pass_nc(result) + for eval_set_id, nc in nc_by_set.items(): + n, c = nc.n, nc.c + pass_1 = AgentEvaluator.pass_at_k(n, c, 1) + pass_5 = AgentEvaluator.pass_at_k(n, c, 5) + pass_hat_2 = AgentEvaluator.pass_hat_k(n, c, 2) + print( + f"EvalSet {eval_set_id}: n={n}, c={c}, " + f"pass@1={pass_1:.4f}, pass@5={pass_5:.4f}, pass^2={pass_hat_2:.4f}" + ) diff --git a/examples/evaluation/quickstart/.env b/examples/evaluation/quickstart/.env new file mode 100644 index 
0000000..d47b675 --- /dev/null +++ b/examples/evaluation/quickstart/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/quickstart/README.md b/examples/evaluation/quickstart/README.md new file mode 100644 index 0000000..6d709bd --- /dev/null +++ b/examples/evaluation/quickstart/README.md @@ -0,0 +1,27 @@ +# Quickstart 天气 Agent 示例 + +最小评测示例:天气查询 Agent,仅包含一个 evalset(单用例)。 + +## 目录结构 + +- `quickstart/`:示例根目录 +- `agent/`:内含 `agent.py`(`root_agent.name="weather_agent"`)、`weather_agent.evalset.json`、`config.py`、`test_config.json` 等 + +## 环境要求 + +Python 3.10+(建议 3.12) + +## 环境变量 + +在 `.env` 或环境中设置: + +- `TRPC_AGENT_API_KEY` 或 `API_KEY` +- `TRPC_AGENT_BASE_URL`(可选) +- `TRPC_AGENT_MODEL_NAME`(可选,默认 glm-4.7) + +## 运行评测 + +```bash +cd examples/evaluation/quickstart +pytest test_quickstart.py -v --tb=short -s +``` diff --git a/examples/evaluation/quickstart/agent/__init__.py b/examples/evaluation/quickstart/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/quickstart/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/quickstart/agent/agent.py b/examples/evaluation/quickstart/agent/agent.py new file mode 100644 index 0000000..39850fd --- /dev/null +++ b/examples/evaluation/quickstart/agent/agent.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent: current weather, forecast, AQI, UV index.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def 
get_weather(city: str) -> Dict[str, Any]: + """查询指定城市当前天气。""" + weather_data = { + "北京": {"temperature": 15, "condition": "晴", "humidity": 45, "wind_speed": 10}, + "上海": {"temperature": 18, "condition": "多云", "humidity": 60, "wind_speed": 15}, + "深圳": {"temperature": 25, "condition": "晴", "humidity": 70, "wind_speed": 8}, + "杭州": {"temperature": 20, "condition": "小雨", "humidity": 85, "wind_speed": 12}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未知", "humidity": 50, "wind_speed": 10} + ) + return {"city": city, **result} + + +def get_weather_forecast(city: str, days: int = 3) -> Dict[str, Any]: + """查询指定城市未来几日天气预报。""" + return { + "city": city, + "forecast": [{"date": "today", "temperature": "20°C", "condition": "晴"}] * days, + } + + +def get_air_quality(city: str) -> Dict[str, Any]: + """查询指定城市空气质量。""" + aqi_data = {"北京": 85, "上海": 72, "深圳": 65, "杭州": 90, "广州": 78} + aqi = aqi_data.get(city, 75) + level = "优" if aqi <= 50 else "良" if aqi <= 100 else "轻度污染" + return {"city": city, "aqi": aqi, "level": level} + + +def get_uv_index(city: str) -> Dict[str, Any]: + """查询指定城市紫外线指数。""" + uv_data = {"北京": 5, "上海": 6, "深圳": 8, "杭州": 4, "广州": 7} + uv = uv_data.get(city, 5) + suggestion = "注意防晒" if uv >= 6 else "适宜户外" + return {"city": city, "uv_index": uv, "suggestion": suggestion} + + +def create_agent() -> LlmAgent: + """Create the weather agent.""" + return LlmAgent( + name="weather_agent", + description="天气查询助手,可查当前天气、预报、空气质量、紫外线指数", + model=_create_model(), + instruction=( + "你是天气助手。用 get_weather 查当前天气,get_weather_forecast 查预报," + "get_air_quality 查空气质量,get_uv_index 查紫外线。" + "用户问多类信息时依次调用相应工具。" + ), + tools=[ + FunctionTool(get_weather), + FunctionTool(get_weather_forecast), + FunctionTool(get_air_quality), + FunctionTool(get_uv_index), + ], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/quickstart/agent/config.py b/examples/evaluation/quickstart/agent/config.py new file mode 100644 index 0000000..fc6bea3 --- 
/dev/null +++ b/examples/evaluation/quickstart/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/quickstart/agent/test_config.json b/examples/evaluation/quickstart/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/quickstart/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/quickstart/agent/weather_agent.evalset.json b/examples/evaluation/quickstart/agent/weather_agent.evalset.json new file mode 100644 index 0000000..d3a78a2 --- /dev/null +++ b/examples/evaluation/quickstart/agent/weather_agent.evalset.json @@ -0,0 +1,47 @@ +{ + "eval_set_id": "weather_agent_quickstart", + "name": "天气 Agent 单用例", + "description": "Quickstart 单轮天气查询评测", + "eval_cases": [ + { + "eval_id": "simple_weather_001", + "conversation": [ + { + "invocation_id": "e-quick-001", + "user_content": { + "parts": [ + { + "text": 
"上海天气怎么样" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "18°C" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": { + "city": "上海" + } + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/quickstart/test_quickstart.py b/examples/evaluation/quickstart/test_quickstart.py new file mode 100644 index 0000000..2fac302 --- /dev/null +++ b/examples/evaluation/quickstart/test_quickstart.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Quickstart 天气 Agent 评测测试""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_quickstart_with_eval_set(): + """使用单个 evalset 测试 quickstart 天气 Agent""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "weather_agent.evalset.json") + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/trace_mode/.env b/examples/evaluation/trace_mode/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/trace_mode/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY、TRPC_AGENT_BASE_URL、TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/trace_mode/README.md b/examples/evaluation/trace_mode/README.md new file mode 100644 index 0000000..3b1ce63 --- /dev/null +++ b/examples/evaluation/trace_mode/README.md @@ -0,0 +1,20 @@ +# Trace 模式示例 + +使用 **eval_mode: "trace"**:不调用 Agent 推理,用 evalset 中的 **actual_conversation** 作为「实际轨迹」参与打分,**conversation** 作为预期用于对比。适合回放已有对话、离线评估。 + +## 目录结构 + +- `trace_mode/`:示例根目录 +- `agent/`:内含 `agent.py`、`trace_example.evalset.json`(含 trace 用例)、`test_config.json`、`config.py` +- 
`test_trace_mode.py`:调用 `AgentEvaluator.evaluate`,仅执行打分阶段 + +## 环境要求 + +Python 3.10+。Trace 模式不跑模型推理,但框架仍会加载 agent 模块;若未配置 `TRPC_AGENT_API_KEY`,加载可能报错,可按需配置或仅用于查看结构。 + +## 运行 + +```bash +cd examples/evaluation/trace_mode +pytest test_trace_mode.py -v --tb=short -s +``` diff --git a/examples/evaluation/trace_mode/agent/__init__.py b/examples/evaluation/trace_mode/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/trace_mode/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/trace_mode/agent/agent.py b/examples/evaluation/trace_mode/agent/agent.py new file mode 100644 index 0000000..c9304fb --- /dev/null +++ b/examples/evaluation/trace_mode/agent/agent.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Minimal agent for trace mode example (in trace mode the agent is not run).""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def get_weather(city: str) -> Dict[str, Any]: + """查询指定城市当前天气。""" + return {"city": city, "temperature": 18, "condition": "多云"} + + +def create_agent() -> LlmAgent: + api_key, url, model_name = get_model_config() + return LlmAgent( + name="weather_agent", + description="天气助手", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=url), + instruction="用 get_weather 查天气并简要回答。", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/trace_mode/agent/config.py b/examples/evaluation/trace_mode/agent/config.py new file mode 100644 index 0000000..71081cc --- /dev/null +++ b/examples/evaluation/trace_mode/agent/config.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent 
def get_model_config() -> tuple[str, str, str]:
    """Read the model connection settings from environment variables.

    The API key is taken from ``TRPC_AGENT_API_KEY`` and, failing that, from
    ``API_KEY``. The base URL and model name come from
    ``TRPC_AGENT_BASE_URL`` and ``TRPC_AGENT_MODEL_NAME``; each falls back to
    a built-in default when the variable is unset *or empty*.

    Returns:
        A ``(api_key, base_url, model_name)`` tuple.

    Raises:
        ValueError: If neither API-key variable holds a non-empty value.
    """
    env = os.environ

    # Validate the only mandatory setting first so a misconfigured
    # environment fails before anything else is resolved.
    api_key = env.get("TRPC_AGENT_API_KEY") or env.get("API_KEY", "")
    if not api_key:
        raise ValueError(
            "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables"
        )

    # ``or`` (rather than ``env.get(name, default)``) so that a variable that
    # is exported but blank is treated like an unset one, matching how the
    # API key is resolved above; otherwise an empty URL or model name would
    # be handed to the model client.
    url = env.get("TRPC_AGENT_BASE_URL") or "http://v2.open.venus.woa.com/llmproxy"
    model_name = env.get("TRPC_AGENT_MODEL_NAME") or "glm-4.7"
    return api_key, url, model_name
@pytest.mark.asyncio
async def test_trace_mode():
    """Evaluate the trace-mode example eval set.

    The eval case in ``trace_example.evalset.json`` declares
    ``eval_mode: "trace"`` and ships a recorded ``actual_conversation``, which
    is what gets scored instead of a fresh agent run.
    """
    example_root = os.path.dirname(os.path.abspath(__file__))
    # The dataset sits in the ``agent`` package next to this test file.
    trace_dataset = os.path.join(example_root, "agent", "trace_example.evalset.json")

    await AgentEvaluator.evaluate(
        eval_dataset_file_path_or_dir=trace_dataset,
        agent_module="agent",
        agent_name="weather_agent",
        print_detailed_results=True,
    )
`agent.py`、`agent.evalset.json`、`config.py`、`prompts.py`、`tools.py`、`test_config.json` 等 + +## 环境要求 + +Python 3.10+(建议 3.12) + +## 环境变量 + +在 `.env` 中或通过 `export` 设置: + +- `TRPC_AGENT_API_KEY` 或 `API_KEY` +- `TRPC_AGENT_BASE_URL`(可选,有默认值) +- `TRPC_AGENT_MODEL_NAME`(可选,有默认值) + +## 运行示例 + +```bash +cd examples/evaluation/webui +python run_agent.py +``` + +## 运行评估测试 + +```bash +cd examples/evaluation/webui +pytest test_book_finder.py -v --tb=short -s +``` + +需已设置上述环境变量。 diff --git a/examples/evaluation/webui/agent/__init__.py b/examples/evaluation/webui/agent/__init__.py new file mode 100644 index 0000000..449d379 --- /dev/null +++ b/examples/evaluation/webui/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2026 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/webui/agent/agent.evalset.json b/examples/evaluation/webui/agent/agent.evalset.json new file mode 100644 index 0000000..31dc1d5 --- /dev/null +++ b/examples/evaluation/webui/agent/agent.evalset.json @@ -0,0 +1,101 @@ +{ + "eval_set_id": "agent", + "name": "书籍查找工作流评估", + "description": "测试书籍查找 Agent 在不同场景下的行为和响应质量", + "eval_cases": [ + { + "eval_id": "session_001_library_available", + "conversation": [ + { + "invocation_id": "e-book-001-001", + "user_content": { + "parts": [ + { + "text": "我想借阅《三体》" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "太好了!我找到了《三体》这本书。本地图书馆有3本可借,位于科技图书馆的科幻小说区 A-301。图书馆借阅是免费的,推荐您直接去借阅!" 
+ } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "tool-call-001", + "name": "search_local_library", + "args": { + "book_title": "三体" + } + } + ] + } + } + ], + "session_input": { + "app_name": "agent", + "user_id": "user", + "state": {} + } + }, + { + "eval_id": "session_002_only_online_available", + "conversation": [ + { + "invocation_id": "e-book-002-001", + "user_content": { + "parts": [ + { + "text": "我想找《人工智能简史》" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "关于《人工智能简史》的查找结果如下:\n\n1. **本地图书馆**\n - 很抱歉,本地图书馆目前没有这本书。\n\n2. **本地书店**\n - 很抱歉,本地书店也没有找到这本书。\n\n3. **在线购买**\n - 您可以选择以下平台购买:\n - 京东图书:[链接](https://book.jd.com)\n - 当当网:[链接](https://book.dangdang.com)\n - 亚马逊中国:[链接](https://www.amazon.cn)\n - 建议您先搜索比价,选择最优惠的平台购买。" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "tool-call-002-1", + "name": "search_local_library", + "args": { + "book_title": "人工智能简史" + } + }, + { + "id": "tool-call-002-2", + "name": "find_local_bookstore", + "args": { + "book_title": "人工智能简史" + } + }, + { + "id": "tool-call-002-3", + "name": "order_online", + "args": { + "book_title": "人工智能简史" + } + } + ] + } + } + ], + "session_input": { + "app_name": "agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/webui/agent/agent.py b/examples/evaluation/webui/agent/agent.py new file mode 100644 index 0000000..4b2b7bd --- /dev/null +++ b/examples/evaluation/webui/agent/agent.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Book finder agent: local library, bookstore, online retailers.""" + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool +from trpc_agent_sdk.types import GenerateContentConfig + +from .config import get_model_config +from .prompts import INSTRUCTION +from .tools import find_local_bookstore, order_online, 
def _create_model() -> OpenAIModel:
    """Build the OpenAI-compatible model client from the environment config."""
    key, base_url, name = get_model_config()
    return OpenAIModel(model_name=name, api_key=key, base_url=base_url)


def create_agent() -> LlmAgent:
    """Assemble the book finder agent with its three lookup tools."""
    # Built first, as in the call it replaces, so a missing API key surfaces
    # before any tool wrapper is constructed.
    model = _create_model()

    # Listed in the order the instruction prompt tells the agent to consult
    # them: library, then bookstore, then online retailers.
    lookup_functions = (search_local_library, find_local_bookstore, order_online)
    tools = [FunctionTool(func) for func in lookup_functions]

    generation_settings = GenerateContentConfig(
        temperature=0.3,
        top_p=0.9,
        max_output_tokens=1500,
    )

    # NOTE(review): the example README says the sub-directory name must equal
    # this agent name ("agent") -- keep them in sync.
    return LlmAgent(
        name="agent",
        description="专业的书籍查找助手,可查询本地图书馆、书店和在线零售商",
        model=model,
        instruction=INSTRUCTION,
        tools=tools,
        generate_content_config=generation_settings,
    )


# Module-level instance re-exported by the package ``__init__``.
root_agent = create_agent()
def _normalize_title(book_title):
    """Return ``book_title`` in the form used as a catalogue key.

    Surrounding whitespace and Chinese title marks (《》) are removed so that
    a title copied straight from a user message such as "我想借阅《三体》"
    still matches. Non-string input is returned unchanged so the lookup
    falls through to the "not found" record exactly as before.
    """
    if not isinstance(book_title, str):
        return book_title
    return book_title.strip().strip("《》").strip()


# NOTE(review): the Chinese docstrings of the three tools below are presumably
# surfaced to the model as tool descriptions by FunctionTool, so they are kept
# verbatim -- confirm before rewording them.


# Look up a title in the mock local-library catalogue.
#   book_title: title to search for; matched after _normalize_title().
#   returns: dict with "source", the title as passed in, and the catalogue
#            record ("available", "copies", "branch", "status" and, for held
#            titles, "location"); a "not found" record for unknown titles.
def search_local_library(book_title: str) -> Dict[str, Any]:
    """查询本地图书馆的书籍可用性,包括副本数量、所在分馆和位置信息。

    参数:
        book_title: 书籍标题

    返回:
        包含书籍可用性信息的字典
    """
    library_data = {
        "Heartstopper: Volume 1": {
            "available": False,
            "copies": 0,
            "branch": "中央图书馆",
            "status": "所有副本都已借出",
        },
        "三体": {
            "available": True,
            "copies": 3,
            "branch": "科技图书馆",
            "status": "有 3 本可借",
            "location": "科幻小说区 A-301",
        },
        "活着": {
            "available": True,
            "copies": 2,
            "branch": "文学图书馆",
            "status": "有 2 本可借",
            "location": "现代文学区 B-205",
        },
        "Python编程:从入门到实践": {
            "available": True,
            "copies": 1,
            "branch": "技术图书馆",
            "status": "仅剩 1 本可借",
            "location": "编程语言区 C-102",
        },
    }
    not_found = {
        "available": False,
        "copies": 0,
        "branch": "未找到",
        "status": "本地图书馆没有此书",
    }

    result = library_data.get(_normalize_title(book_title), not_found)

    # Echo the title exactly as the caller passed it; only the lookup key is
    # normalized.
    return {
        "source": "本地图书馆",
        "book_title": book_title,
        **result,
    }


# Look up a title in the mock local-bookstore inventory.
#   book_title: title to search for; matched after _normalize_title().
#   returns: dict with "source", the title as passed in, and the store record
#            ("available", "price", "store", "address", "phone", "stock");
#            a "no stock" record for unknown titles.
def find_local_bookstore(book_title: str) -> Dict[str, Any]:
    """查找本地书店的书籍库存信息,包括价格、地址和联系方式。

    参数:
        book_title: 书籍标题

    返回:
        包含书店库存信息的字典
    """
    bookstore_data = {
        "Heartstopper: Volume 1": {
            "available": True,
            "price": "49.80元",
            "store": "新华书店",
            "address": "市中心步行街128号",
            "phone": "0755-12345678",
            "stock": 5,
        },
        "三体": {
            "available": True,
            "price": "23.00元",
            "store": "当当实体书店",
            "address": "科技园区创业路56号",
            "phone": "0755-87654321",
            "stock": 10,
        },
        "解忧杂货店": {
            "available": True,
            "price": "39.50元",
            "store": "方所书店",
            "address": "购物中心3楼",
            "phone": "0755-11223344",
            "stock": 3,
        },
    }
    not_found = {
        "available": False,
        "store": "未找到",
        "status": "本地书店暂无库存",
    }

    result = bookstore_data.get(_normalize_title(book_title), not_found)

    return {
        "source": "本地书店",
        "book_title": book_title,
        **result,
    }


# Return the generic list of online retailers for any title.
#   book_title: title being searched; only echoed back, never looked up.
#   returns: dict with "source", the title as passed in, "available",
#            "platforms" (name / price / url / delivery per retailer) and a
#            "recommendation" string.
def order_online(book_title: str) -> Dict[str, Any]:
    """查找在线购买书籍的选项,提供多个电商平台的信息和配送时间。

    参数:
        book_title: 书籍标题

    返回:
        包含在线购买信息的字典
    """
    # The same offer applies to every title, so it is built directly instead
    # of being stored under a single "default" key and read back. It is
    # rebuilt on each call, so callers may mutate the returned lists safely.
    result = {
        "available": True,
        "platforms": [
            {
                "name": "京东图书",
                "price": "根据书籍而定",
                "url": "https://book.jd.com",
                "delivery": "次日达(会员)",
            },
            {
                "name": "当当网",
                "price": "根据书籍而定",
                "url": "https://book.dangdang.com",
                "delivery": "2-3个工作日",
            },
            {
                "name": "亚马逊中国",
                "price": "根据书籍而定",
                "url": "https://www.amazon.cn",
                "delivery": "2-5个工作日",
            },
        ],
        "recommendation": "建议先搜索比价,选择最优惠的平台购买",
    }

    return {
        "source": "在线零售商",
        "book_title": book_title,
        **result,
    }
@pytest.mark.asyncio
async def test_webui_with_eval_set():
    """Run the WebUI book finder agent through its eval set."""
    # The eval set lives in the ``agent`` package next to this test file.
    example_root = os.path.dirname(os.path.abspath(__file__))
    eval_set_file = os.path.join(example_root, "agent", "agent.evalset.json")

    # Only the module is named here; no ``agent_name`` is passed.
    await AgentEvaluator.evaluate(
        eval_dataset_file_path_or_dir=eval_set_file,
        agent_module="agent",
        print_detailed_results=True,
    )