From 97423158d5aeed58663f22c5aa8fe3a3a99ea852 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 25 Feb 2026 09:39:10 -0500 Subject: [PATCH] Update and fix config --- .../aieng/agent_evals/configs.py | 4 + .../aieng/agent_evals/tools/search.py | 4 +- .../tests/aieng/agent_evals/test_configs.py | 231 ++++++++++++++++++ implementations/basics/01_why_evals.ipynb | 12 +- 4 files changed, 238 insertions(+), 13 deletions(-) create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/test_configs.py diff --git a/aieng-eval-agents/aieng/agent_evals/configs.py b/aieng-eval-agents/aieng/agent_evals/configs.py index 61f7542..ab25b4e 100644 --- a/aieng-eval-agents/aieng/agent_evals/configs.py +++ b/aieng-eval-agents/aieng/agent_evals/configs.py @@ -96,6 +96,10 @@ class Configs(BaseSettings): validation_alias=AliasChoices("OPENAI_API_KEY", "GEMINI_API_KEY", "GOOGLE_API_KEY"), description="API key for OpenAI-compatible API (accepts OPENAI_API_KEY, GEMINI_API_KEY, or GOOGLE_API_KEY).", ) + google_api_key: SecretStr = Field( + validation_alias=AliasChoices("GEMINI_API_KEY", "GOOGLE_API_KEY"), + description="API key for Google/Gemini API (accepts GEMINI_API_KEY or GOOGLE_API_KEY).", + ) default_planner_model: str = Field( default="gemini-2.5-pro", description="Model name for planning/complex reasoning tasks.", diff --git a/aieng-eval-agents/aieng/agent_evals/tools/search.py b/aieng-eval-agents/aieng/agent_evals/tools/search.py index 686568e..f97add0 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/search.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/search.py @@ -303,7 +303,7 @@ async def google_search(query: str, model: str | None = None) -> dict[str, Any]: model = config.default_worker_model return await _google_search_async( - query, model=model, temperature=config.default_temperature, api_key=config.openai_api_key.get_secret_value() + query, model=model, temperature=config.default_temperature, api_key=config.google_api_key.get_secret_value() ) @@ -369,7 +369,7 @@ async def google_search(query: str) -> dict[str, Any]: - **error** (str): Error message (error case only) """ return await _google_search_async( - query, model=model, temperature=temperature, api_key=config.openai_api_key.get_secret_value() + query, model=model, temperature=temperature, api_key=config.google_api_key.get_secret_value() ) return FunctionTool(func=google_search) diff --git a/aieng-eval-agents/tests/aieng/agent_evals/test_configs.py b/aieng-eval-agents/tests/aieng/agent_evals/test_configs.py new file mode 100644 index 0000000..cbef1d7 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/test_configs.py @@ -0,0 +1,231 @@ +"""Tests for Configs and DatabaseConfig configuration models.""" + +import os + +import pytest +from aieng.agent_evals.configs import Configs, DatabaseConfig +from pydantic import SecretStr, ValidationError + + +def make_configs() -> Configs: + """Create Configs without loading any .env file. + + Wraps ``Configs(_env_file=None)`` to avoid a Pyright false-positive: + pydantic-settings accepts ``_env_file`` as a special init override but it + is absent from the generated type stubs. + """ + return Configs(_env_file=None) # type: ignore[call-arg] + + +class TestDatabaseConfig: + """Tests for DatabaseConfig and its build_uri() method.""" + + def test_build_uri_sqlite(self): + """SQLite URI with only driver and database is valid.""" + config = DatabaseConfig(driver="sqlite", database="/tmp/test.db") + assert config.build_uri() == "sqlite:////tmp/test.db" + + def test_build_uri_postgresql_with_credentials(self): + """PostgreSQL URI includes host, port, username, and password.""" + config = DatabaseConfig( + driver="postgresql", + username="user", + password=SecretStr("secret"), + host="localhost", + port=5432, + database="mydb", + ) + assert config.build_uri() == "postgresql://user:secret@localhost:5432/mydb" + + def test_build_uri_includes_query_params(self): + """Query parameters appear in the rendered URI.""" + config = DatabaseConfig(driver="sqlite", database="/tmp/test.db", query={"mode": "ro"}) + assert "mode=ro" in config.build_uri() + + def test_build_uri_escapes_special_password_chars(self): + """Special characters in the password are URL-encoded, not exposed verbatim.""" + config = DatabaseConfig( + driver="postgresql", + username="user", + password=SecretStr("p@ss/word"), + host="localhost", + port=5432, + database="db", + ) + uri = config.build_uri() + assert "p@ss/word" not in uri # must be percent-encoded + assert "user" in uri + + def test_optional_fields_default_to_none(self): + """username, host, password, port, and database all default to None.""" + config = DatabaseConfig(driver="sqlite") + assert config.username is None + assert config.host is None + assert config.password is None + assert config.port is None + assert config.database is None + + def test_query_defaults_to_empty_dict(self): + """Query field defaults to an empty dict.""" + assert DatabaseConfig(driver="sqlite").query == {} + + +class TestConfigsDefaults: + """Tests for default field values in Configs.""" + + @pytest.fixture(autouse=True) + def _required_env(self, monkeypatch): + """Run with a fully isolated environment containing only required fields.""" + monkeypatch.setattr(os, "environ", {"OPENAI_API_KEY": "test-openai-key", "GEMINI_API_KEY": "test-google-key"}) + + def test_default_worker_model(self): + """default_worker_model is gemini-2.5-flash.""" + assert make_configs().default_worker_model == "gemini-2.5-flash" + + def test_default_planner_model(self): + """default_planner_model is gemini-2.5-pro.""" + assert make_configs().default_planner_model == "gemini-2.5-pro" + + def test_default_evaluator_model(self): + """default_evaluator_model is gemini-2.5-pro.""" + assert make_configs().default_evaluator_model == "gemini-2.5-pro" + + def test_default_temperature(self): + """default_temperature is 1.0.""" + assert make_configs().default_temperature == 1.0 + + def test_default_evaluator_temperature(self): + """default_evaluator_temperature is 0.0.""" + assert make_configs().default_evaluator_temperature == 0.0 + + def test_default_openai_base_url(self): + """openai_base_url defaults to the Gemini googleapis endpoint.""" + assert "googleapis.com" in make_configs().openai_base_url + + def test_optional_fields_default_none(self): + """All optional service fields default to None.""" + config = make_configs() + assert config.aml_db is None + assert config.report_generation_db is None + assert config.langfuse_public_key is None + assert config.langfuse_secret_key is None + assert config.e2b_api_key is None + + +class TestGoogleApiKey: + """Tests for the google_api_key field and its env var aliases.""" + + @pytest.fixture(autouse=True) + def _required_env(self, monkeypatch): + """Run with a clean environment: only OPENAI_API_KEY set, no Google keys.""" + monkeypatch.setattr(os, "environ", {"OPENAI_API_KEY": "test-openai-key"}) + + def test_loaded_from_gemini_api_key(self, monkeypatch): + """google_api_key is populated from GEMINI_API_KEY.""" + monkeypatch.setenv("GEMINI_API_KEY", "my-gemini-key") + assert make_configs().google_api_key.get_secret_value() == "my-gemini-key" + + def test_loaded_from_google_api_key(self, monkeypatch): + """google_api_key is populated from GOOGLE_API_KEY.""" + monkeypatch.setenv("GOOGLE_API_KEY", "my-google-key") + assert make_configs().google_api_key.get_secret_value() == "my-google-key" + + def test_gemini_api_key_takes_priority_over_google_api_key(self, monkeypatch): + """GEMINI_API_KEY takes priority over GOOGLE_API_KEY when both are set.""" + monkeypatch.setenv("GEMINI_API_KEY", "gemini-key") + monkeypatch.setenv("GOOGLE_API_KEY", "google-key") + config = make_configs() + assert config.google_api_key.get_secret_value() == "gemini-key" + + def test_secret_value_not_exposed_in_repr(self, monkeypatch): + """SecretStr does not leak the raw key in repr or str.""" + monkeypatch.setenv("GEMINI_API_KEY", "super-secret-key") + key = make_configs().google_api_key + assert "super-secret-key" not in repr(key) + assert "super-secret-key" not in str(key) + + +class TestOpenAiApiKeyAliases: + """Tests for openai_api_key env var aliases.""" + + @pytest.fixture(autouse=True) + def _clear_google_env(self, monkeypatch): + monkeypatch.setattr(os, "environ", {}) + + def test_loaded_from_openai_api_key(self, monkeypatch): + """openai_api_key is loaded from OPENAI_API_KEY when it is set.""" + monkeypatch.setenv("OPENAI_API_KEY", "my-openai-key") + monkeypatch.setenv("GEMINI_API_KEY", "test-google-key") + config = make_configs() + assert config.openai_api_key.get_secret_value() == "my-openai-key" + + def test_loaded_from_gemini_api_key(self, monkeypatch): + """openai_api_key falls back to GEMINI_API_KEY when OPENAI_API_KEY is absent.""" + monkeypatch.setenv("GEMINI_API_KEY", "my-gemini-key") + config = make_configs() + assert config.openai_api_key.get_secret_value() == "my-gemini-key" + + def test_loaded_from_google_api_key(self, monkeypatch): + """openai_api_key falls back to GOOGLE_API_KEY as the last alias.""" + monkeypatch.setenv("GOOGLE_API_KEY", "my-google-key") + config = make_configs() + assert config.openai_api_key.get_secret_value() == "my-google-key" + + def test_missing_raises_validation_error(self): + """Configs raises ValidationError when no API key env var is set.""" + with pytest.raises(ValidationError): + make_configs() + + +class TestConfigsValidators: + """Tests for Configs field validators.""" + + @pytest.fixture(autouse=True) + def _required_env(self, monkeypatch): + monkeypatch.setattr(os, "environ", {"OPENAI_API_KEY": "test-openai-key", "GEMINI_API_KEY": "test-google-key"}) + + def test_langfuse_secret_key_valid(self, monkeypatch): + """langfuse_secret_key accepts values starting with 'sk-lf-'.""" + monkeypatch.setenv("LANGFUSE_SECRET_KEY", "sk-lf-valid-secret") + config = make_configs() + assert config.langfuse_secret_key is not None + assert config.langfuse_secret_key.get_secret_value() == "sk-lf-valid-secret" + + def test_langfuse_secret_key_invalid_prefix_raises(self, monkeypatch): + """langfuse_secret_key rejects values not starting with 'sk-lf-'.""" + monkeypatch.setenv("LANGFUSE_SECRET_KEY", "invalid-secret") + with pytest.raises(ValidationError, match="sk-lf-"): + make_configs() + + def test_langfuse_secret_key_none_is_allowed(self): + """langfuse_secret_key accepts None (key not configured).""" + assert make_configs().langfuse_secret_key is None + + def test_e2b_api_key_valid(self, monkeypatch): + """e2b_api_key accepts values starting with 'e2b_'.""" + monkeypatch.setenv("E2B_API_KEY", "e2b_valid_key") + config = make_configs() + assert config.e2b_api_key is not None + assert config.e2b_api_key.get_secret_value() == "e2b_valid_key" + + def test_e2b_api_key_invalid_prefix_raises(self, monkeypatch): + """e2b_api_key rejects values not starting with 'e2b_'.""" + monkeypatch.setenv("E2B_API_KEY", "invalid_key") + with pytest.raises(ValidationError, match="e2b_"): + make_configs() + + def test_e2b_api_key_none_is_allowed(self): + """e2b_api_key accepts None (key not configured).""" + assert make_configs().e2b_api_key is None + + def test_langfuse_public_key_valid_pattern(self, monkeypatch): + """langfuse_public_key accepts values matching 'pk-lf-*'.""" + monkeypatch.setenv("LANGFUSE_PUBLIC_KEY", "pk-lf-abc123") + config = make_configs() + assert config.langfuse_public_key == "pk-lf-abc123" + + def test_langfuse_public_key_invalid_pattern_raises(self, monkeypatch): + """langfuse_public_key rejects values not matching 'pk-lf-*'.""" + monkeypatch.setenv("LANGFUSE_PUBLIC_KEY", "invalid-key") + with pytest.raises(ValidationError): + make_configs() diff --git a/implementations/basics/01_why_evals.ipynb b/implementations/basics/01_why_evals.ipynb index 4544954..ccce621 100644 --- a/implementations/basics/01_why_evals.ipynb +++ b/implementations/basics/01_why_evals.ipynb @@ -122,17 +122,7 @@ "cell_type": "markdown", "id": "summary", "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "1. **Compounding errors** — 10-step agents need task-level evaluation, not just per-answer accuracy\n", - "2. **Four dimensions** — outcome, tool quality, reasoning, cost-performance\n", - "3. **Three grader types** — code-based (fast, deterministic), model-based (flexible, nuanced), human (gold-standard, for calibration)\n", - "4. **Capability benchmarks** — hard problems as hills to climb; progress is measured, not estimated\n", - "5. **Closed loop** — failure patterns map to prompt, tool, behavior, and reasoning changes\n", - "\n", - "**Next:** In Notebook 02, we'll use the shared evaluation harness to run these ideas in code." - ] + "source": "## Summary\n\n1. **Compounding errors** — 10-step agents need task-level evaluation, not just per-answer accuracy\n2. **Four dimensions** — outcome, tool quality, reasoning, cost-performance\n3. **Three grader types** — code-based (fast, deterministic), model-based (flexible, nuanced), human (gold-standard, for calibration)\n4. **Capability benchmarks** — hard problems as hills to climb; progress is measured, not estimated\n5. **Closed loop** — failure patterns map to prompt, tool, behavior, and reasoning changes\n\n**Next:** In Notebook 02, we'll use the shared evaluation harness to run these ideas in code.\n\n---\n\n**Further Reading:** [Evaluating Agents — Google ADK Documentation](https://google.github.io/adk-docs/evaluate/) — official guide covering built-in evaluation tools for ADK-based agents, including test-file formats, metrics, and running evaluations with the CLI." } ], "metadata": {