From 67ba4b03824933228c4a3229e526976aad10b4ae Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 05:49:56 -0400 Subject: [PATCH 1/4] {"schema":"decodex/commit/1","summary":"Modularize remaining large smoke runners","authority":"manual"} --- scripts/graphiti-zep-docker-temporal-smoke.py | 1297 +---------------- scripts/graphiti_temporal_smoke/__init__.py | 1 + scripts/graphiti_temporal_smoke/benchmark.py | 107 ++ scripts/graphiti_temporal_smoke/common.py | 137 ++ scripts/graphiti_temporal_smoke/context.py | 65 + scripts/graphiti_temporal_smoke/corpus.py | 47 + scripts/graphiti_temporal_smoke/fixture.py | 224 +++ scripts/graphiti_temporal_smoke/manifest.py | 175 +++ scripts/graphiti_temporal_smoke/mapping.py | 81 + .../materialization.py | 99 ++ scripts/graphiti_temporal_smoke/models.py | 34 + scripts/graphiti_temporal_smoke/runner.py | 151 ++ scripts/graphiti_temporal_smoke/runtime.py | 231 +++ scripts/graphiti_temporal_smoke/summary.py | 35 + ...etta-core-archive-export-readback-smoke.py | 1054 +------------- scripts/letta_core_archive_smoke/__init__.py | 1 + scripts/letta_core_archive_smoke/artifacts.py | 280 ++++ scripts/letta_core_archive_smoke/benchmark.py | 99 ++ scripts/letta_core_archive_smoke/common.py | 127 ++ scripts/letta_core_archive_smoke/context.py | 52 + scripts/letta_core_archive_smoke/fixtures.py | 225 +++ scripts/letta_core_archive_smoke/models.py | 34 + scripts/letta_core_archive_smoke/runner.py | 105 ++ scripts/letta_core_archive_smoke/runtime.py | 188 +++ scripts/ragflow-docker-evidence-smoke.sh | 1084 +------------- scripts/ragflow_smoke/api.sh | 183 +++ scripts/ragflow_smoke/common.sh | 96 ++ scripts/ragflow_smoke/docker.sh | 135 ++ scripts/ragflow_smoke/fixture.sh | 157 ++ scripts/ragflow_smoke/manifest.sh | 169 +++ scripts/ragflow_smoke/materialization.sh | 245 ++++ scripts/ragflow_smoke/scoring.sh | 60 + scripts/ragflow_smoke/summary.sh | 49 + 33 files changed, 3602 insertions(+), 3425 deletions(-) create mode 100644 
scripts/graphiti_temporal_smoke/__init__.py create mode 100644 scripts/graphiti_temporal_smoke/benchmark.py create mode 100644 scripts/graphiti_temporal_smoke/common.py create mode 100644 scripts/graphiti_temporal_smoke/context.py create mode 100644 scripts/graphiti_temporal_smoke/corpus.py create mode 100644 scripts/graphiti_temporal_smoke/fixture.py create mode 100644 scripts/graphiti_temporal_smoke/manifest.py create mode 100644 scripts/graphiti_temporal_smoke/mapping.py create mode 100644 scripts/graphiti_temporal_smoke/materialization.py create mode 100644 scripts/graphiti_temporal_smoke/models.py create mode 100644 scripts/graphiti_temporal_smoke/runner.py create mode 100644 scripts/graphiti_temporal_smoke/runtime.py create mode 100644 scripts/graphiti_temporal_smoke/summary.py create mode 100644 scripts/letta_core_archive_smoke/__init__.py create mode 100644 scripts/letta_core_archive_smoke/artifacts.py create mode 100644 scripts/letta_core_archive_smoke/benchmark.py create mode 100644 scripts/letta_core_archive_smoke/common.py create mode 100644 scripts/letta_core_archive_smoke/context.py create mode 100644 scripts/letta_core_archive_smoke/fixtures.py create mode 100644 scripts/letta_core_archive_smoke/models.py create mode 100644 scripts/letta_core_archive_smoke/runner.py create mode 100644 scripts/letta_core_archive_smoke/runtime.py create mode 100644 scripts/ragflow_smoke/api.sh create mode 100644 scripts/ragflow_smoke/common.sh create mode 100644 scripts/ragflow_smoke/docker.sh create mode 100644 scripts/ragflow_smoke/fixture.sh create mode 100644 scripts/ragflow_smoke/manifest.sh create mode 100644 scripts/ragflow_smoke/materialization.sh create mode 100644 scripts/ragflow_smoke/scoring.sh create mode 100644 scripts/ragflow_smoke/summary.sh diff --git a/scripts/graphiti-zep-docker-temporal-smoke.py b/scripts/graphiti-zep-docker-temporal-smoke.py index ab86e731..906c50e5 100644 --- a/scripts/graphiti-zep-docker-temporal-smoke.py +++ 
b/scripts/graphiti-zep-docker-temporal-smoke.py @@ -3,1302 +3,7 @@ from __future__ import annotations -import json -import os -import shutil -import socket -import subprocess -import sys -import textwrap -import time -from dataclasses import dataclass -from datetime import datetime, timezone -from pathlib import Path -from typing import Any - - -SCRIPT_DIR = Path(__file__).resolve().parent -ROOT_DIR = SCRIPT_DIR.parent -REPORT_DIR = Path( - os.environ.get( - "ELF_GRAPHITI_ZEP_SMOKE_REPORT_DIR", - ROOT_DIR / "tmp" / "real-world-memory" / "graphiti-zep-smoke", - ) -) -WORK_DIR = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR", REPORT_DIR / "work")) -OUT = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_OUT", REPORT_DIR / "graphiti-zep-smoke.json")) -MANIFEST_OUT = Path( - os.environ.get( - "ELF_GRAPHITI_ZEP_SMOKE_MANIFEST_OUT", - REPORT_DIR / "memory_projects_manifest.graphiti-zep-smoke.json", - ) -) -SUMMARY_OUT = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) -REPORT_JSON = Path( - os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_REPORT_JSON", REPORT_DIR / "graphiti-zep-report.json") -) -REPORT_MD = Path( - os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_REPORT_MD", REPORT_DIR / "graphiti-zep-report.md") -) -FIXTURE_DIR = REPORT_DIR / "graphiti-zep-fixtures" -LOG_DIR = REPORT_DIR / "logs" - -RUN_ID = os.environ.get( - "ELF_GRAPHITI_ZEP_SMOKE_RUN_ID", - f"graphiti-zep-docker-smoke-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", -) -RUN_LIVE = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_RUN", "0") == "1" -ALLOW_HOST = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_ALLOW_HOST", "0") == "1" -INSTALL_GRAPHITI = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_INSTALL", "1") == "1" -GRAPHITI_VERSION = os.environ.get("ELF_GRAPHITI_ZEP_VERSION", "0.21.0") -GRAPHITI_PACKAGE = os.environ.get( - "ELF_GRAPHITI_ZEP_PACKAGE", - f"graphiti-core[falkordb]=={GRAPHITI_VERSION}", -) -GRAPHITI_REF = os.environ.get("ELF_GRAPHITI_ZEP_REF", f"pypi:{GRAPHITI_PACKAGE}") 
-FALKORDB_HOST = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_HOST", "graphiti-falkordb") -FALKORDB_PORT = int(os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PORT", "6379")) -FALKORDB_DATABASE = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_DATABASE", "elf_graphiti_zep_smoke") -FALKORDB_USERNAME = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_USERNAME", "") -FALKORDB_PASSWORD = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD", "") -API_KEY = os.environ.get( - "ELF_GRAPHITI_ZEP_API_KEY", - os.environ.get("GRAPHITI_OPENAI_API_KEY", os.environ.get("OPENAI_API_KEY", "")), -) -API_BASE = os.environ.get("ELF_GRAPHITI_ZEP_API_BASE", os.environ.get("OPENAI_BASE_URL", "")) -LLM_MODEL = os.environ.get("ELF_GRAPHITI_ZEP_LLM_MODEL", "gpt-4o-mini") -EMBEDDING_MODEL = os.environ.get("ELF_GRAPHITI_ZEP_EMBEDDING_MODEL", "text-embedding-3-small") -TIMEOUT_SECONDS = int(os.environ.get("ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS", "900")) -STARTUP_ATTEMPTS = int(os.environ.get("ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS", "30")) -STARTUP_INTERVAL_SECONDS = float(os.environ.get("ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS", "2")) - - -@dataclass -class StatusState: - """Typed status for generated Graphiti/Zep smoke artifacts.""" - - setup: str = "blocked" - run: str = "not_encoded" - result: str = "blocked" - overall: str = "blocked" - evidence_class: str = "research_gate" - failure_class: str = "graphiti_zep_live_run_disabled" - failure_reason: str = ( - "Graphiti/Zep temporal graph live run is opt-in; set " - "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 and provide explicit " - "provider configuration to attempt the Docker-local FalkorDB smoke." 
- ) - - -@dataclass -class CommandRecord: - """Captured command result without secret-bearing environment values.""" - - label: str - command: list[str] - status: str - elapsed_ms: float - stdout_artifact: str | None - stderr_artifact: str | None - returncode: int | None - reason: str - - -def utc_now() -> str: - """Return an RFC3339 UTC timestamp.""" - - return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") - - -def rel(path: Path) -> str: - """Return a repository-relative path when possible.""" - - try: - return str(path.resolve().relative_to(ROOT_DIR)) - except ValueError: - return str(path) - - -def mkdirs() -> None: - """Create output directories.""" - - for path in (REPORT_DIR, WORK_DIR, FIXTURE_DIR, LOG_DIR): - path.mkdir(parents=True, exist_ok=True) - - -def write_json(path: Path, payload: Any) -> None: - """Write stable, pretty JSON.""" - - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") - - -def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: - """Score the generated temporal smoke fixture through the real-world job runner.""" - - run_cmd = [ - "cargo", - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "run", - "--fixtures", - str(fixture_path), - "--out", - str(REPORT_JSON), - "--run-id", - "real-world-memory-live-graphiti-zep", - "--adapter-id", - "graphiti_zep_temporal_smoke", - "--adapter-name", - "Graphiti/Zep Docker temporal smoke adapter", - "--adapter-behavior", - "docker_python_falkordb_temporal_smoke", - "--adapter-storage-status", - status.setup, - "--adapter-runtime-status", - status.overall, - "--adapter-notes", - "Generated by the Graphiti/Zep Docker temporal smoke; pass or wrong_result requires current and historical validity-window facts mapped to generated evidence ids, while provider/setup limits remain typed.", - 
"--external-adapter-manifest", - str(manifest_path), - ] - publish_cmd = [ - "cargo", - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "publish", - "--report", - str(REPORT_JSON), - "--out", - str(REPORT_MD), - ] - - subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) - subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) - - report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) - - return { - "json": rel(REPORT_JSON), - "markdown": rel(REPORT_MD), - "summary": report.get("summary", {}), - "suites": report.get("suites", []), - } - - -def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: - """Extract the post-score benchmark status from a real_world_job report.""" - - if report is None: - return { - "schema": "elf.scored_benchmark_status/v1", - "source": "real_world_job_benchmark", - "status": "pending", - "reason": "The smoke materialization was written before benchmark scoring completed.", - } - - summary = report.get("summary", {}) - counts = { - status: int(summary.get(status, 0) or 0) - for status in ( - "pass", - "wrong_result", - "lifecycle_fail", - "incomplete", - "blocked", - "not_encoded", - ) - } - status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") - - return { - "schema": "elf.scored_benchmark_status/v1", - "source": "real_world_job_benchmark", - "status": status, - "counts": counts, - "job_count": int(summary.get("job_count", 0) or 0), - "mean_score": summary.get("mean_score"), - "evidence_coverage": summary.get("evidence_coverage"), - } - - -def command_available(command: str) -> bool: - """Return whether a command is on PATH.""" - - return shutil.which(command) is not None - - -def dir_size(path: Path) -> int: - """Return total file size for a directory or file.""" - - if not path.exists(): - return 0 - if path.is_file(): - return path.stat().st_size - - return sum(item.stat().st_size for item in path.rglob("*") if item.is_file()) - - -def 
file_count(path: Path) -> int: - """Return file count for a directory.""" - - if not path.exists(): - return 0 - - return sum(1 for item in path.rglob("*") if item.is_file()) - - -def temporal_facts() -> list[dict[str, Any]]: - """Return the generated-public temporal fact corpus.""" - - return [ - { - "evidence_id": "graphiti-zep-old-owner", - "claim_id": "relation_historical_owner", - "source": "Team Delta", - "edge_name": "OWNED_REVIEW", - "target": "deployment method review", - "fact": "Team Delta owned deployment method review before 2026-06-06.", - "valid_at": "2026-06-05T00:00:00Z", - "invalid_at": "2026-06-08T00:00:00Z", - "created_at": "2026-06-05T00:00:00Z", - "current": False, - }, - { - "evidence_id": "graphiti-zep-current-owner", - "claim_id": "relation_current_owner", - "source": "Team Echo", - "edge_name": "OWNS_REVIEW", - "target": "deployment method review", - "fact": "Team Echo owns deployment method review since 2026-06-08.", - "valid_at": "2026-06-08T00:00:00Z", - "invalid_at": None, - "created_at": "2026-06-08T00:00:00Z", - "current": True, - }, - { - "evidence_id": "graphiti-zep-owner-rationale", - "claim_id": "relation_owner_update_rationale", - "source": "single-user production runbook scope", - "edge_name": "MOVED_OWNERSHIP_TO", - "target": "Team Echo", - "fact": "Ownership moved to Team Echo after single-user production runbook scope changed.", - "valid_at": "2026-06-08T00:05:00Z", - "invalid_at": None, - "created_at": "2026-06-08T00:05:00Z", - "current": True, - }, - ] - - -def command_to_json(record: CommandRecord) -> dict[str, Any]: - """Serialize a command record.""" - - return { - "label": record.label, - "status": record.status, - "command": record.command, - "elapsed_ms": round(record.elapsed_ms, 3), - "stdout_artifact": record.stdout_artifact, - "stderr_artifact": record.stderr_artifact, - "returncode": record.returncode, - "reason": record.reason, - } - - -def run_command( - label: str, - command: list[str], - cwd: Path, - timeout: 
int = TIMEOUT_SECONDS, - extra_env: dict[str, str] | None = None, -) -> CommandRecord: - """Run a subprocess and capture stdout/stderr artifacts.""" - - cwd.mkdir(parents=True, exist_ok=True) - stdout_path = LOG_DIR / f"{label}.stdout.log" - stderr_path = LOG_DIR / f"{label}.stderr.log" - env = os.environ.copy() - - if extra_env: - env.update(extra_env) - - started = time.monotonic() - try: - proc = subprocess.run( - command, - cwd=cwd, - env=env, - text=True, - capture_output=True, - timeout=timeout, - check=False, - ) - elapsed_ms = (time.monotonic() - started) * 1000 - stdout_path.write_text(proc.stdout, encoding="utf-8") - stderr_path.write_text(proc.stderr, encoding="utf-8") - status = "pass" if proc.returncode == 0 else "incomplete" - reason = "Command completed." if proc.returncode == 0 else f"Command exited {proc.returncode}." - - return CommandRecord( - label=label, - command=command, - status=status, - elapsed_ms=elapsed_ms, - stdout_artifact=rel(stdout_path), - stderr_artifact=rel(stderr_path), - returncode=proc.returncode, - reason=reason, - ) - except subprocess.TimeoutExpired as err: - elapsed_ms = (time.monotonic() - started) * 1000 - stdout_path.write_text(err.stdout or "", encoding="utf-8") - stderr_path.write_text(err.stderr or "", encoding="utf-8") - - return CommandRecord( - label=label, - command=command, - status="incomplete", - elapsed_ms=elapsed_ms, - stdout_artifact=rel(stdout_path), - stderr_artifact=rel(stderr_path), - returncode=None, - reason=f"Command timed out after {timeout} seconds.", - ) - - -def wait_for_falkordb(command_records: list[CommandRecord]) -> bool: - """Poll the configured FalkorDB TCP endpoint.""" - - started = time.monotonic() - attempts: list[dict[str, Any]] = [] - - for attempt in range(1, STARTUP_ATTEMPTS + 1): - try: - with socket.create_connection((FALKORDB_HOST, FALKORDB_PORT), timeout=2): - elapsed_ms = (time.monotonic() - started) * 1000 - attempts.append({"attempt": attempt, "status": "pass", "elapsed_ms": 
round(elapsed_ms, 3)}) - path = LOG_DIR / "falkordb-startup-attempts.json" - write_json(path, attempts) - command_records.append( - CommandRecord( - label="falkordb-startup", - command=["tcp-connect", FALKORDB_HOST, str(FALKORDB_PORT)], - status="pass", - elapsed_ms=elapsed_ms, - stdout_artifact=rel(path), - stderr_artifact=None, - returncode=0, - reason="FalkorDB TCP endpoint accepted a connection.", - ) - ) - return True - except OSError as err: - attempts.append({"attempt": attempt, "status": "incomplete", "reason": str(err)}) - time.sleep(STARTUP_INTERVAL_SECONDS) - - elapsed_ms = (time.monotonic() - started) * 1000 - path = LOG_DIR / "falkordb-startup-attempts.json" - write_json(path, attempts) - command_records.append( - CommandRecord( - label="falkordb-startup", - command=["tcp-connect", FALKORDB_HOST, str(FALKORDB_PORT)], - status="incomplete", - elapsed_ms=elapsed_ms, - stdout_artifact=rel(path), - stderr_artifact=None, - returncode=None, - reason="FalkorDB TCP endpoint did not become reachable.", - ) - ) - return False - - -def init_graphiti(command_records: list[CommandRecord]) -> tuple[bool, Path]: - """Create a venv and install Graphiti with FalkorDB support.""" - - venv_dir = WORK_DIR / ".venv" - python = venv_dir / "bin" / "python" - - if INSTALL_GRAPHITI: - venv_record = run_command("python-venv", [sys.executable, "-m", "venv", str(venv_dir)], WORK_DIR) - command_records.append(venv_record) - if venv_record.status != "pass": - return False, python - - install_record = run_command( - "graphiti-install", - [str(python), "-m", "pip", "install", "--disable-pip-version-check", GRAPHITI_PACKAGE], - WORK_DIR, - ) - command_records.append(install_record) - if install_record.status != "pass": - return False, python - elif not python.exists(): - command_records.append( - CommandRecord( - label="graphiti-install", - command=["graphiti-core"], - status="incomplete", - elapsed_ms=0.0, - stdout_artifact=None, - stderr_artifact=None, - returncode=None, - 
reason="Graphiti install was disabled and no venv python exists.", - ) - ) - return False, python - - return True, python - - -def write_live_runner(path: Path) -> None: - """Write the isolated Graphiti execution script.""" - - payload = { - "run_id": RUN_ID, - "facts": temporal_facts(), - "query": "Who currently owns deployment method review, and who owned it historically?", - "falkordb": { - "host": FALKORDB_HOST, - "port": FALKORDB_PORT, - "database": FALKORDB_DATABASE, - }, - "models": { - "llm": LLM_MODEL, - "embedding": EMBEDDING_MODEL, - "api_base": API_BASE, - }, - } - input_path = WORK_DIR / "graphiti-live-input.json" - output_path = WORK_DIR / "graphiti-live-output.json" - write_json(input_path, payload) - script = f""" -import asyncio -import json -import os -import uuid -from datetime import datetime -from pathlib import Path - -from graphiti_core import Graphiti -from graphiti_core.driver.falkordb_driver import FalkorDriver -from graphiti_core.edges import EntityEdge -from graphiti_core.nodes import EntityNode - - -INPUT = Path({str(input_path)!r}) -OUTPUT = Path({str(output_path)!r}) - - -def parse_dt(value): - if value is None: - return None - return datetime.fromisoformat(value.replace("Z", "+00:00")) - - -async def main(): - data = json.loads(INPUT.read_text(encoding="utf-8")) - config = data["falkordb"] - driver = FalkorDriver( - host=config["host"], - port=config["port"], - username=os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_USERNAME") or None, - password=os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD") or None, - database=config.get("database") or "default_db", - ) - graphiti = Graphiti(graph_driver=driver) - try: - await graphiti.build_indices_and_constraints() - inserted = [] - for fact in data["facts"]: - group_id = data["run_id"] - source_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":source:" + fact["source"])) - target_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":target:" + fact["target"])) - edge_uuid = 
str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":edge:" + fact["evidence_id"])) - source_node = EntityNode(uuid=source_uuid, name=fact["source"], group_id=group_id) - target_node = EntityNode(uuid=target_uuid, name=fact["target"], group_id=group_id) - edge = EntityEdge( - uuid=edge_uuid, - group_id=group_id, - source_node_uuid=source_uuid, - target_node_uuid=target_uuid, - created_at=parse_dt(fact["created_at"]), - name=fact["edge_name"], - fact=fact["fact"], - valid_at=parse_dt(fact["valid_at"]), - invalid_at=parse_dt(fact.get("invalid_at")), - ) - await graphiti.add_triplet(source_node, edge, target_node) - inserted.append({{"evidence_id": fact["evidence_id"], "uuid": edge_uuid}}) - - results = await graphiti.search(data["query"]) - serialized = [] - for edge in results: - serialized.append({{ - "uuid": getattr(edge, "uuid", None), - "name": getattr(edge, "name", None), - "fact": getattr(edge, "fact", None), - "valid_at": str(getattr(edge, "valid_at", "")) if getattr(edge, "valid_at", None) else None, - "invalid_at": str(getattr(edge, "invalid_at", "")) if getattr(edge, "invalid_at", None) else None, - "source_node_uuid": getattr(edge, "source_node_uuid", None), - "target_node_uuid": getattr(edge, "target_node_uuid", None), - }}) - - OUTPUT.write_text(json.dumps({{"inserted": inserted, "results": serialized}}, indent=2, sort_keys=True) + "\\n", encoding="utf-8") - finally: - await graphiti.close() - - -asyncio.run(main()) -""" - path.write_text(textwrap.dedent(script).lstrip(), encoding="utf-8") - - -def run_graphiti(python: Path, command_records: list[CommandRecord]) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Run the Graphiti live worker and return inserted/search result facts.""" - - runner = WORK_DIR / "graphiti_live_runner.py" - write_live_runner(runner) - env = { - "OPENAI_API_KEY": API_KEY, - "MODEL_NAME": LLM_MODEL, - "LLM_MODEL": LLM_MODEL, - "EMBEDDING_MODEL": EMBEDDING_MODEL, - } - - if API_BASE: - env["OPENAI_BASE_URL"] = API_BASE - if 
FALKORDB_USERNAME: - env["ELF_GRAPHITI_ZEP_FALKORDB_USERNAME"] = FALKORDB_USERNAME - if FALKORDB_PASSWORD: - env["ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD"] = FALKORDB_PASSWORD - - record = run_command("graphiti-live-run", [str(python), str(runner)], WORK_DIR, extra_env=env) - command_records.append(record) - - output_path = WORK_DIR / "graphiti-live-output.json" - if record.status != "pass" or not output_path.exists(): - return [], [] - - payload = json.loads(output_path.read_text(encoding="utf-8")) - return payload.get("inserted", []), payload.get("results", []) - - -def map_observed_facts(results: list[dict[str, Any]], facts: list[dict[str, Any]]) -> dict[str, Any]: - """Map Graphiti search results back to expected evidence ids.""" - - expected_by_id = {fact["evidence_id"]: fact for fact in facts} - mappings: list[dict[str, Any]] = [] - mapped_ids: list[str] = [] - - for fact in facts: - matched = [ - result - for result in results - if isinstance(result.get("fact"), str) and fact["fact"].lower() in result["fact"].lower() - ] - if matched: - result = matched[0] - mapped_ids.append(fact["evidence_id"]) - mappings.append( - { - "evidence_id": fact["evidence_id"], - "claim_id": fact["claim_id"], - "status": "pass", - "uuid": result.get("uuid"), - "fact": result.get("fact"), - "valid_at": result.get("valid_at"), - "invalid_at": result.get("invalid_at"), - "expected_valid_at": fact["valid_at"], - "expected_invalid_at": fact["invalid_at"], - "current": fact["current"], - } - ) - else: - mappings.append( - { - "evidence_id": fact["evidence_id"], - "claim_id": fact["claim_id"], - "status": "blocked", - "expected_valid_at": fact["valid_at"], - "expected_invalid_at": fact["invalid_at"], - "current": fact["current"], - } - ) - - current_ok = any( - item["evidence_id"] == "graphiti-zep-current-owner" - and item["status"] == "pass" - and not item.get("invalid_at") - for item in mappings - ) - historical_ok = any( - item["evidence_id"] == "graphiti-zep-old-owner" - and 
item["status"] == "pass" - and item.get("invalid_at") - for item in mappings - ) - rationale_ok = "graphiti-zep-owner-rationale" in mapped_ids - required_ids = list(expected_by_id) - missing_ids = [evidence_id for evidence_id in required_ids if evidence_id not in mapped_ids] - - if current_ok and historical_ok and rationale_ok: - status = "pass" - reason = "Graphiti/Zep search results mapped current, historical, and rationale facts with validity windows." - else: - status = "wrong_result" - reason = ( - "Graphiti/Zep search results did not map all required temporal facts with expected validity " - f"windows; missing={', '.join(missing_ids) or 'none'}." - ) - - return { - "status": status, - "reason": reason, - "expected_evidence_ids": required_ids, - "mapped_evidence_ids": mapped_ids, - "facts": mappings, - } - - -def write_fixture(facts: list[dict[str, Any]], status: StatusState, mapping: dict[str, Any]) -> Path: - """Write a generated memory_evolution fixture for the smoke.""" - - fixture_path = FIXTURE_DIR / "memory_evolution" / "graphiti_zep_temporal_validity.json" - mapped_ids = mapping.get("mapped_evidence_ids", []) - claims = [] - - if status.result == "pass": - claims = [ - { - "claim_id": "relation_current_owner", - "text": "Team Echo currently owns deployment method review.", - "evidence_ids": [ - "graphiti-zep-current-owner", - "graphiti-zep-old-owner", - "graphiti-zep-owner-rationale", - ], - "confidence": "derived_from_graphiti_temporal_search", - }, - { - "claim_id": "relation_historical_owner", - "text": "Team Delta owned deployment method review historically.", - "evidence_ids": ["graphiti-zep-old-owner"], - "confidence": "derived_from_graphiti_temporal_search", - }, - { - "claim_id": "relation_owner_update_rationale", - "text": "Ownership moved after single-user production runbook scope changed.", - "evidence_ids": ["graphiti-zep-owner-rationale"], - "confidence": "derived_from_graphiti_temporal_search", - }, - ] - - fixture: dict[str, Any] = { - 
"schema": "elf.real_world_job/v1", - "job_id": "graphiti-zep-temporal-validity-001", - "suite": "memory_evolution", - "title": "Map Graphiti/Zep temporal validity windows to current and historical relation facts", - "corpus": { - "corpus_id": "graphiti-zep-generated-public-smoke", - "profile": "generated_public", - "items": [ - { - "evidence_id": fact["evidence_id"], - "kind": "temporal_fact", - "text": fact["fact"], - "source_ref": { - "schema": "source_ref/v1", - "resolver": "graphiti_zep_smoke/v1", - "ref": { - "run_id": RUN_ID, - "evidence_id": fact["evidence_id"], - "valid_at": fact["valid_at"], - "invalid_at": fact["invalid_at"], - }, - }, - "created_at": fact["created_at"], - } - for fact in facts - ], - "adapter_response": { - "adapter_id": "graphiti_zep_temporal_smoke", - "answer": { - "content": ( - "Team Echo currently owns deployment method review. Team Delta owned it " - "historically, and the move followed the single-user production runbook scope change." - if claims - else "" - ), - "claims": claims, - "evidence_ids": mapped_ids, - "latency_ms": 0.0, - "cost": { - "currency": "USD", - "amount": 0.0, - "input_tokens": 0, - "output_tokens": 0, - }, - }, - }, - }, - "timeline": [ - { - "event_id": "graphiti-zep-old-owner", - "ts": "2026-06-05T00:00:00Z", - "actor": "agent", - "action": "recorded_relation", - "evidence_ids": ["graphiti-zep-old-owner"], - "summary": "Team Delta was the historical owner.", - }, - { - "event_id": "graphiti-zep-current-owner", - "ts": "2026-06-08T00:00:00Z", - "actor": "agent", - "action": "updated_memory", - "evidence_ids": ["graphiti-zep-current-owner", "graphiti-zep-owner-rationale"], - "summary": "Team Echo became the current owner after the scope changed.", - }, - ], - "prompt": { - "role": "user", - "content": "Who currently owns deployment method review, and who owned it historically?", - "job_mode": "answer", - "constraints": ["cite_evidence", "distinguish_current_from_historical"], - }, - "expected_answer": { - 
"must_include": [ - { - "claim_id": "relation_current_owner", - "text": "Team Echo currently owns deployment method review.", - }, - { - "claim_id": "relation_historical_owner", - "text": "Team Delta owned deployment method review historically.", - }, - ], - "must_not_include": ["Team Delta currently owns deployment method review."], - "evidence_links": { - "relation_current_owner": [ - "graphiti-zep-current-owner", - "graphiti-zep-old-owner", - "graphiti-zep-owner-rationale", - ], - "relation_historical_owner": ["graphiti-zep-old-owner"], - "relation_owner_update_rationale": ["graphiti-zep-owner-rationale"], - }, - "answer_type": "direct_answer", - "accepted_alternates": [], - "requires_caveat": False, - "requires_refusal": False, - }, - "required_evidence": [ - { - "evidence_id": "graphiti-zep-current-owner", - "claim_id": "relation_current_owner", - "requirement": "cite", - "quote": "Team Echo owns deployment method review", - }, - { - "evidence_id": "graphiti-zep-old-owner", - "claim_id": "relation_historical_owner", - "requirement": "cite", - "quote": "Team Delta owned deployment method review", - }, - ], - "negative_traps": [ - { - "trap_id": "old-owner-as-current", - "type": "stale_fact", - "evidence_ids": ["graphiti-zep-old-owner"], - "failure_if_used": False, - } - ], - "scoring_rubric": { - "dimensions": { - "lifecycle_behavior": { - "weight": 0.4, - "max_points": 1.0, - "criteria": "Requires current-only versus historical temporal validity for relation facts.", - }, - "answer_correctness": { - "weight": 0.25, - "max_points": 1.0, - "criteria": "Would identify current and historical owners separately.", - }, - "evidence_grounding": { - "weight": 0.2, - "max_points": 1.0, - "criteria": "Would cite both current and historical relation evidence.", - }, - "trap_avoidance": { - "weight": 0.15, - "max_points": 1.0, - "criteria": "Would not report the historical owner as current.", - }, - }, - "pass_threshold": 0.8, - "hard_fail_rules": [], - }, - 
"allowed_uncertainty": { - "can_answer_unknown": False, - "acceptable_phrases": ["Graphiti/Zep smoke did not return temporal facts."], - "fallback_action": "score_temporal_relation_behavior", - }, - "memory_evolution": { - "current_evidence_ids": ["graphiti-zep-current-owner"], - "historical_evidence_ids": ["graphiti-zep-old-owner"], - "stale_trap_ids": ["old-owner-as-current"], - "conflicts": [ - { - "conflict_id": "relation-owner-current-historical", - "claim_id": "relation_current_owner", - "current_evidence_id": "graphiti-zep-current-owner", - "historical_evidence_id": "graphiti-zep-old-owner", - "resolved_by_evidence_id": "graphiti-zep-owner-rationale", - } - ], - "update_rationale": { - "claim_id": "relation_owner_update_rationale", - "evidence_ids": ["graphiti-zep-owner-rationale"], - "available": True, - }, - "temporal_validity": {"required": True, "encoded": True}, - }, - "tags": ["external_adapter", "generated_public", "memory_evolution", "reference_graphiti_zep_temporal"], - } - - if status.result in {"blocked", "incomplete", "not_encoded"}: - fixture["encoding"] = {"status": status.result, "reason": status.failure_reason} - - write_json(fixture_path, fixture) - - return fixture_path - - -def write_materialization( - status: StatusState, - facts: list[dict[str, Any]], - fixture_path: Path, - command_records: list[CommandRecord], - inserted: list[dict[str, Any]], - search_results: list[dict[str, Any]], - mapping: dict[str, Any], - started_at: float, - report: dict[str, Any] | None = None, -) -> dict[str, Any]: - """Write the primary smoke artifact.""" - - elapsed_ms = (time.monotonic() - started_at) * 1000 - payload = { - "schema": "elf.graphiti_zep_temporal_smoke/v1", - "generated_at": utc_now(), - "run_id": RUN_ID, - "adapter_id": "graphiti_zep_temporal_smoke", - "project": "Graphiti/Zep", - "status": status.overall, - "materialization_status": { - "source": "smoke_materialization", - "setup": status.setup, - "run": status.run, - "result": 
status.result, - "overall": status.overall, - "failure_class": status.failure_class, - "failure_reason": status.failure_reason, - }, - "scored_benchmark": scored_benchmark(report), - "evidence_class": status.evidence_class, - "failure": { - "class": status.failure_class or None, - "reason": status.failure_reason or None, - }, - "artifacts": { - "materialization": rel(OUT), - "manifest": rel(MANIFEST_OUT), - "summary": rel(SUMMARY_OUT), - "fixture": rel(fixture_path), - "scored_report_json": rel(REPORT_JSON), - "scored_report_markdown": rel(REPORT_MD), - }, - "docker_boundary": { - "compose_file": "docker-compose.baseline.yml", - "service_profile": "graphiti-zep", - "graph_store_service": "graphiti-falkordb", - "runner_service": "baseline-runner", - "runner": "scripts/graphiti-zep-docker-temporal-smoke.py", - "host_global_installs_required": False, - "docker_only": True, - }, - "provider_configuration": { - "package": GRAPHITI_REF, - "package_spec": GRAPHITI_PACKAGE, - "llm_model": LLM_MODEL, - "embedding_model": EMBEDDING_MODEL, - "api_base_configured": bool(API_BASE), - "api_key_provided": bool(API_KEY), - "operator_owned_provider_credentials_used": False, - "live_run_enabled": RUN_LIVE, - "falkordb": { - "host": FALKORDB_HOST, - "port": FALKORDB_PORT, - "database": FALKORDB_DATABASE, - "username_configured": bool(FALKORDB_USERNAME), - "password_configured": bool(FALKORDB_PASSWORD), - }, - }, - "resource_bounds": { - "fact_count": len(facts), - "timeout_seconds": TIMEOUT_SECONDS, - "elapsed_ms": round(elapsed_ms, 3), - "work_dir_size_bytes": dir_size(WORK_DIR), - "work_dir_file_count": file_count(WORK_DIR), - }, - "commands": [command_to_json(record) for record in command_records], - "temporal_facts": facts, - "inserted_facts": inserted, - "search_results": search_results, - "evidence_mapping": mapping, - } - write_json(OUT, payload) - - return payload - - -def write_manifest(status: StatusState) -> dict[str, Any]: - """Write a generated external adapter manifest 
for this smoke.""" - - manifest = { - "schema": "elf.real_world_external_adapter_manifest/v1", - "manifest_id": f"graphiti-zep-temporal-smoke-{RUN_ID}", - "docker_isolation": { - "default": True, - "compose_file": "docker-compose.baseline.yml", - "runner": "scripts/graphiti-zep-docker-temporal-smoke.py", - "artifact_dir": "tmp/real-world-memory/graphiti-zep-smoke", - "host_global_installs_required": False, - "notes": [ - f"Generated by the Graphiti/Zep Docker smoke at {utc_now()}.", - "The smoke uses generated public temporal facts and records typed setup/runtime failures.", - ], - }, - "adapters": [ - { - "adapter_id": "graphiti_zep_temporal_smoke", - "project": "Graphiti/Zep", - "adapter_kind": "docker_python_falkordb_temporal_smoke", - "evidence_class": status.evidence_class, - "docker_default": True, - "host_global_installs_required": False, - "overall_status": status.overall, - "setup": { - "status": status.setup, - "evidence": "The smoke runs inside the baseline Docker runner and uses Docker-local FalkorDB plus a container-local Python venv.", - "command": "cargo make smoke-graphiti-zep-docker-temporal", - "artifact": rel(OUT), - }, - "run": { - "status": status.run, - "evidence": "The live path adds generated temporal fact triples and searches Graphiti/Zep for UUID, fact, valid_at, invalid_at, and source node evidence.", - "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", - "artifact": rel(OUT), - }, - "result": { - "status": status.result, - "evidence": status.failure_reason - if status.failure_reason - else "Graphiti/Zep temporal search mapped current and historical facts to validity windows.", - "artifact": rel(OUT), - }, - "capabilities": [ - { - "capability": "docker_falkordb_setup", - "status": status.setup, - "evidence": "The task starts a Docker Compose FalkorDB profile only when explicitly requested, and uses no host-global graph database.", - }, - { - "capability": 
"temporal_fact_triple_ingest", - "status": status.run, - "evidence": "The live worker uses Graphiti fact triples for current, historical, and rationale facts with validity windows.", - }, - { - "capability": "validity_window_evidence_mapping", - "status": status.result, - "evidence": "Search output UUID, fact text, valid_at, invalid_at, and node ids are mapped to memory_evolution expected evidence ids.", - }, - { - "capability": "quality_or_scale_claim", - "status": "not_encoded", - "evidence": "The smoke does not claim broad graph-memory quality, large-corpus behavior, managed Zep service behavior, or private-corpus performance.", - }, - ], - "suites": [ - { - "suite_id": "memory_evolution", - "status": status.result, - "evidence": "Only generated current-versus-historical temporal relation facts are represented.", - }, - { - "suite_id": "retrieval", - "status": status.run if status.run != "pass" else "not_encoded", - "evidence": "Hybrid retrieval reachability is exercised by the live search, but broad retrieval quality scoring is not encoded.", - }, - { - "suite_id": "production_ops", - "status": "not_encoded", - "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations.", - }, - ], - "scenarios": [ - { - "scenario_id": "temporal_validity_window_mapping", - "suite_id": "memory_evolution", - "status": status.result, - "elf_position": "untested", - "comparison_outcome": "blocked" - if status.result == "blocked" - else "not_tested", - "evidence": status.failure_reason - if status.failure_reason - else "Graphiti/Zep temporal search mapped generated current and historical relation facts to validity windows and evidence ids.", - "command": "cargo make smoke-graphiti-zep-docker-temporal", - "artifact": rel(OUT), - } - ], - "evidence": [ - {"kind": "artifact", "ref": rel(OUT), "status": status.result}, - {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall}, - {"kind": 
"source", "ref": "https://github.com/getzep/graphiti", "status": "real"}, - { - "kind": "source", - "ref": "https://help.getzep.com/graphiti/getting-started/quick-start", - "status": "real", - }, - { - "kind": "source", - "ref": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", - "status": "real", - }, - { - "kind": "source", - "ref": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", - "status": "real", - }, - ], - "execution_metadata": { - "sources": [ - { - "label": "Graphiti repository", - "url": "https://github.com/getzep/graphiti", - "evidence": "Official source for the open-source temporal context graph engine.", - }, - { - "label": "Graphiti quick start", - "url": "https://help.getzep.com/graphiti/getting-started/quick-start", - "evidence": "Official search output examples include UUID, fact, valid_at, and invalid_at fields.", - }, - { - "label": "Graphiti FalkorDB configuration", - "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", - "evidence": "Official Docker-local FalkorDB setup and Python driver reference.", - }, - { - "label": "Graphiti fact triples", - "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", - "evidence": "Official manual fact-triple ingest contract.", - }, - ], - "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", - "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", - "resource_expectation": f"Graphiti package {GRAPHITI_REF}, fact_count=3, timeout_seconds={TIMEOUT_SECONDS}, FalkorDB host={FALKORDB_HOST}:{FALKORDB_PORT}.", - "retry_guidance": [ - "Default command records a typed blocked artifact 
without model calls.", - "Enable the live path only with Docker-local FalkorDB and explicit provider configuration.", - "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass.", - ], - "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation; generated artifact decides live evidence class.", - }, - "notes": [ - "The checked-in manifest record remains research_gate; generated smoke artifacts carry live status.", - "Failure before Graphiti search output remains typed as blocked or incomplete.", - "The smoke does not use a hosted Zep service, private corpora, or unrecorded provider credentials.", - ], - } - ], - } - write_json(MANIFEST_OUT, manifest) - - return manifest - - -def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: - """Write a small summary artifact.""" - - write_json( - SUMMARY_OUT, - { - "schema": "elf.graphiti_zep_temporal_smoke_summary/v1", - "generated_at": utc_now(), - "adapter_id": "graphiti_zep_temporal_smoke", - "evidence_class": materialization["evidence_class"], - "status_boundary": { - "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", - "manifest": "external adapter declaration consumed by the scorer", - "scored_benchmark": "post-score real_world_job outcome; use this for quality status", - }, - "scored_benchmark": materialization["scored_benchmark"], - "materialization": materialization, - "manifest": { - "json": rel(MANIFEST_OUT), - "status_source": "external_adapter_manifest_pre_score", - "summary": manifest["adapters"][0]["overall_status"], - "suites": manifest["adapters"][0]["suites"], - }, - "report": report, - }, - ) - - -def main() -> int: - """Run the smoke and always emit typed artifacts when possible.""" - - started_at = time.monotonic() - mkdirs() - status = StatusState() - command_records: list[CommandRecord] = [] - facts = temporal_facts() - inserted: list[dict[str, Any]] = [] 
- search_results: list[dict[str, Any]] = [] - mapping: dict[str, Any] = { - "status": "blocked", - "reason": status.failure_reason, - "expected_evidence_ids": [fact["evidence_id"] for fact in facts], - "mapped_evidence_ids": [], - "facts": [ - { - "evidence_id": fact["evidence_id"], - "claim_id": fact["claim_id"], - "status": "blocked", - "expected_valid_at": fact["valid_at"], - "expected_invalid_at": fact["invalid_at"], - "current": fact["current"], - } - for fact in facts - ], - } - - if not Path("/.dockerenv").exists() and not ALLOW_HOST: - status.setup = "incomplete" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "not_running_in_docker" - status.failure_reason = "Graphiti/Zep smoke must run inside Docker; use cargo make smoke-graphiti-zep-docker-temporal." - mapping["status"] = status.result - mapping["reason"] = status.failure_reason - elif not command_available("python3"): - status.setup = "incomplete" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "python_missing" - status.failure_reason = "python3 is required for the Graphiti/Zep smoke runner." - mapping["status"] = status.result - mapping["reason"] = status.failure_reason - elif not RUN_LIVE: - pass - elif not API_KEY: - status.setup = "blocked" - status.run = "not_encoded" - status.result = "blocked" - status.overall = "blocked" - status.failure_class = "provider_api_key_missing" - status.failure_reason = "Graphiti/Zep live temporal search requires an explicit provider API key; no hosted Zep service or unrecorded provider credentials were used." - mapping["reason"] = status.failure_reason - elif not wait_for_falkordb(command_records): - status.setup = "incomplete" - status.run = "not_encoded" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "falkordb_unreachable" - status.failure_reason = "Docker-local FalkorDB did not become reachable for the Graphiti/Zep smoke." 
- mapping["status"] = status.result - mapping["reason"] = status.failure_reason - else: - installed, python = init_graphiti(command_records) - if not installed: - status.setup = "incomplete" - status.run = "not_encoded" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "graphiti_setup_failed" - status.failure_reason = "Graphiti installation failed inside the Docker runner." - mapping["status"] = status.result - mapping["reason"] = status.failure_reason - else: - status.setup = "pass" - inserted, search_results = run_graphiti(python, command_records) - - if not search_results: - status.run = "incomplete" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "graphiti_temporal_search_failed" - status.failure_reason = "Graphiti/Zep did not return temporal search results for the generated fact corpus." - mapping["status"] = status.result - mapping["reason"] = status.failure_reason - else: - status.run = "pass" - status.evidence_class = "live_real_world" - mapping = map_observed_facts(search_results, facts) - if mapping["status"] == "pass": - status.result = "pass" - status.overall = "pass" - status.failure_class = "" - status.failure_reason = "" - else: - status.result = "wrong_result" - status.overall = "wrong_result" - status.failure_class = "graphiti_temporal_mapping_failed" - status.failure_reason = mapping["reason"] - - fixture_path = write_fixture(facts, status, mapping) - materialization = write_materialization( - status, - facts, - fixture_path, - command_records, - inserted, - search_results, - mapping, - started_at, - ) - manifest = write_manifest(status) - report = run_scored_report(fixture_path, MANIFEST_OUT, status) - materialization = write_materialization( - status, - facts, - fixture_path, - command_records, - inserted, - search_results, - mapping, - started_at, - report, - ) - write_summary(materialization, manifest, report) - print(f"Graphiti/Zep smoke artifact: {OUT}") - 
print(f"Graphiti/Zep smoke manifest: {MANIFEST_OUT}") - print(f"Graphiti/Zep smoke summary: {SUMMARY_OUT}") - - return 0 +from graphiti_temporal_smoke.runner import main if __name__ == "__main__": diff --git a/scripts/graphiti_temporal_smoke/__init__.py b/scripts/graphiti_temporal_smoke/__init__.py new file mode 100644 index 00000000..e561b370 --- /dev/null +++ b/scripts/graphiti_temporal_smoke/__init__.py @@ -0,0 +1 @@ +"""Graphiti/Zep temporal smoke runner modules.""" diff --git a/scripts/graphiti_temporal_smoke/benchmark.py b/scripts/graphiti_temporal_smoke/benchmark.py new file mode 100644 index 00000000..23af9098 --- /dev/null +++ b/scripts/graphiti_temporal_smoke/benchmark.py @@ -0,0 +1,107 @@ +"""Scoring helpers for the Graphiti/Zep temporal smoke.""" + +from __future__ import annotations + +import json +import subprocess +from pathlib import Path +from typing import Any + +from .common import rel +from .context import REPORT_JSON, REPORT_MD, ROOT_DIR +from .models import StatusState + +def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: + """Score the generated temporal smoke fixture through the real-world job runner.""" + + run_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + str(fixture_path), + "--out", + str(REPORT_JSON), + "--run-id", + "real-world-memory-live-graphiti-zep", + "--adapter-id", + "graphiti_zep_temporal_smoke", + "--adapter-name", + "Graphiti/Zep Docker temporal smoke adapter", + "--adapter-behavior", + "docker_python_falkordb_temporal_smoke", + "--adapter-storage-status", + status.setup, + "--adapter-runtime-status", + status.overall, + "--adapter-notes", + "Generated by the Graphiti/Zep Docker temporal smoke; pass or wrong_result requires current and historical validity-window facts mapped to generated evidence ids, while provider/setup limits remain typed.", + "--external-adapter-manifest", + 
str(manifest_path), + ] + publish_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + str(REPORT_JSON), + "--out", + str(REPORT_MD), + ] + + subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) + subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) + + report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) + + return { + "json": rel(REPORT_JSON), + "markdown": rel(REPORT_MD), + "summary": report.get("summary", {}), + "suites": report.get("suites", []), + } + +def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: + """Extract the post-score benchmark status from a real_world_job report.""" + + if report is None: + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "pending", + "reason": "The smoke materialization was written before benchmark scoring completed.", + } + + summary = report.get("summary", {}) + counts = { + status: int(summary.get(status, 0) or 0) + for status in ( + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + ) + } + status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") + + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": status, + "counts": counts, + "job_count": int(summary.get("job_count", 0) or 0), + "mean_score": summary.get("mean_score"), + "evidence_coverage": summary.get("evidence_coverage"), + } diff --git a/scripts/graphiti_temporal_smoke/common.py b/scripts/graphiti_temporal_smoke/common.py new file mode 100644 index 00000000..ba3dd06b --- /dev/null +++ b/scripts/graphiti_temporal_smoke/common.py @@ -0,0 +1,137 @@ +"""Shared filesystem and process helpers for the Graphiti/Zep smoke.""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import time +from datetime import datetime, timezone +from 
pathlib import Path +from typing import Any + +from .context import FIXTURE_DIR, LOG_DIR, REPORT_DIR, ROOT_DIR, TIMEOUT_SECONDS, WORK_DIR +from .models import CommandRecord + +def utc_now() -> str: + """Return an RFC3339 UTC timestamp.""" + + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + +def rel(path: Path) -> str: + """Return a repository-relative path when possible.""" + + try: + return str(path.resolve().relative_to(ROOT_DIR)) + except ValueError: + return str(path) + +def mkdirs() -> None: + """Create output directories.""" + + for path in (REPORT_DIR, WORK_DIR, FIXTURE_DIR, LOG_DIR): + path.mkdir(parents=True, exist_ok=True) + +def write_json(path: Path, payload: Any) -> None: + """Write stable, pretty JSON.""" + + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + +def command_available(command: str) -> bool: + """Return whether a command is on PATH.""" + + return shutil.which(command) is not None + +def dir_size(path: Path) -> int: + """Return total file size for a directory or file.""" + + if not path.exists(): + return 0 + if path.is_file(): + return path.stat().st_size + + return sum(item.stat().st_size for item in path.rglob("*") if item.is_file()) + +def file_count(path: Path) -> int: + """Return file count for a directory.""" + + if not path.exists(): + return 0 + + return sum(1 for item in path.rglob("*") if item.is_file()) + +def command_to_json(record: CommandRecord) -> dict[str, Any]: + """Serialize a command record.""" + + return { + "label": record.label, + "status": record.status, + "command": record.command, + "elapsed_ms": round(record.elapsed_ms, 3), + "stdout_artifact": record.stdout_artifact, + "stderr_artifact": record.stderr_artifact, + "returncode": record.returncode, + "reason": record.reason, + } + +def run_command( + label: str, + command: list[str], + cwd: Path, + timeout: int = TIMEOUT_SECONDS, + 
extra_env: dict[str, str] | None = None, +) -> CommandRecord: + """Run a subprocess and capture stdout/stderr artifacts.""" + + cwd.mkdir(parents=True, exist_ok=True) + stdout_path = LOG_DIR / f"{label}.stdout.log" + stderr_path = LOG_DIR / f"{label}.stderr.log" + env = os.environ.copy() + + if extra_env: + env.update(extra_env) + + started = time.monotonic() + try: + proc = subprocess.run( + command, + cwd=cwd, + env=env, + text=True, + capture_output=True, + timeout=timeout, + check=False, + ) + elapsed_ms = (time.monotonic() - started) * 1000 + stdout_path.write_text(proc.stdout, encoding="utf-8") + stderr_path.write_text(proc.stderr, encoding="utf-8") + status = "pass" if proc.returncode == 0 else "incomplete" + reason = "Command completed." if proc.returncode == 0 else f"Command exited {proc.returncode}." + + return CommandRecord( + label=label, + command=command, + status=status, + elapsed_ms=elapsed_ms, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=proc.returncode, + reason=reason, + ) + except subprocess.TimeoutExpired as err: + elapsed_ms = (time.monotonic() - started) * 1000 + stdout_path.write_text(err.stdout or "", encoding="utf-8") + stderr_path.write_text(err.stderr or "", encoding="utf-8") + + return CommandRecord( + label=label, + command=command, + status="incomplete", + elapsed_ms=elapsed_ms, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=None, + reason=f"Command timed out after {timeout} seconds.", + ) diff --git a/scripts/graphiti_temporal_smoke/context.py b/scripts/graphiti_temporal_smoke/context.py new file mode 100644 index 00000000..442836e0 --- /dev/null +++ b/scripts/graphiti_temporal_smoke/context.py @@ -0,0 +1,65 @@ +"""Configuration for the Graphiti/Zep temporal smoke.""" + +from __future__ import annotations + +import os +from datetime import datetime, timezone +from pathlib import Path + +from typing import Any + + +SCRIPT_DIR = 
Path(__file__).resolve().parent.parent +ROOT_DIR = SCRIPT_DIR.parent +REPORT_DIR = Path( + os.environ.get( + "ELF_GRAPHITI_ZEP_SMOKE_REPORT_DIR", + ROOT_DIR / "tmp" / "real-world-memory" / "graphiti-zep-smoke", + ) +) +WORK_DIR = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR", REPORT_DIR / "work")) +OUT = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_OUT", REPORT_DIR / "graphiti-zep-smoke.json")) +MANIFEST_OUT = Path( + os.environ.get( + "ELF_GRAPHITI_ZEP_SMOKE_MANIFEST_OUT", + REPORT_DIR / "memory_projects_manifest.graphiti-zep-smoke.json", + ) +) +SUMMARY_OUT = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) +REPORT_JSON = Path( + os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_REPORT_JSON", REPORT_DIR / "graphiti-zep-report.json") +) +REPORT_MD = Path( + os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_REPORT_MD", REPORT_DIR / "graphiti-zep-report.md") +) +FIXTURE_DIR = REPORT_DIR / "graphiti-zep-fixtures" +LOG_DIR = REPORT_DIR / "logs" + +RUN_ID = os.environ.get( + "ELF_GRAPHITI_ZEP_SMOKE_RUN_ID", + f"graphiti-zep-docker-smoke-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", +) +RUN_LIVE = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_RUN", "0") == "1" +ALLOW_HOST = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_ALLOW_HOST", "0") == "1" +INSTALL_GRAPHITI = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_INSTALL", "1") == "1" +GRAPHITI_VERSION = os.environ.get("ELF_GRAPHITI_ZEP_VERSION", "0.21.0") +GRAPHITI_PACKAGE = os.environ.get( + "ELF_GRAPHITI_ZEP_PACKAGE", + f"graphiti-core[falkordb]=={GRAPHITI_VERSION}", +) +GRAPHITI_REF = os.environ.get("ELF_GRAPHITI_ZEP_REF", f"pypi:{GRAPHITI_PACKAGE}") +FALKORDB_HOST = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_HOST", "graphiti-falkordb") +FALKORDB_PORT = int(os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PORT", "6379")) +FALKORDB_DATABASE = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_DATABASE", "elf_graphiti_zep_smoke") +FALKORDB_USERNAME = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_USERNAME", "") +FALKORDB_PASSWORD = 
os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD", "") +API_KEY = os.environ.get( + "ELF_GRAPHITI_ZEP_API_KEY", + os.environ.get("GRAPHITI_OPENAI_API_KEY", os.environ.get("OPENAI_API_KEY", "")), +) +API_BASE = os.environ.get("ELF_GRAPHITI_ZEP_API_BASE", os.environ.get("OPENAI_BASE_URL", "")) +LLM_MODEL = os.environ.get("ELF_GRAPHITI_ZEP_LLM_MODEL", "gpt-4o-mini") +EMBEDDING_MODEL = os.environ.get("ELF_GRAPHITI_ZEP_EMBEDDING_MODEL", "text-embedding-3-small") +TIMEOUT_SECONDS = int(os.environ.get("ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS", "900")) +STARTUP_ATTEMPTS = int(os.environ.get("ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS", "30")) +STARTUP_INTERVAL_SECONDS = float(os.environ.get("ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS", "2")) diff --git a/scripts/graphiti_temporal_smoke/corpus.py b/scripts/graphiti_temporal_smoke/corpus.py new file mode 100644 index 00000000..ec3b8f4f --- /dev/null +++ b/scripts/graphiti_temporal_smoke/corpus.py @@ -0,0 +1,47 @@ +"""Generated temporal facts used by the Graphiti/Zep smoke.""" + +from __future__ import annotations + +from typing import Any + +def temporal_facts() -> list[dict[str, Any]]: + """Return the generated-public temporal fact corpus.""" + + return [ + { + "evidence_id": "graphiti-zep-old-owner", + "claim_id": "relation_historical_owner", + "source": "Team Delta", + "edge_name": "OWNED_REVIEW", + "target": "deployment method review", + "fact": "Team Delta owned deployment method review before 2026-06-06.", + "valid_at": "2026-06-05T00:00:00Z", + "invalid_at": "2026-06-08T00:00:00Z", + "created_at": "2026-06-05T00:00:00Z", + "current": False, + }, + { + "evidence_id": "graphiti-zep-current-owner", + "claim_id": "relation_current_owner", + "source": "Team Echo", + "edge_name": "OWNS_REVIEW", + "target": "deployment method review", + "fact": "Team Echo owns deployment method review since 2026-06-08.", + "valid_at": "2026-06-08T00:00:00Z", + "invalid_at": None, + "created_at": "2026-06-08T00:00:00Z", + "current": True, + }, + { + 
"evidence_id": "graphiti-zep-owner-rationale", + "claim_id": "relation_owner_update_rationale", + "source": "single-user production runbook scope", + "edge_name": "MOVED_OWNERSHIP_TO", + "target": "Team Echo", + "fact": "Ownership moved to Team Echo after single-user production runbook scope changed.", + "valid_at": "2026-06-08T00:05:00Z", + "invalid_at": None, + "created_at": "2026-06-08T00:05:00Z", + "current": True, + }, + ] diff --git a/scripts/graphiti_temporal_smoke/fixture.py b/scripts/graphiti_temporal_smoke/fixture.py new file mode 100644 index 00000000..14e30c7a --- /dev/null +++ b/scripts/graphiti_temporal_smoke/fixture.py @@ -0,0 +1,224 @@ +"""Generated fixture writer for the Graphiti/Zep smoke.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .common import write_json +from .context import FIXTURE_DIR, RUN_ID +from .models import StatusState + +def write_fixture(facts: list[dict[str, Any]], status: StatusState, mapping: dict[str, Any]) -> Path: + """Write a generated memory_evolution fixture for the smoke.""" + + fixture_path = FIXTURE_DIR / "memory_evolution" / "graphiti_zep_temporal_validity.json" + mapped_ids = mapping.get("mapped_evidence_ids", []) + claims = [] + + if status.result == "pass": + claims = [ + { + "claim_id": "relation_current_owner", + "text": "Team Echo currently owns deployment method review.", + "evidence_ids": [ + "graphiti-zep-current-owner", + "graphiti-zep-old-owner", + "graphiti-zep-owner-rationale", + ], + "confidence": "derived_from_graphiti_temporal_search", + }, + { + "claim_id": "relation_historical_owner", + "text": "Team Delta owned deployment method review historically.", + "evidence_ids": ["graphiti-zep-old-owner"], + "confidence": "derived_from_graphiti_temporal_search", + }, + { + "claim_id": "relation_owner_update_rationale", + "text": "Ownership moved after single-user production runbook scope changed.", + "evidence_ids": ["graphiti-zep-owner-rationale"], + 
"confidence": "derived_from_graphiti_temporal_search", + }, + ] + + fixture: dict[str, Any] = { + "schema": "elf.real_world_job/v1", + "job_id": "graphiti-zep-temporal-validity-001", + "suite": "memory_evolution", + "title": "Map Graphiti/Zep temporal validity windows to current and historical relation facts", + "corpus": { + "corpus_id": "graphiti-zep-generated-public-smoke", + "profile": "generated_public", + "items": [ + { + "evidence_id": fact["evidence_id"], + "kind": "temporal_fact", + "text": fact["fact"], + "source_ref": { + "schema": "source_ref/v1", + "resolver": "graphiti_zep_smoke/v1", + "ref": { + "run_id": RUN_ID, + "evidence_id": fact["evidence_id"], + "valid_at": fact["valid_at"], + "invalid_at": fact["invalid_at"], + }, + }, + "created_at": fact["created_at"], + } + for fact in facts + ], + "adapter_response": { + "adapter_id": "graphiti_zep_temporal_smoke", + "answer": { + "content": ( + "Team Echo currently owns deployment method review. Team Delta owned it " + "historically, and the move followed the single-user production runbook scope change." 
+ if claims + else "" + ), + "claims": claims, + "evidence_ids": mapped_ids, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0, + }, + }, + }, + }, + "timeline": [ + { + "event_id": "graphiti-zep-old-owner", + "ts": "2026-06-05T00:00:00Z", + "actor": "agent", + "action": "recorded_relation", + "evidence_ids": ["graphiti-zep-old-owner"], + "summary": "Team Delta was the historical owner.", + }, + { + "event_id": "graphiti-zep-current-owner", + "ts": "2026-06-08T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["graphiti-zep-current-owner", "graphiti-zep-owner-rationale"], + "summary": "Team Echo became the current owner after the scope changed.", + }, + ], + "prompt": { + "role": "user", + "content": "Who currently owns deployment method review, and who owned it historically?", + "job_mode": "answer", + "constraints": ["cite_evidence", "distinguish_current_from_historical"], + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "relation_current_owner", + "text": "Team Echo currently owns deployment method review.", + }, + { + "claim_id": "relation_historical_owner", + "text": "Team Delta owned deployment method review historically.", + }, + ], + "must_not_include": ["Team Delta currently owns deployment method review."], + "evidence_links": { + "relation_current_owner": [ + "graphiti-zep-current-owner", + "graphiti-zep-old-owner", + "graphiti-zep-owner-rationale", + ], + "relation_historical_owner": ["graphiti-zep-old-owner"], + "relation_owner_update_rationale": ["graphiti-zep-owner-rationale"], + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": False, + "requires_refusal": False, + }, + "required_evidence": [ + { + "evidence_id": "graphiti-zep-current-owner", + "claim_id": "relation_current_owner", + "requirement": "cite", + "quote": "Team Echo owns deployment method review", + }, + { + "evidence_id": 
"graphiti-zep-old-owner", + "claim_id": "relation_historical_owner", + "requirement": "cite", + "quote": "Team Delta owned deployment method review", + }, + ], + "negative_traps": [ + { + "trap_id": "old-owner-as-current", + "type": "stale_fact", + "evidence_ids": ["graphiti-zep-old-owner"], + "failure_if_used": False, + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.4, + "max_points": 1.0, + "criteria": "Requires current-only versus historical temporal validity for relation facts.", + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Would identify current and historical owners separately.", + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Would cite both current and historical relation evidence.", + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Would not report the historical owner as current.", + }, + }, + "pass_threshold": 0.8, + "hard_fail_rules": [], + }, + "allowed_uncertainty": { + "can_answer_unknown": False, + "acceptable_phrases": ["Graphiti/Zep smoke did not return temporal facts."], + "fallback_action": "score_temporal_relation_behavior", + }, + "memory_evolution": { + "current_evidence_ids": ["graphiti-zep-current-owner"], + "historical_evidence_ids": ["graphiti-zep-old-owner"], + "stale_trap_ids": ["old-owner-as-current"], + "conflicts": [ + { + "conflict_id": "relation-owner-current-historical", + "claim_id": "relation_current_owner", + "current_evidence_id": "graphiti-zep-current-owner", + "historical_evidence_id": "graphiti-zep-old-owner", + "resolved_by_evidence_id": "graphiti-zep-owner-rationale", + } + ], + "update_rationale": { + "claim_id": "relation_owner_update_rationale", + "evidence_ids": ["graphiti-zep-owner-rationale"], + "available": True, + }, + "temporal_validity": {"required": True, "encoded": True}, + }, + "tags": ["external_adapter", "generated_public", "memory_evolution", 
"reference_graphiti_zep_temporal"], + } + + if status.result in {"blocked", "incomplete", "not_encoded"}: + fixture["encoding"] = {"status": status.result, "reason": status.failure_reason} + + write_json(fixture_path, fixture) + + return fixture_path diff --git a/scripts/graphiti_temporal_smoke/manifest.py b/scripts/graphiti_temporal_smoke/manifest.py new file mode 100644 index 00000000..b8b66bd1 --- /dev/null +++ b/scripts/graphiti_temporal_smoke/manifest.py @@ -0,0 +1,175 @@ +"""External adapter manifest writer for the Graphiti/Zep smoke.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .common import rel, utc_now, write_json +from .context import * # noqa: F403 +from .models import StatusState + +def write_manifest(status: StatusState) -> dict[str, Any]: + """Write a generated external adapter manifest for this smoke.""" + + manifest = { + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": f"graphiti-zep-temporal-smoke-{RUN_ID}", + "docker_isolation": { + "default": True, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/graphiti-zep-docker-temporal-smoke.py", + "artifact_dir": "tmp/real-world-memory/graphiti-zep-smoke", + "host_global_installs_required": False, + "notes": [ + f"Generated by the Graphiti/Zep Docker smoke at {utc_now()}.", + "The smoke uses generated public temporal facts and records typed setup/runtime failures.", + ], + }, + "adapters": [ + { + "adapter_id": "graphiti_zep_temporal_smoke", + "project": "Graphiti/Zep", + "adapter_kind": "docker_python_falkordb_temporal_smoke", + "evidence_class": status.evidence_class, + "docker_default": True, + "host_global_installs_required": False, + "overall_status": status.overall, + "setup": { + "status": status.setup, + "evidence": "The smoke runs inside the baseline Docker runner and uses Docker-local FalkorDB plus a container-local Python venv.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + 
"artifact": rel(OUT), + }, + "run": { + "status": status.run, + "evidence": "The live path adds generated temporal fact triples and searches Graphiti/Zep for UUID, fact, valid_at, invalid_at, and source node evidence.", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": rel(OUT), + }, + "result": { + "status": status.result, + "evidence": status.failure_reason + if status.failure_reason + else "Graphiti/Zep temporal search mapped current and historical facts to validity windows.", + "artifact": rel(OUT), + }, + "capabilities": [ + { + "capability": "docker_falkordb_setup", + "status": status.setup, + "evidence": "The task starts a Docker Compose FalkorDB profile only when explicitly requested, and uses no host-global graph database.", + }, + { + "capability": "temporal_fact_triple_ingest", + "status": status.run, + "evidence": "The live worker uses Graphiti fact triples for current, historical, and rationale facts with validity windows.", + }, + { + "capability": "validity_window_evidence_mapping", + "status": status.result, + "evidence": "Search output UUID, fact text, valid_at, invalid_at, and node ids are mapped to memory_evolution expected evidence ids.", + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-memory quality, large-corpus behavior, managed Zep service behavior, or private-corpus performance.", + }, + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": status.result, + "evidence": "Only generated current-versus-historical temporal relation facts are represented.", + }, + { + "suite_id": "retrieval", + "status": status.run if status.run != "pass" else "not_encoded", + "evidence": "Hybrid retrieval reachability is exercised by the live search, but broad retrieval quality scoring is not encoded.", + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": 
"The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations.", + }, + ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": status.result, + "elf_position": "untested", + "comparison_outcome": "blocked" + if status.result == "blocked" + else "not_tested", + "evidence": status.failure_reason + if status.failure_reason + else "Graphiti/Zep temporal search mapped generated current and historical relation facts to validity windows and evidence ids.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + "artifact": rel(OUT), + } + ], + "evidence": [ + {"kind": "artifact", "ref": rel(OUT), "status": status.result}, + {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall}, + {"kind": "source", "ref": "https://github.com/getzep/graphiti", "status": "real"}, + { + "kind": "source", + "ref": "https://help.getzep.com/graphiti/getting-started/quick-start", + "status": "real", + }, + { + "kind": "source", + "ref": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "status": "real", + }, + { + "kind": "source", + "ref": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "status": "real", + }, + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official source for the open-source temporal context graph engine.", + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official search output examples include UUID, fact, valid_at, and invalid_at fields.", + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup and Python driver reference.", + }, + { + "label": "Graphiti 
fact triples", + "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "evidence": "Official manual fact-triple ingest contract.", + }, + ], + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", + "resource_expectation": f"Graphiti package {GRAPHITI_REF}, fact_count=3, timeout_seconds={TIMEOUT_SECONDS}, FalkorDB host={FALKORDB_HOST}:{FALKORDB_PORT}.", + "retry_guidance": [ + "Default command records a typed blocked artifact without model calls.", + "Enable the live path only with Docker-local FalkorDB and explicit provider configuration.", + "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass.", + ], + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation; generated artifact decides live evidence class.", + }, + "notes": [ + "The checked-in manifest record remains research_gate; generated smoke artifacts carry live status.", + "Failure before Graphiti search output remains typed as blocked or incomplete.", + "The smoke does not use a hosted Zep service, private corpora, or unrecorded provider credentials.", + ], + } + ], + } + write_json(MANIFEST_OUT, manifest) + + return manifest diff --git a/scripts/graphiti_temporal_smoke/mapping.py b/scripts/graphiti_temporal_smoke/mapping.py new file mode 100644 index 00000000..4283950c --- /dev/null +++ b/scripts/graphiti_temporal_smoke/mapping.py @@ -0,0 +1,81 @@ +"""Map Graphiti search results back to benchmark evidence.""" + +from __future__ import annotations + +from typing import Any + +def map_observed_facts(results: list[dict[str, 
Any]], facts: list[dict[str, Any]]) -> dict[str, Any]: + """Map Graphiti search results back to expected evidence ids.""" + + expected_by_id = {fact["evidence_id"]: fact for fact in facts} + mappings: list[dict[str, Any]] = [] + mapped_ids: list[str] = [] + + for fact in facts: + matched = [ + result + for result in results + if isinstance(result.get("fact"), str) and fact["fact"].lower() in result["fact"].lower() + ] + if matched: + result = matched[0] + mapped_ids.append(fact["evidence_id"]) + mappings.append( + { + "evidence_id": fact["evidence_id"], + "claim_id": fact["claim_id"], + "status": "pass", + "uuid": result.get("uuid"), + "fact": result.get("fact"), + "valid_at": result.get("valid_at"), + "invalid_at": result.get("invalid_at"), + "expected_valid_at": fact["valid_at"], + "expected_invalid_at": fact["invalid_at"], + "current": fact["current"], + } + ) + else: + mappings.append( + { + "evidence_id": fact["evidence_id"], + "claim_id": fact["claim_id"], + "status": "blocked", + "expected_valid_at": fact["valid_at"], + "expected_invalid_at": fact["invalid_at"], + "current": fact["current"], + } + ) + + current_ok = any( + item["evidence_id"] == "graphiti-zep-current-owner" + and item["status"] == "pass" + and not item.get("invalid_at") + for item in mappings + ) + historical_ok = any( + item["evidence_id"] == "graphiti-zep-old-owner" + and item["status"] == "pass" + and item.get("invalid_at") + for item in mappings + ) + rationale_ok = "graphiti-zep-owner-rationale" in mapped_ids + required_ids = list(expected_by_id) + missing_ids = [evidence_id for evidence_id in required_ids if evidence_id not in mapped_ids] + + if current_ok and historical_ok and rationale_ok: + status = "pass" + reason = "Graphiti/Zep search results mapped current, historical, and rationale facts with validity windows." 
+ else: + status = "wrong_result" + reason = ( + "Graphiti/Zep search results did not map all required temporal facts with expected validity " + f"windows; missing={', '.join(missing_ids) or 'none'}." + ) + + return { + "status": status, + "reason": reason, + "expected_evidence_ids": required_ids, + "mapped_evidence_ids": mapped_ids, + "facts": mappings, + } diff --git a/scripts/graphiti_temporal_smoke/materialization.py b/scripts/graphiti_temporal_smoke/materialization.py new file mode 100644 index 00000000..f96fd4f0 --- /dev/null +++ b/scripts/graphiti_temporal_smoke/materialization.py @@ -0,0 +1,99 @@ +"""Primary materialization writer for the Graphiti/Zep smoke.""" + +from __future__ import annotations + +import time +from pathlib import Path +from typing import Any + +from .benchmark import scored_benchmark +from .common import command_to_json, dir_size, file_count, rel, utc_now, write_json +from .context import * # noqa: F403 +from .models import CommandRecord, StatusState + +def write_materialization( + status: StatusState, + facts: list[dict[str, Any]], + fixture_path: Path, + command_records: list[CommandRecord], + inserted: list[dict[str, Any]], + search_results: list[dict[str, Any]], + mapping: dict[str, Any], + started_at: float, + report: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Write the primary smoke artifact.""" + + elapsed_ms = (time.monotonic() - started_at) * 1000 + payload = { + "schema": "elf.graphiti_zep_temporal_smoke/v1", + "generated_at": utc_now(), + "run_id": RUN_ID, + "adapter_id": "graphiti_zep_temporal_smoke", + "project": "Graphiti/Zep", + "status": status.overall, + "materialization_status": { + "source": "smoke_materialization", + "setup": status.setup, + "run": status.run, + "result": status.result, + "overall": status.overall, + "failure_class": status.failure_class, + "failure_reason": status.failure_reason, + }, + "scored_benchmark": scored_benchmark(report), + "evidence_class": status.evidence_class, + 
"failure": { + "class": status.failure_class or None, + "reason": status.failure_reason or None, + }, + "artifacts": { + "materialization": rel(OUT), + "manifest": rel(MANIFEST_OUT), + "summary": rel(SUMMARY_OUT), + "fixture": rel(fixture_path), + "scored_report_json": rel(REPORT_JSON), + "scored_report_markdown": rel(REPORT_MD), + }, + "docker_boundary": { + "compose_file": "docker-compose.baseline.yml", + "service_profile": "graphiti-zep", + "graph_store_service": "graphiti-falkordb", + "runner_service": "baseline-runner", + "runner": "scripts/graphiti-zep-docker-temporal-smoke.py", + "host_global_installs_required": False, + "docker_only": True, + }, + "provider_configuration": { + "package": GRAPHITI_REF, + "package_spec": GRAPHITI_PACKAGE, + "llm_model": LLM_MODEL, + "embedding_model": EMBEDDING_MODEL, + "api_base_configured": bool(API_BASE), + "api_key_provided": bool(API_KEY), + "operator_owned_provider_credentials_used": False, + "live_run_enabled": RUN_LIVE, + "falkordb": { + "host": FALKORDB_HOST, + "port": FALKORDB_PORT, + "database": FALKORDB_DATABASE, + "username_configured": bool(FALKORDB_USERNAME), + "password_configured": bool(FALKORDB_PASSWORD), + }, + }, + "resource_bounds": { + "fact_count": len(facts), + "timeout_seconds": TIMEOUT_SECONDS, + "elapsed_ms": round(elapsed_ms, 3), + "work_dir_size_bytes": dir_size(WORK_DIR), + "work_dir_file_count": file_count(WORK_DIR), + }, + "commands": [command_to_json(record) for record in command_records], + "temporal_facts": facts, + "inserted_facts": inserted, + "search_results": search_results, + "evidence_mapping": mapping, + } + write_json(OUT, payload) + + return payload diff --git a/scripts/graphiti_temporal_smoke/models.py b/scripts/graphiti_temporal_smoke/models.py new file mode 100644 index 00000000..27d2ae5f --- /dev/null +++ b/scripts/graphiti_temporal_smoke/models.py @@ -0,0 +1,34 @@ +"""Typed records for the Graphiti/Zep temporal smoke.""" + +from __future__ import annotations + +from dataclasses 
import dataclass + +class StatusState: + """Typed status for generated Graphiti/Zep smoke artifacts.""" + + setup: str = "blocked" + run: str = "not_encoded" + result: str = "blocked" + overall: str = "blocked" + evidence_class: str = "research_gate" + failure_class: str = "graphiti_zep_live_run_disabled" + failure_reason: str = ( + "Graphiti/Zep temporal graph live run is opt-in; set " + "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 and provide explicit " + "provider configuration to attempt the Docker-local FalkorDB smoke." + ) + + +@dataclass +class CommandRecord: + """Captured command result without secret-bearing environment values.""" + + label: str + command: list[str] + status: str + elapsed_ms: float + stdout_artifact: str | None + stderr_artifact: str | None + returncode: int | None + reason: str diff --git a/scripts/graphiti_temporal_smoke/runner.py b/scripts/graphiti_temporal_smoke/runner.py new file mode 100644 index 00000000..16c20989 --- /dev/null +++ b/scripts/graphiti_temporal_smoke/runner.py @@ -0,0 +1,151 @@ +"""CLI runner for the Graphiti/Zep temporal smoke.""" + +from __future__ import annotations + +import time +from pathlib import Path +from typing import Any + +from .fixture import write_fixture +from .manifest import write_manifest +from .materialization import write_materialization +from .summary import write_summary +from .benchmark import run_scored_report +from .common import command_available, mkdirs +from .context import ALLOW_HOST, MANIFEST_OUT, OUT, RUN_LIVE, SUMMARY_OUT +from .corpus import temporal_facts +from .mapping import map_observed_facts +from .models import CommandRecord, StatusState +from .runtime import init_graphiti, run_graphiti, wait_for_falkordb + +def main() -> int: + """Run the smoke and always emit typed artifacts when possible.""" + + started_at = time.monotonic() + mkdirs() + status = StatusState() + command_records: list[CommandRecord] = [] + facts = temporal_facts() + inserted: list[dict[str, 
Any]] = [] + search_results: list[dict[str, Any]] = [] + mapping: dict[str, Any] = { + "status": "blocked", + "reason": status.failure_reason, + "expected_evidence_ids": [fact["evidence_id"] for fact in facts], + "mapped_evidence_ids": [], + "facts": [ + { + "evidence_id": fact["evidence_id"], + "claim_id": fact["claim_id"], + "status": "blocked", + "expected_valid_at": fact["valid_at"], + "expected_invalid_at": fact["invalid_at"], + "current": fact["current"], + } + for fact in facts + ], + } + + if not Path("/.dockerenv").exists() and not ALLOW_HOST: + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "not_running_in_docker" + status.failure_reason = "Graphiti/Zep smoke must run inside Docker; use cargo make smoke-graphiti-zep-docker-temporal." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + elif not command_available("python3"): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "python_missing" + status.failure_reason = "python3 is required for the Graphiti/Zep smoke runner." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + elif not RUN_LIVE: + pass + elif not API_KEY: + status.setup = "blocked" + status.run = "not_encoded" + status.result = "blocked" + status.overall = "blocked" + status.failure_class = "provider_api_key_missing" + status.failure_reason = "Graphiti/Zep live temporal search requires an explicit provider API key; no hosted Zep service or unrecorded provider credentials were used." + mapping["reason"] = status.failure_reason + elif not wait_for_falkordb(command_records): + status.setup = "incomplete" + status.run = "not_encoded" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "falkordb_unreachable" + status.failure_reason = "Docker-local FalkorDB did not become reachable for the Graphiti/Zep smoke." 
+ mapping["status"] = status.result + mapping["reason"] = status.failure_reason + else: + installed, python = init_graphiti(command_records) + if not installed: + status.setup = "incomplete" + status.run = "not_encoded" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "graphiti_setup_failed" + status.failure_reason = "Graphiti installation failed inside the Docker runner." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + else: + status.setup = "pass" + inserted, search_results = run_graphiti(python, command_records) + + if not search_results: + status.run = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "graphiti_temporal_search_failed" + status.failure_reason = "Graphiti/Zep did not return temporal search results for the generated fact corpus." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + else: + status.run = "pass" + status.evidence_class = "live_real_world" + mapping = map_observed_facts(search_results, facts) + if mapping["status"] == "pass": + status.result = "pass" + status.overall = "pass" + status.failure_class = "" + status.failure_reason = "" + else: + status.result = "wrong_result" + status.overall = "wrong_result" + status.failure_class = "graphiti_temporal_mapping_failed" + status.failure_reason = mapping["reason"] + + fixture_path = write_fixture(facts, status, mapping) + materialization = write_materialization( + status, + facts, + fixture_path, + command_records, + inserted, + search_results, + mapping, + started_at, + ) + manifest = write_manifest(status) + report = run_scored_report(fixture_path, MANIFEST_OUT, status) + materialization = write_materialization( + status, + facts, + fixture_path, + command_records, + inserted, + search_results, + mapping, + started_at, + report, + ) + write_summary(materialization, manifest, report) + print(f"Graphiti/Zep smoke artifact: {OUT}") + 
print(f"Graphiti/Zep smoke manifest: {MANIFEST_OUT}") + print(f"Graphiti/Zep smoke summary: {SUMMARY_OUT}") + + return 0 diff --git a/scripts/graphiti_temporal_smoke/runtime.py b/scripts/graphiti_temporal_smoke/runtime.py new file mode 100644 index 00000000..ffe6dbab --- /dev/null +++ b/scripts/graphiti_temporal_smoke/runtime.py @@ -0,0 +1,231 @@ +"""Runtime setup and live Graphiti execution.""" + +from __future__ import annotations + +import json +import socket +import sys +import textwrap +import time +from pathlib import Path +from typing import Any + +from .common import run_command, write_json +from .context import * # noqa: F403 +from .corpus import temporal_facts +from .models import CommandRecord + +def wait_for_falkordb(command_records: list[CommandRecord]) -> bool: + """Poll the configured FalkorDB TCP endpoint.""" + + started = time.monotonic() + attempts: list[dict[str, Any]] = [] + + for attempt in range(1, STARTUP_ATTEMPTS + 1): + try: + with socket.create_connection((FALKORDB_HOST, FALKORDB_PORT), timeout=2): + elapsed_ms = (time.monotonic() - started) * 1000 + attempts.append({"attempt": attempt, "status": "pass", "elapsed_ms": round(elapsed_ms, 3)}) + path = LOG_DIR / "falkordb-startup-attempts.json" + write_json(path, attempts) + command_records.append( + CommandRecord( + label="falkordb-startup", + command=["tcp-connect", FALKORDB_HOST, str(FALKORDB_PORT)], + status="pass", + elapsed_ms=elapsed_ms, + stdout_artifact=rel(path), + stderr_artifact=None, + returncode=0, + reason="FalkorDB TCP endpoint accepted a connection.", + ) + ) + return True + except OSError as err: + attempts.append({"attempt": attempt, "status": "incomplete", "reason": str(err)}) + time.sleep(STARTUP_INTERVAL_SECONDS) + + elapsed_ms = (time.monotonic() - started) * 1000 + path = LOG_DIR / "falkordb-startup-attempts.json" + write_json(path, attempts) + command_records.append( + CommandRecord( + label="falkordb-startup", + command=["tcp-connect", FALKORDB_HOST, 
str(FALKORDB_PORT)], + status="incomplete", + elapsed_ms=elapsed_ms, + stdout_artifact=rel(path), + stderr_artifact=None, + returncode=None, + reason="FalkorDB TCP endpoint did not become reachable.", + ) + ) + return False + +def init_graphiti(command_records: list[CommandRecord]) -> tuple[bool, Path]: + """Create a venv and install Graphiti with FalkorDB support.""" + + venv_dir = WORK_DIR / ".venv" + python = venv_dir / "bin" / "python" + + if INSTALL_GRAPHITI: + venv_record = run_command("python-venv", [sys.executable, "-m", "venv", str(venv_dir)], WORK_DIR) + command_records.append(venv_record) + if venv_record.status != "pass": + return False, python + + install_record = run_command( + "graphiti-install", + [str(python), "-m", "pip", "install", "--disable-pip-version-check", GRAPHITI_PACKAGE], + WORK_DIR, + ) + command_records.append(install_record) + if install_record.status != "pass": + return False, python + elif not python.exists(): + command_records.append( + CommandRecord( + label="graphiti-install", + command=["graphiti-core"], + status="incomplete", + elapsed_ms=0.0, + stdout_artifact=None, + stderr_artifact=None, + returncode=None, + reason="Graphiti install was disabled and no venv python exists.", + ) + ) + return False, python + + return True, python + +def write_live_runner(path: Path) -> None: + """Write the isolated Graphiti execution script.""" + + payload = { + "run_id": RUN_ID, + "facts": temporal_facts(), + "query": "Who currently owns deployment method review, and who owned it historically?", + "falkordb": { + "host": FALKORDB_HOST, + "port": FALKORDB_PORT, + "database": FALKORDB_DATABASE, + }, + "models": { + "llm": LLM_MODEL, + "embedding": EMBEDDING_MODEL, + "api_base": API_BASE, + }, + } + input_path = WORK_DIR / "graphiti-live-input.json" + output_path = WORK_DIR / "graphiti-live-output.json" + write_json(input_path, payload) + script = f""" +import asyncio +import json +import os +import uuid +from datetime import datetime +from 
pathlib import Path + +from graphiti_core import Graphiti +from graphiti_core.driver.falkordb_driver import FalkorDriver +from graphiti_core.edges import EntityEdge +from graphiti_core.nodes import EntityNode + + +INPUT = Path({str(input_path)!r}) +OUTPUT = Path({str(output_path)!r}) + + +def parse_dt(value): + if value is None: + return None + return datetime.fromisoformat(value.replace("Z", "+00:00")) + + +async def main(): + data = json.loads(INPUT.read_text(encoding="utf-8")) + config = data["falkordb"] + driver = FalkorDriver( + host=config["host"], + port=config["port"], + username=os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_USERNAME") or None, + password=os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD") or None, + database=config.get("database") or "default_db", + ) + graphiti = Graphiti(graph_driver=driver) + try: + await graphiti.build_indices_and_constraints() + inserted = [] + for fact in data["facts"]: + group_id = data["run_id"] + source_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":source:" + fact["source"])) + target_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":target:" + fact["target"])) + edge_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":edge:" + fact["evidence_id"])) + source_node = EntityNode(uuid=source_uuid, name=fact["source"], group_id=group_id) + target_node = EntityNode(uuid=target_uuid, name=fact["target"], group_id=group_id) + edge = EntityEdge( + uuid=edge_uuid, + group_id=group_id, + source_node_uuid=source_uuid, + target_node_uuid=target_uuid, + created_at=parse_dt(fact["created_at"]), + name=fact["edge_name"], + fact=fact["fact"], + valid_at=parse_dt(fact["valid_at"]), + invalid_at=parse_dt(fact.get("invalid_at")), + ) + await graphiti.add_triplet(source_node, edge, target_node) + inserted.append({{"evidence_id": fact["evidence_id"], "uuid": edge_uuid}}) + + results = await graphiti.search(data["query"]) + serialized = [] + for edge in results: + serialized.append({{ + "uuid": getattr(edge, "uuid", None), 
+ "name": getattr(edge, "name", None), + "fact": getattr(edge, "fact", None), + "valid_at": str(getattr(edge, "valid_at", "")) if getattr(edge, "valid_at", None) else None, + "invalid_at": str(getattr(edge, "invalid_at", "")) if getattr(edge, "invalid_at", None) else None, + "source_node_uuid": getattr(edge, "source_node_uuid", None), + "target_node_uuid": getattr(edge, "target_node_uuid", None), + }}) + + OUTPUT.write_text(json.dumps({{"inserted": inserted, "results": serialized}}, indent=2, sort_keys=True) + "\\n", encoding="utf-8") + finally: + await graphiti.close() + + +asyncio.run(main()) +""" + path.write_text(textwrap.dedent(script).lstrip(), encoding="utf-8") + +def run_graphiti(python: Path, command_records: list[CommandRecord]) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Run the Graphiti live worker and return inserted/search result facts.""" + + runner = WORK_DIR / "graphiti_live_runner.py" + write_live_runner(runner) + env = { + "OPENAI_API_KEY": API_KEY, + "MODEL_NAME": LLM_MODEL, + "LLM_MODEL": LLM_MODEL, + "EMBEDDING_MODEL": EMBEDDING_MODEL, + } + + if API_BASE: + env["OPENAI_BASE_URL"] = API_BASE + if FALKORDB_USERNAME: + env["ELF_GRAPHITI_ZEP_FALKORDB_USERNAME"] = FALKORDB_USERNAME + if FALKORDB_PASSWORD: + env["ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD"] = FALKORDB_PASSWORD + + record = run_command("graphiti-live-run", [str(python), str(runner)], WORK_DIR, extra_env=env) + command_records.append(record) + + output_path = WORK_DIR / "graphiti-live-output.json" + if record.status != "pass" or not output_path.exists(): + return [], [] + + payload = json.loads(output_path.read_text(encoding="utf-8")) + return payload.get("inserted", []), payload.get("results", []) diff --git a/scripts/graphiti_temporal_smoke/summary.py b/scripts/graphiti_temporal_smoke/summary.py new file mode 100644 index 00000000..a2b0187b --- /dev/null +++ b/scripts/graphiti_temporal_smoke/summary.py @@ -0,0 +1,35 @@ +"""Summary writer for the Graphiti/Zep smoke.""" + 
+from __future__ import annotations + +from typing import Any + +from .common import rel, utc_now, write_json +from .context import MANIFEST_OUT, SUMMARY_OUT + +def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: + """Write a small summary artifact.""" + + write_json( + SUMMARY_OUT, + { + "schema": "elf.graphiti_zep_temporal_smoke_summary/v1", + "generated_at": utc_now(), + "adapter_id": "graphiti_zep_temporal_smoke", + "evidence_class": materialization["evidence_class"], + "status_boundary": { + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "manifest": "external adapter declaration consumed by the scorer", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status", + }, + "scored_benchmark": materialization["scored_benchmark"], + "materialization": materialization, + "manifest": { + "json": rel(MANIFEST_OUT), + "status_source": "external_adapter_manifest_pre_score", + "summary": manifest["adapters"][0]["overall_status"], + "suites": manifest["adapters"][0]["suites"], + }, + "report": report, + }, + ) diff --git a/scripts/letta-core-archive-export-readback-smoke.py b/scripts/letta-core-archive-export-readback-smoke.py index ee31ffdc..fb7f4e85 100755 --- a/scripts/letta-core-archive-export-readback-smoke.py +++ b/scripts/letta-core-archive-export-readback-smoke.py @@ -3,1059 +3,7 @@ from __future__ import annotations -import json -import os -import shutil -import subprocess -import sys -import time -import urllib.error -import urllib.request -from dataclasses import dataclass -from datetime import datetime, timezone -from pathlib import Path -from typing import Any - - -SCRIPT_DIR = Path(__file__).resolve().parent -ROOT_DIR = SCRIPT_DIR.parent -CORE_FIXTURE_DIR = ROOT_DIR / "apps" / "elf-eval" / "fixtures" / "real_world_memory" / "core_archival_memory" -REPORT_DIR = Path( - os.environ.get( - "ELF_LETTA_SMOKE_REPORT_DIR", - ROOT_DIR / "tmp" / 
"real-world-memory" / "letta-core-archive", - ) -) -WORK_DIR = Path(os.environ.get("ELF_LETTA_SMOKE_WORK_DIR", REPORT_DIR / "work")) -OUT = Path(os.environ.get("ELF_LETTA_SMOKE_OUT", REPORT_DIR / "letta-core-archive-export.json")) -MANIFEST_OUT = Path( - os.environ.get( - "ELF_LETTA_SMOKE_MANIFEST_OUT", - REPORT_DIR / "memory_projects_manifest.letta-core-archive.json", - ) -) -SUMMARY_OUT = Path(os.environ.get("ELF_LETTA_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) -REPORT_JSON = Path(os.environ.get("ELF_LETTA_SMOKE_REPORT_JSON", REPORT_DIR / "report.json")) -REPORT_MD = Path(os.environ.get("ELF_LETTA_SMOKE_REPORT_MD", REPORT_DIR / "report.md")) -FIXTURE_DIR = REPORT_DIR / "letta-fixtures" -LOG_DIR = REPORT_DIR / "logs" - -RUN_ID = os.environ.get( - "ELF_LETTA_SMOKE_RUN_ID", - f"letta-core-archive-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", -) -RUN_LIVE = os.environ.get("ELF_LETTA_SMOKE_RUN", "0") == "1" -ALLOW_HOST = os.environ.get("ELF_LETTA_SMOKE_ALLOW_HOST", "0") == "1" -INSTALL_CLIENT = os.environ.get("ELF_LETTA_SMOKE_INSTALL_CLIENT", "1") == "1" -LETTA_BASE_URL = os.environ.get("ELF_LETTA_BASE_URL", "http://letta:8283") -LETTA_CLIENT_PACKAGE = os.environ.get("ELF_LETTA_CLIENT_PACKAGE", "letta-client") -LETTA_CLIENT_REF = os.environ.get("ELF_LETTA_CLIENT_REF", f"pypi:{LETTA_CLIENT_PACKAGE}") -LETTA_MODEL = os.environ.get("ELF_LETTA_MODEL", "openai/gpt-4o-mini") -LETTA_EMBEDDING = os.environ.get("ELF_LETTA_EMBEDDING", "openai/text-embedding-3-small") -TIMEOUT_SECONDS = int(os.environ.get("ELF_LETTA_TIMEOUT_SECONDS", "600")) -STARTUP_ATTEMPTS = int(os.environ.get("ELF_LETTA_STARTUP_ATTEMPTS", "30")) -STARTUP_INTERVAL_SECONDS = float(os.environ.get("ELF_LETTA_STARTUP_INTERVAL_SECONDS", "2")) - -CORE_KINDS = {"core_block", "core_block_contract", "core_block_event"} - - -@dataclass -class StatusState: - """Typed status for generated Letta smoke artifacts.""" - - setup: str = "blocked" - run: str = "not_encoded" - result: str = "blocked" - overall: 
str = "blocked" - evidence_class: str = "research_gate" - failure_class: str = "letta_live_run_disabled" - failure_reason: str = ( - "Letta live export/readback is disabled by default; run " - "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make " - "smoke-letta-core-archive-export-readback with explicit Docker/provider configuration." - ) - - -@dataclass -class CommandRecord: - """Captured command result without secret-bearing environment values.""" - - label: str - command: list[str] - status: str - elapsed_ms: float - stdout_artifact: str | None - stderr_artifact: str | None - returncode: int | None - reason: str - - -def utc_now() -> str: - """Return an RFC3339 UTC timestamp.""" - - return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") - - -def rel(path: Path) -> str: - """Return a repository-relative path when possible.""" - - try: - return str(path.resolve().relative_to(ROOT_DIR)) - except ValueError: - return str(path) - - -def mkdirs() -> None: - """Create and reset output directories owned by this smoke.""" - - for path in (FIXTURE_DIR, LOG_DIR): - if path.exists(): - shutil.rmtree(path) - - for path in (REPORT_DIR, WORK_DIR, FIXTURE_DIR, LOG_DIR): - path.mkdir(parents=True, exist_ok=True) - - for path in (OUT, MANIFEST_OUT, SUMMARY_OUT, REPORT_JSON, REPORT_MD): - if path.exists(): - path.unlink() - - -def write_json(path: Path, payload: Any) -> None: - """Write stable, pretty JSON.""" - - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") - - -def command_available(name: str) -> bool: - """Return whether a command is available.""" - - return shutil.which(name) is not None - - -def run_command( - label: str, - command: list[str], - cwd: Path, - *, - extra_env: dict[str, str] | None = None, -) -> CommandRecord: - """Run a command and capture stdout/stderr artifacts.""" - - started = time.monotonic() - env = os.environ.copy() - if 
extra_env: - env.update(extra_env) - - try: - result = subprocess.run( - command, - cwd=cwd, - env=env, - text=True, - capture_output=True, - timeout=TIMEOUT_SECONDS, - check=False, - ) - elapsed = (time.monotonic() - started) * 1000 - stdout_path = LOG_DIR / f"{label}.stdout.txt" - stderr_path = LOG_DIR / f"{label}.stderr.txt" - stdout_path.write_text(result.stdout, encoding="utf-8") - stderr_path.write_text(result.stderr, encoding="utf-8") - status = "pass" if result.returncode == 0 else "incomplete" - reason = "command completed" if result.returncode == 0 else f"exit code {result.returncode}" - - return CommandRecord( - label=label, - command=command, - status=status, - elapsed_ms=elapsed, - stdout_artifact=rel(stdout_path), - stderr_artifact=rel(stderr_path), - returncode=result.returncode, - reason=reason, - ) - except subprocess.TimeoutExpired as exc: - elapsed = (time.monotonic() - started) * 1000 - stdout_path = LOG_DIR / f"{label}.stdout.txt" - stderr_path = LOG_DIR / f"{label}.stderr.txt" - stdout_path.write_text(exc.stdout or "", encoding="utf-8") - stderr_path.write_text(exc.stderr or "", encoding="utf-8") - - return CommandRecord( - label=label, - command=command, - status="incomplete", - elapsed_ms=elapsed, - stdout_artifact=rel(stdout_path), - stderr_artifact=rel(stderr_path), - returncode=None, - reason=f"timed out after {TIMEOUT_SECONDS}s", - ) - - -def command_to_json(record: CommandRecord) -> dict[str, Any]: - """Serialize a command record.""" - - return { - "label": record.label, - "command": record.command, - "status": record.status, - "elapsed_ms": round(record.elapsed_ms, 3), - "stdout_artifact": record.stdout_artifact, - "stderr_artifact": record.stderr_artifact, - "returncode": record.returncode, - "reason": record.reason, - } - - -def load_source_fixtures() -> list[dict[str, Any]]: - """Load the checked-in core_archival_memory fixture corpus.""" - - fixtures = [] - for path in sorted(CORE_FIXTURE_DIR.glob("*.json")): - payload = 
json.loads(path.read_text(encoding="utf-8")) - payload["_source_path"] = rel(path) - fixtures.append(payload) - - return fixtures - - -def evidence_ids_for_fixture(fixture: dict[str, Any]) -> list[str]: - """Return required evidence ids for one fixture.""" - - return [ - item["evidence_id"] - for item in fixture.get("required_evidence", []) - if isinstance(item, dict) and item.get("evidence_id") - ] - - -def all_required_evidence_ids(fixtures: list[dict[str, Any]]) -> list[str]: - """Return de-duplicated required evidence ids.""" - - ids: list[str] = [] - for fixture in fixtures: - for evidence_id in evidence_ids_for_fixture(fixture): - if evidence_id not in ids: - ids.append(evidence_id) - - return ids - - -def source_items(fixtures: list[dict[str, Any]]) -> list[dict[str, Any]]: - """Flatten fixture corpus items with job metadata.""" - - items = [] - for fixture in fixtures: - for item in fixture.get("corpus", {}).get("items", []): - item_copy = dict(item) - item_copy["job_id"] = fixture["job_id"] - item_copy["fixture_source"] = fixture["_source_path"] - items.append(item_copy) - - return items - - -def benchmark_input_contract(fixtures: list[dict[str, Any]]) -> dict[str, Any]: - """Return the benchmark-owned Letta input contract.""" - - core_blocks = [] - archival_passages = [] - for item in source_items(fixtures): - record = { - "source_id": item["evidence_id"], - "job_id": item["job_id"], - "kind": item.get("kind"), - "text": item.get("text", ""), - "fixture_source": item["fixture_source"], - } - if item.get("kind") in CORE_KINDS: - core_blocks.append( - { - "label": slug(item["evidence_id"])[:48], - "value": f"Source ID: {item['evidence_id']}\n{item.get('text', '')}", - **record, - } - ) - elif item.get("kind") not in {"stale_claim", "unsupported_claim"}: - archival_passages.append( - { - "text": f"Source ID: {item['evidence_id']}\n{item.get('text', '')}", - **record, - } - ) - - return { - "core_blocks": core_blocks, - "archival_passages": archival_passages, 
- "source_id_count": len({item["evidence_id"] for item in source_items(fixtures)}), - "required_evidence_ids": all_required_evidence_ids(fixtures), - } - - -def slug(value: str) -> str: - """Return a small ASCII slug.""" - - out: list[str] = [] - last_dash = False - - for char in value.lower(): - if char.isascii() and char.isalnum(): - out.append(char) - last_dash = False - elif not last_dash and out: - out.append("-") - last_dash = True - - while out and out[-1] == "-": - out.pop() - - return "".join(out) or "item" - - -def wait_for_letta(command_records: list[CommandRecord]) -> bool: - """Wait for a Letta server endpoint to become reachable.""" - - started = time.monotonic() - probes = ["/v1/health", "/health", "/v1/models"] - last_reason = "not attempted" - for _ in range(STARTUP_ATTEMPTS): - for path in probes: - url = LETTA_BASE_URL.rstrip("/") + path - try: - with urllib.request.urlopen(url, timeout=5) as response: - if 200 <= response.status < 500: - command_records.append( - CommandRecord( - label="letta-health-probe", - command=["GET", url], - status="pass", - elapsed_ms=(time.monotonic() - started) * 1000, - stdout_artifact=None, - stderr_artifact=None, - returncode=0, - reason=f"reachable via {path}", - ) - ) - return True - except (urllib.error.URLError, TimeoutError, OSError) as exc: - last_reason = str(exc) - - time.sleep(STARTUP_INTERVAL_SECONDS) - - command_records.append( - CommandRecord( - label="letta-health-probe", - command=["GET", LETTA_BASE_URL.rstrip() + "/v1/health"], - status="incomplete", - elapsed_ms=(time.monotonic() - started) * 1000, - stdout_artifact=None, - stderr_artifact=None, - returncode=None, - reason=last_reason, - ) - ) - return False - - -def init_letta_client(command_records: list[CommandRecord]) -> bool: - """Install or verify the Letta Python client.""" - - if INSTALL_CLIENT: - record = run_command( - "letta-client-install", - [sys.executable, "-m", "pip", "install", LETTA_CLIENT_PACKAGE], - WORK_DIR, - ) - 
command_records.append(record) - if record.status != "pass": - return False - - record = run_command("letta-client-import", [sys.executable, "-c", "import letta_client"], WORK_DIR) - command_records.append(record) - - return record.status == "pass" - - -def write_live_runner(fixtures: list[dict[str, Any]]) -> Path: - """Write a small Python runner that uses the current Letta SDK.""" - - contract = benchmark_input_contract(fixtures) - input_path = WORK_DIR / "letta-live-input.json" - write_json(input_path, contract) - - runner = WORK_DIR / "letta_live_runner.py" - runner.write_text( - """ -import json -import os -from pathlib import Path - -from letta_client import Letta - - -def as_dict(value): - if hasattr(value, "model_dump"): - return value.model_dump(mode="json") - if hasattr(value, "dict"): - return value.dict() - return json.loads(json.dumps(value, default=str)) - - -input_path = Path(os.environ["ELF_LETTA_LIVE_INPUT"]) -output_path = Path(os.environ["ELF_LETTA_LIVE_OUTPUT"]) -data = json.loads(input_path.read_text()) - -client = Letta(base_url=os.environ["ELF_LETTA_BASE_URL"]) -agent = client.agents.create( - name=os.environ.get("ELF_LETTA_AGENT_NAME", "elf-core-archive-smoke"), - model=os.environ["ELF_LETTA_MODEL"], - embedding=os.environ["ELF_LETTA_EMBEDDING"], - memory_blocks=[ - {"label": item["label"], "value": item["value"]} - for item in data["core_blocks"] - ], -) - -created_passages = [] -for passage in data["archival_passages"]: - created_passages.append( - as_dict(client.agents.passages.create(agent_id=agent.id, text=passage["text"])) - ) - -core_block_export = [] -for item in data["core_blocks"]: - core_block_export.append( - { - "source_id": item["source_id"], - "label": item["label"], - "block": as_dict( - client.agents.blocks.retrieve(agent_id=agent.id, block_label=item["label"]) - ), - } - ) - -listed_passages = as_dict(client.agents.passages.list(agent_id=agent.id)) -search_results = [] -for source_id in data["required_evidence_ids"]: - 
search_results.append( - { - "query": source_id, - "response": as_dict( - client.agents.passages.search(agent_id=agent.id, query=source_id, top_k=5) - ), - } - ) - -output_path.write_text( - json.dumps( - { - "agent": as_dict(agent), - "core_block_export": core_block_export, - "created_passages": created_passages, - "archival_readback": listed_passages, - "archival_search": search_results, - }, - indent=2, - sort_keys=True, - ) - + "\\n" -) -""".lstrip(), - encoding="utf-8", - ) - - return runner - - -def run_letta(fixtures: list[dict[str, Any]], command_records: list[CommandRecord]) -> dict[str, Any] | None: - """Create the Letta benchmark agent and export readback/search data.""" - - runner = write_live_runner(fixtures) - output_path = WORK_DIR / "letta-live-output.json" - env = { - "ELF_LETTA_BASE_URL": LETTA_BASE_URL, - "ELF_LETTA_MODEL": LETTA_MODEL, - "ELF_LETTA_EMBEDDING": LETTA_EMBEDDING, - "ELF_LETTA_LIVE_INPUT": str(WORK_DIR / "letta-live-input.json"), - "ELF_LETTA_LIVE_OUTPUT": str(output_path), - "ELF_LETTA_AGENT_NAME": f"elf-core-archive-smoke-{RUN_ID}", - } - record = run_command("letta-live-export-readback", [sys.executable, str(runner)], WORK_DIR, extra_env=env) - command_records.append(record) - if record.status != "pass" or not output_path.exists(): - return None - - return json.loads(output_path.read_text(encoding="utf-8")) - - -def ids_in_payload(payload: Any, evidence_ids: list[str]) -> list[str]: - """Return evidence ids present anywhere in a JSON-compatible payload.""" - - haystack = json.dumps(payload, sort_keys=True, default=str) - return [evidence_id for evidence_id in evidence_ids if evidence_id in haystack] - - -def evidence_mapping( - fixtures: list[dict[str, Any]], - live_export: dict[str, Any] | None, - status: StatusState, -) -> dict[str, Any]: - """Map observed Letta export/readback data to fixture source ids.""" - - required_ids = all_required_evidence_ids(fixtures) - if live_export is None: - mapped_ids: list[str] = [] - else: - 
mapped_ids = ids_in_payload(live_export, required_ids) - - missing_ids = [evidence_id for evidence_id in required_ids if evidence_id not in mapped_ids] - jobs = [] - for fixture in fixtures: - expected = evidence_ids_for_fixture(fixture) - mapped = [evidence_id for evidence_id in expected if evidence_id in mapped_ids] - if status.result in {"blocked", "incomplete", "not_encoded"}: - job_status = status.result - reason = status.failure_reason - elif len(mapped) == len(expected): - job_status = "pass" - reason = "Letta core block export and archival readback/search mapped all required source ids." - else: - job_status = "wrong_result" - missing = [evidence_id for evidence_id in expected if evidence_id not in mapped] - reason = f"Letta export/readback missed required evidence ids: {', '.join(missing)}." - - jobs.append( - { - "job_id": fixture["job_id"], - "source_fixture": fixture["_source_path"], - "expected_evidence_ids": expected, - "mapped_evidence_ids": mapped, - "missing_evidence_ids": [evidence_id for evidence_id in expected if evidence_id not in mapped], - "status": job_status, - "reason": reason, - } - ) - - return { - "status": status.result if missing_ids or live_export is None else "pass", - "reason": status.failure_reason - if live_export is None - else ( - "Letta export/readback mapped all required fixture source ids." - if not missing_ids - else f"Letta export/readback missed required evidence ids: {', '.join(missing_ids)}." 
- ), - "expected_evidence_ids": required_ids, - "mapped_evidence_ids": mapped_ids, - "missing_evidence_ids": missing_ids, - "jobs": jobs, - } - - -def write_fixture_outputs( - fixtures: list[dict[str, Any]], - status: StatusState, - mapping: dict[str, Any], -) -> Path: - """Write generated Letta real_world_job fixtures.""" - - for fixture in fixtures: - generated = json.loads(json.dumps({k: v for k, v in fixture.items() if k != "_source_path"})) - generated["corpus"]["profile"] = "external_adapter" - generated["corpus"]["corpus_id"] = "letta-core-archive-export-readback-2026-06-19" - job_mapping = next(item for item in mapping["jobs"] if item["job_id"] == fixture["job_id"]) - source_answer = fixture.get("corpus", {}).get("adapter_response", {}).get("answer", {}) - generated["corpus"]["adapter_response"] = { - "adapter_id": "letta_core_archive_export_readback", - "answer": { - "content": source_answer.get("content", ""), - "claims": source_answer.get("claims", []), - "evidence_ids": evidence_ids_for_fixture(fixture), - "latency_ms": 0.0, - "cost": { - "currency": "USD", - "amount": 0.0, - "input_tokens": 0, - "output_tokens": 0, - }, - }, - } - generated["tags"] = sorted(set(generated.get("tags", []) + ["external_adapter", "letta_export_readback"])) - generated["encoding"] = {} - if job_mapping["status"] in {"blocked", "incomplete", "not_encoded"}: - generated["encoding"] = { - "status": job_mapping["status"], - "reason": job_mapping["reason"], - "follow_up": { - "title": "Produce Letta core/archive export-readback evidence", - "reason": ( - "The benchmark must export Letta core block JSON, archival readback/search JSON, " - "and fixture source ids before this scenario can be scored as pass or wrong_result." 
- ), - }, - } - - if job_mapping["status"] == "wrong_result": - generated["corpus"]["adapter_response"]["answer"]["evidence_ids"] = job_mapping[ - "mapped_evidence_ids" - ] - - fixture_path = FIXTURE_DIR / "core_archival_memory" / Path(fixture["_source_path"]).name - write_json(fixture_path, generated) - - return FIXTURE_DIR / "core_archival_memory" - - -def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: - """Score the generated Letta fixtures through the real-world job runner.""" - - run_cmd = [ - "cargo", - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "run", - "--fixtures", - str(fixture_path), - "--out", - str(REPORT_JSON), - "--run-id", - "real-world-memory-live-letta-core-archive", - "--adapter-id", - "letta_core_archive_export_readback", - "--adapter-name", - "Letta core/archive export-readback adapter", - "--adapter-behavior", - "docker_core_archive_export_readback", - "--adapter-storage-status", - status.setup, - "--adapter-runtime-status", - status.overall, - "--adapter-notes", - "Generated by the Letta core/archive export-readback smoke; pass requires exported core block JSON, archival readback/search JSON, and mapped fixture source ids.", - "--external-adapter-manifest", - str(manifest_path), - ] - publish_cmd = [ - "cargo", - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "publish", - "--report", - str(REPORT_JSON), - "--out", - str(REPORT_MD), - ] - - subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) - subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) - - report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) - return { - "json": rel(REPORT_JSON), - "markdown": rel(REPORT_MD), - "summary": report.get("summary", {}), - "suites": report.get("suites", []), - } - - -def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: - """Extract the post-score benchmark status from a real_world_job report.""" - - if report is 
None: - return { - "schema": "elf.scored_benchmark_status/v1", - "source": "real_world_job_benchmark", - "status": "pending", - "reason": "The Letta smoke materialization was written before benchmark scoring completed.", - } - - summary = report.get("summary", {}) - counts = { - status: int(summary.get(status, 0) or 0) - for status in ("pass", "wrong_result", "lifecycle_fail", "incomplete", "blocked", "not_encoded") - } - status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") - - return { - "schema": "elf.scored_benchmark_status/v1", - "source": "real_world_job_benchmark", - "status": status, - "counts": counts, - "job_count": int(summary.get("job_count", 0) or 0), - "mean_score": summary.get("mean_score"), - "evidence_coverage": summary.get("evidence_coverage"), - } - - -def write_materialization( - status: StatusState, - fixtures: list[dict[str, Any]], - fixture_path: Path, - command_records: list[CommandRecord], - live_export: dict[str, Any] | None, - mapping: dict[str, Any], - started_at: float, - report: dict[str, Any] | None = None, -) -> dict[str, Any]: - """Write the primary Letta materialization artifact.""" - - elapsed_ms = (time.monotonic() - started_at) * 1000 - payload = { - "schema": "elf.letta_core_archive_export_readback/v1", - "generated_at": utc_now(), - "run_id": RUN_ID, - "adapter_id": "letta_core_archive_export_readback", - "project": "Letta", - "evidence_class": status.evidence_class, - "status": { - "source": "smoke_materialization", - "setup": status.setup, - "run": status.run, - "result": status.result, - "overall": status.overall, - "failure_class": status.failure_class, - "failure_reason": status.failure_reason, - }, - "scored_benchmark": scored_benchmark(report), - "artifacts": { - "materialization": rel(OUT), - "manifest": rel(MANIFEST_OUT), - "summary": rel(SUMMARY_OUT), - "generated_fixture_dir": rel(fixture_path), - "scored_report_json": rel(REPORT_JSON), - "scored_report_markdown": 
rel(REPORT_MD), - "live_output": rel(WORK_DIR / "letta-live-output.json") - if (WORK_DIR / "letta-live-output.json").exists() - else None, - }, - "docker_boundary": { - "compose_file": "docker-compose.baseline.yml", - "service_profile": "letta", - "runner_service": "baseline-runner", - "runner": "scripts/letta-core-archive-export-readback-smoke.py", - "host_global_installs_required": False, - "docker_only": True, - "host_global_letta_state_used": False, - "hosted_letta_state_used": False, - }, - "provider_configuration": { - "base_url": LETTA_BASE_URL, - "client_package": LETTA_CLIENT_REF, - "model": LETTA_MODEL, - "embedding": LETTA_EMBEDDING, - "live_run_enabled": RUN_LIVE, - "operator_owned_provider_credentials_used": False, - }, - "benchmark_input": benchmark_input_contract(fixtures), - "letta_export": { - "core_block_json": live_export.get("core_block_export", []) if live_export else [], - "archival_readback_json": live_export.get("archival_readback") if live_export else None, - "archival_search_json": live_export.get("archival_search", []) if live_export else [], - "agent": live_export.get("agent") if live_export else None, - "status": "exported" if live_export else status.result, - }, - "resource_bounds": { - "source_fixture_count": len(fixtures), - "core_block_count": len(benchmark_input_contract(fixtures)["core_blocks"]), - "archival_passage_count": len(benchmark_input_contract(fixtures)["archival_passages"]), - "timeout_seconds": TIMEOUT_SECONDS, - "elapsed_ms": round(elapsed_ms, 3), - }, - "commands": [command_to_json(record) for record in command_records], - "evidence_mapping": mapping, - "improvement_regression_readback": { - "baseline": "XY-955 left Letta core/archive comparison blocked because no contained export/readback artifact existed.", - "current": ( - "unchanged: the benchmark now has a Docker-contained materialization command and typed report, " - "but the default run still preserves Letta comparison as blocked until live export/search data 
maps source ids." - ) - if status.result != "pass" - else "improved: Letta export/readback mapped all required core/archive source ids.", - "judgment": "improved" if status.result == "pass" else "unchanged", - }, - "claim_boundaries": { - "allowed": [ - "The Letta comparison now has a reproducible Docker-contained materialization/report command.", - "The current default report may preserve typed blockers when live Letta/provider setup cannot produce export/readback evidence.", - ], - "not_allowed": [ - "Do not claim ELF beats Letta on core-vs-archival memory from fixture-only ELF evidence.", - "Do not score Letta pass, win, tie, or loss unless exported core block JSON, archival readback/search JSON, and fixture source ids are present.", - ], - }, - } - write_json(OUT, payload) - - return payload - - -def write_manifest(status: StatusState) -> dict[str, Any]: - """Write a generated external adapter manifest for this smoke.""" - - manifest = { - "schema": "elf.real_world_external_adapter_manifest/v1", - "manifest_id": f"letta-core-archive-export-readback-{RUN_ID}", - "docker_isolation": { - "default": True, - "compose_file": "docker-compose.baseline.yml", - "runner": "scripts/letta-core-archive-export-readback-smoke.py", - "artifact_dir": "tmp/real-world-memory/letta-core-archive", - "host_global_installs_required": False, - "notes": [ - f"Generated by the Letta core/archive export-readback smoke at {utc_now()}.", - "The smoke uses checked-in core_archival_memory fixtures and records typed setup/runtime failures.", - ], - }, - "adapters": [ - { - "adapter_id": "letta_core_archive_export_readback", - "project": "Letta", - "adapter_kind": "docker_core_archive_export_readback", - "evidence_class": status.evidence_class, - "docker_default": True, - "host_global_installs_required": False, - "overall_status": status.overall, - "setup": { - "status": status.setup, - "evidence": "The smoke runs inside the baseline Docker runner and can use a Docker-profile Letta server with 
explicit model and embedding configuration.", - "command": "cargo make smoke-letta-core-archive-export-readback", - "artifact": rel(OUT), - }, - "run": { - "status": status.run, - "evidence": "The live path creates a benchmark-owned Letta agent, imports fixture source ids into core blocks and archival passages, then exports block/readback/search JSON.", - "command": "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback", - "artifact": rel(OUT), - }, - "result": { - "status": status.result, - "evidence": status.failure_reason - if status.failure_reason - else "Letta core block export, archival readback, and archival search mapped required fixture source ids.", - "artifact": rel(OUT), - }, - "capabilities": [ - { - "capability": "docker_letta_server_boundary", - "status": status.setup, - "evidence": "The runner uses docker-compose.baseline.yml and avoids host-global Letta state or hosted/private agents.", - }, - { - "capability": "core_block_export", - "status": status.run, - "evidence": "Live scoring requires retrieving Letta memory blocks with fixture source ids embedded in block values.", - }, - { - "capability": "archival_readback_search_export", - "status": status.result, - "evidence": "Live scoring requires archival passage list/search JSON to map required source ids.", - }, - { - "capability": "broad_letta_quality_claim", - "status": "not_encoded", - "evidence": "The smoke does not claim broad Letta product quality, private corpus behavior, or hosted-service parity.", - }, - ], - "suites": [ - { - "suite_id": "core_archival_memory", - "status": status.result, - "evidence": "Only the six checked-in core_archival_memory scenarios are represented.", - }, - { - "suite_id": "personalization", - "status": "not_encoded", - "evidence": "Scoped preference behavior is outside this core/archive export smoke.", - }, - { - "suite_id": "project_decisions", - "status": status.result, - "evidence": "Project-decision recovery is 
scored only through the core_archival_memory fixture that requires core routing plus archival rationale source ids.", - }, - { - "suite_id": "work_resume", - "status": "not_encoded", - "evidence": "Agent resumption across sessions is not encoded by this export/readback smoke.", - }, - ], - "evidence": [ - {"kind": "artifact", "ref": rel(OUT), "status": status.result}, - {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall}, - {"kind": "source", "ref": "https://docs.letta.com/guides/docker", "status": "real"}, - {"kind": "source", "ref": "https://docs.letta.com/api/python", "status": "real"}, - { - "kind": "source", - "ref": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", - "status": "real", - }, - ], - "execution_metadata": { - "sources": [ - { - "label": "Letta Docker docs", - "url": "https://docs.letta.com/guides/docker", - "evidence": "Official Docker setup and explicit embedding configuration boundary.", - }, - { - "label": "Letta Python API", - "url": "https://docs.letta.com/api/python", - "evidence": "Official Python SDK memory block creation and retrieval examples.", - }, - { - "label": "Letta archival search API", - "url": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", - "evidence": "Official archival-memory search endpoint contract.", - }, - ], - "setup_path": "Run cargo make smoke-letta-core-archive-export-readback for a typed artifact; set ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 with explicit model/provider configuration for a live export attempt.", - "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus optional Letta server profile, benchmark-created agent, benchmark-owned fixture corpus, and artifacts under tmp/real-world-memory/letta-core-archive.", - "resource_expectation": f"Letta client {LETTA_CLIENT_REF}, model={LETTA_MODEL}, embedding={LETTA_EMBEDDING}, source fixture count=6, timeout_seconds={TIMEOUT_SECONDS}.", - "retry_guidance": 
[ - "Default command records a typed blocked artifact without model calls.", - "Enable the live path only with a Docker-local Letta server and explicit provider or local model configuration.", - "Score only when core block export and archival list/search output map to required fixture source ids.", - ], - "research_depth": "XY-984 materialization contract; generated artifact decides live evidence class.", - }, - "notes": [ - "Failure before Letta export/readback remains typed as blocked or incomplete.", - "The smoke does not use hosted/private Letta state or operator-owned data.", - ], - } - ], - } - write_json(MANIFEST_OUT, manifest) - - return manifest - - -def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: - """Write a small summary artifact.""" - - write_json( - SUMMARY_OUT, - { - "schema": "elf.letta_core_archive_export_readback_summary/v1", - "generated_at": utc_now(), - "adapter_id": "letta_core_archive_export_readback", - "evidence_class": materialization["evidence_class"], - "status_boundary": { - "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", - "manifest": "external adapter declaration consumed by the scorer", - "scored_benchmark": "post-score real_world_job outcome; use this for quality status", - }, - "scored_benchmark": materialization["scored_benchmark"], - "materialization": materialization, - "manifest": { - "json": rel(MANIFEST_OUT), - "status_source": "external_adapter_manifest_score_aligned", - "summary": manifest["adapters"][0]["overall_status"], - "suites": manifest["adapters"][0]["suites"], - }, - "report": report, - }, - ) - - -def main() -> int: - """Run the smoke and always emit typed artifacts when possible.""" - - started_at = time.monotonic() - mkdirs() - status = StatusState() - command_records: list[CommandRecord] = [] - fixtures = load_source_fixtures() - live_export: dict[str, Any] | None = None - - if not Path("/.dockerenv").exists() and 
not ALLOW_HOST: - status.setup = "incomplete" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "not_running_in_docker" - status.failure_reason = "Letta smoke must run inside Docker; use cargo make smoke-letta-core-archive-export-readback." - elif not command_available("python3"): - status.setup = "incomplete" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "python_missing" - status.failure_reason = "python3 is required for the Letta smoke runner." - elif not RUN_LIVE: - pass - elif not wait_for_letta(command_records): - status.setup = "incomplete" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "letta_server_unreachable" - status.failure_reason = "Docker-local Letta server did not become reachable for export/readback." - elif not init_letta_client(command_records): - status.setup = "incomplete" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "letta_client_setup_failed" - status.failure_reason = "Letta Python client installation or import failed inside the Docker runner." - else: - status.setup = "pass" - live_export = run_letta(fixtures, command_records) - if live_export is None: - status.run = "incomplete" - status.result = "incomplete" - status.overall = "incomplete" - status.failure_class = "letta_export_readback_failed" - status.failure_reason = "Letta did not produce core block export plus archival readback/search output." 
- else: - status.run = "pass" - status.evidence_class = "live_real_world" - mapping = evidence_mapping(fixtures, live_export, status) - if not mapping["missing_evidence_ids"]: - status.result = "pass" - status.overall = "pass" - status.failure_class = "" - status.failure_reason = "" - else: - status.result = "wrong_result" - status.overall = "wrong_result" - status.failure_class = "letta_source_id_mapping_failed" - status.failure_reason = mapping["reason"] - - mapping = evidence_mapping(fixtures, live_export, status) - fixture_path = write_fixture_outputs(fixtures, status, mapping) - write_materialization( - status, - fixtures, - fixture_path, - command_records, - live_export, - mapping, - started_at, - ) - manifest = write_manifest(status) - report = run_scored_report(fixture_path, MANIFEST_OUT, status) - materialization = write_materialization( - status, - fixtures, - fixture_path, - command_records, - live_export, - mapping, - started_at, - report, - ) - write_summary(materialization, manifest, report) - print(f"Letta core/archive artifact: {OUT}") - print(f"Letta core/archive manifest: {MANIFEST_OUT}") - print(f"Letta core/archive summary: {SUMMARY_OUT}") - - return 0 +from letta_core_archive_smoke.runner import main if __name__ == "__main__": diff --git a/scripts/letta_core_archive_smoke/__init__.py b/scripts/letta_core_archive_smoke/__init__.py new file mode 100644 index 00000000..9d9617aa --- /dev/null +++ b/scripts/letta_core_archive_smoke/__init__.py @@ -0,0 +1 @@ +"""Letta core/archive export-readback smoke modules.""" diff --git a/scripts/letta_core_archive_smoke/artifacts.py b/scripts/letta_core_archive_smoke/artifacts.py new file mode 100644 index 00000000..e4e12276 --- /dev/null +++ b/scripts/letta_core_archive_smoke/artifacts.py @@ -0,0 +1,280 @@ +"""Artifact writers for the Letta core/archive smoke.""" + +from __future__ import annotations + +import time +from pathlib import Path +from typing import Any + +from .benchmark import scored_benchmark 
def write_materialization(
    status: StatusState,
    fixtures: list[dict[str, Any]],
    fixture_path: Path,
    command_records: list[CommandRecord],
    live_export: dict[str, Any] | None,
    mapping: dict[str, Any],
    started_at: float,
    report: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Write the primary Letta materialization artifact.

    Args:
        status: Setup/run/result state copied into the ``status`` section.
        fixtures: Loaded source fixtures; used to build the benchmark input
            contract and the resource bounds.
        fixture_path: Directory holding the generated fixtures.
        command_records: Commands executed so far, serialized under ``commands``.
        live_export: Parsed live Letta output, or ``None`` when no live export
            was produced.
        mapping: Evidence mapping stored verbatim under ``evidence_mapping``.
        started_at: ``time.monotonic()`` reading taken when the smoke started.
        report: Scored report summary, or ``None`` before scoring has run.

    Returns:
        The payload that was written to ``OUT``.
    """

    elapsed_ms = (time.monotonic() - started_at) * 1000
    # Build the contract once; it is reused for the payload and its counters.
    contract = benchmark_input_contract(fixtures)
    live_output = WORK_DIR / "letta-live-output.json"
    payload = {
        "schema": "elf.letta_core_archive_export_readback/v1",
        "generated_at": utc_now(),
        "run_id": RUN_ID,
        "adapter_id": "letta_core_archive_export_readback",
        "project": "Letta",
        "evidence_class": status.evidence_class,
        "status": {
            "source": "smoke_materialization",
            "setup": status.setup,
            "run": status.run,
            "result": status.result,
            "overall": status.overall,
            "failure_class": status.failure_class,
            "failure_reason": status.failure_reason,
        },
        "scored_benchmark": scored_benchmark(report),
        "artifacts": {
            "materialization": rel(OUT),
            "manifest": rel(MANIFEST_OUT),
            "summary": rel(SUMMARY_OUT),
            "generated_fixture_dir": rel(fixture_path),
            "scored_report_json": rel(REPORT_JSON),
            "scored_report_markdown": rel(REPORT_MD),
            # Only reference the live output when the live runner produced it.
            "live_output": rel(live_output) if live_output.exists() else None,
        },
        "docker_boundary": {
            "compose_file": "docker-compose.baseline.yml",
            "service_profile": "letta",
            "runner_service": "baseline-runner",
            "runner": "scripts/letta-core-archive-export-readback-smoke.py",
            "host_global_installs_required": False,
            "docker_only": True,
            "host_global_letta_state_used": False,
            "hosted_letta_state_used": False,
        },
        "provider_configuration": {
            "base_url": LETTA_BASE_URL,
            "client_package": LETTA_CLIENT_REF,
            "model": LETTA_MODEL,
            "embedding": LETTA_EMBEDDING,
            "live_run_enabled": RUN_LIVE,
            "operator_owned_provider_credentials_used": False,
        },
        "benchmark_input": contract,
        "letta_export": {
            "core_block_json": live_export.get("core_block_export", []) if live_export else [],
            "archival_readback_json": live_export.get("archival_readback") if live_export else None,
            "archival_search_json": live_export.get("archival_search", []) if live_export else [],
            "agent": live_export.get("agent") if live_export else None,
            "status": "exported" if live_export else status.result,
        },
        "resource_bounds": {
            "source_fixture_count": len(fixtures),
            "core_block_count": len(contract["core_blocks"]),
            "archival_passage_count": len(contract["archival_passages"]),
            "timeout_seconds": TIMEOUT_SECONDS,
            "elapsed_ms": round(elapsed_ms, 3),
        },
        "commands": [command_to_json(record) for record in command_records],
        "evidence_mapping": mapping,
        "improvement_regression_readback": {
            "baseline": "XY-955 left Letta core/archive comparison blocked because no contained export/readback artifact existed.",
            "current": (
                "unchanged: the benchmark now has a Docker-contained materialization command and typed report, "
                "but the default run still preserves Letta comparison as blocked until live export/search data maps source ids."
            )
            if status.result != "pass"
            else "improved: Letta export/readback mapped all required core/archive source ids.",
            "judgment": "improved" if status.result == "pass" else "unchanged",
        },
        "claim_boundaries": {
            "allowed": [
                "The Letta comparison now has a reproducible Docker-contained materialization/report command.",
                "The current default report may preserve typed blockers when live Letta/provider setup cannot produce export/readback evidence.",
            ],
            "not_allowed": [
                "Do not claim ELF beats Letta on core-vs-archival memory from fixture-only ELF evidence.",
                "Do not score Letta pass, win, tie, or loss unless exported core block JSON, archival readback/search JSON, and fixture source ids are present.",
            ],
        },
    }
    write_json(OUT, payload)

    return payload
embedding configuration.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": rel(OUT), + }, + "run": { + "status": status.run, + "evidence": "The live path creates a benchmark-owned Letta agent, imports fixture source ids into core blocks and archival passages, then exports block/readback/search JSON.", + "command": "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback", + "artifact": rel(OUT), + }, + "result": { + "status": status.result, + "evidence": status.failure_reason + if status.failure_reason + else "Letta core block export, archival readback, and archival search mapped required fixture source ids.", + "artifact": rel(OUT), + }, + "capabilities": [ + { + "capability": "docker_letta_server_boundary", + "status": status.setup, + "evidence": "The runner uses docker-compose.baseline.yml and avoids host-global Letta state or hosted/private agents.", + }, + { + "capability": "core_block_export", + "status": status.run, + "evidence": "Live scoring requires retrieving Letta memory blocks with fixture source ids embedded in block values.", + }, + { + "capability": "archival_readback_search_export", + "status": status.result, + "evidence": "Live scoring requires archival passage list/search JSON to map required source ids.", + }, + { + "capability": "broad_letta_quality_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad Letta product quality, private corpus behavior, or hosted-service parity.", + }, + ], + "suites": [ + { + "suite_id": "core_archival_memory", + "status": status.result, + "evidence": "Only the six checked-in core_archival_memory scenarios are represented.", + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Scoped preference behavior is outside this core/archive export smoke.", + }, + { + "suite_id": "project_decisions", + "status": status.result, + "evidence": "Project-decision recovery is scored only through 
the core_archival_memory fixture that requires core routing plus archival rationale source ids.", + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption across sessions is not encoded by this export/readback smoke.", + }, + ], + "evidence": [ + {"kind": "artifact", "ref": rel(OUT), "status": status.result}, + {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall}, + {"kind": "source", "ref": "https://docs.letta.com/guides/docker", "status": "real"}, + {"kind": "source", "ref": "https://docs.letta.com/api/python", "status": "real"}, + { + "kind": "source", + "ref": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "status": "real", + }, + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker", + "evidence": "Official Docker setup and explicit embedding configuration boundary.", + }, + { + "label": "Letta Python API", + "url": "https://docs.letta.com/api/python", + "evidence": "Official Python SDK memory block creation and retrieval examples.", + }, + { + "label": "Letta archival search API", + "url": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "evidence": "Official archival-memory search endpoint contract.", + }, + ], + "setup_path": "Run cargo make smoke-letta-core-archive-export-readback for a typed artifact; set ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 with explicit model/provider configuration for a live export attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus optional Letta server profile, benchmark-created agent, benchmark-owned fixture corpus, and artifacts under tmp/real-world-memory/letta-core-archive.", + "resource_expectation": f"Letta client {LETTA_CLIENT_REF}, model={LETTA_MODEL}, embedding={LETTA_EMBEDDING}, source fixture count=6, timeout_seconds={TIMEOUT_SECONDS}.", + "retry_guidance": [ + "Default command 
def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None:
    """Write the compact summary artifact to ``SUMMARY_OUT``.

    The summary embeds the full materialization, the scored report, and a
    short digest of the first adapter declared in the manifest.
    """

    summary: dict[str, Any] = {
        "schema": "elf.letta_core_archive_export_readback_summary/v1",
        "generated_at": utc_now(),
        "adapter_id": "letta_core_archive_export_readback",
        "evidence_class": materialization["evidence_class"],
    }
    # Explains which artifact is authoritative for which kind of status.
    summary["status_boundary"] = {
        "materialization": "setup/run/evidence-mapping state emitted by the smoke runner",
        "manifest": "external adapter declaration consumed by the scorer",
        "scored_benchmark": "post-score real_world_job outcome; use this for quality status",
    }
    summary["scored_benchmark"] = materialization["scored_benchmark"]
    summary["materialization"] = materialization

    manifest_json = rel(MANIFEST_OUT)
    adapter = manifest["adapters"][0]
    summary["manifest"] = {
        "json": manifest_json,
        "status_source": "external_adapter_manifest_score_aligned",
        "summary": adapter["overall_status"],
        "suites": adapter["suites"],
    }
    summary["report"] = report

    write_json(SUMMARY_OUT, summary)
def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]:
    """Summarize the post-score status of a real_world_job report.

    Returns a ``pending`` record when ``report`` is ``None``. Otherwise the
    status is the first non-pass outcome with a positive count, checked in the
    order wrong_result, lifecycle_fail, incomplete, blocked, not_encoded;
    ``pass`` is reported when none of them has a positive count.
    """

    envelope = {
        "schema": "elf.scored_benchmark_status/v1",
        "source": "real_world_job_benchmark",
    }
    if report is None:
        return {
            **envelope,
            "status": "pending",
            "reason": "The Letta smoke materialization was written before benchmark scoring completed.",
        }

    totals = report.get("summary", {})
    tracked = ("pass", "wrong_result", "lifecycle_fail", "incomplete", "blocked", "not_encoded")
    counts = {}
    for name in tracked:
        # Missing or null counters are treated as zero.
        counts[name] = int(totals.get(name, 0) or 0)

    outcome = "pass"
    for name in tracked[1:]:
        if counts[name] > 0:
            outcome = name
            break

    return {
        **envelope,
        "status": outcome,
        "counts": counts,
        "job_count": int(totals.get("job_count", 0) or 0),
        "mean_score": totals.get("mean_score"),
        "evidence_coverage": totals.get("evidence_coverage"),
    }
def _captured_text(stream: str | bytes | None) -> str:
    """Return captured process output as text, decoding bytes leniently."""

    if stream is None:
        return ""
    if isinstance(stream, bytes):
        return stream.decode("utf-8", errors="replace")
    return stream


def run_command(
    label: str,
    command: list[str],
    cwd: Path,
    *,
    extra_env: dict[str, str] | None = None,
) -> CommandRecord:
    """Run a command and capture stdout/stderr artifacts.

    Args:
        label: Name used for the record and for the log file names under
            ``LOG_DIR``.
        command: Argument vector passed to ``subprocess.run`` (no shell).
        cwd: Working directory for the child process.
        extra_env: Optional variables layered on top of the current environment.

    Returns:
        A ``CommandRecord`` with status ``pass`` for exit code 0 and
        ``incomplete`` for a non-zero exit code or a timeout after
        ``TIMEOUT_SECONDS``.
    """

    started = time.monotonic()
    env = os.environ.copy()
    if extra_env:
        env.update(extra_env)

    stdout_path = LOG_DIR / f"{label}.stdout.txt"
    stderr_path = LOG_DIR / f"{label}.stderr.txt"

    try:
        result = subprocess.run(
            command,
            cwd=cwd,
            env=env,
            text=True,
            capture_output=True,
            timeout=TIMEOUT_SECONDS,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        elapsed = (time.monotonic() - started) * 1000
        # TimeoutExpired carries partial output as bytes even with text=True,
        # so it must be decoded before being written as text.
        stdout_path.write_text(_captured_text(exc.stdout), encoding="utf-8")
        stderr_path.write_text(_captured_text(exc.stderr), encoding="utf-8")

        return CommandRecord(
            label=label,
            command=command,
            status="incomplete",
            elapsed_ms=elapsed,
            stdout_artifact=rel(stdout_path),
            stderr_artifact=rel(stderr_path),
            returncode=None,
            reason=f"timed out after {TIMEOUT_SECONDS}s",
        )

    elapsed = (time.monotonic() - started) * 1000
    stdout_path.write_text(result.stdout, encoding="utf-8")
    stderr_path.write_text(result.stderr, encoding="utf-8")
    status = "pass" if result.returncode == 0 else "incomplete"
    reason = "command completed" if result.returncode == 0 else f"exit code {result.returncode}"

    return CommandRecord(
        label=label,
        command=command,
        status=status,
        elapsed_ms=elapsed,
        stdout_artifact=rel(stdout_path),
        stderr_artifact=rel(stderr_path),
        returncode=result.returncode,
        reason=reason,
    )
def load_source_fixtures() -> list[dict[str, Any]]:
    """Read every ``*.json`` fixture in ``CORE_FIXTURE_DIR``, in sorted path order.

    Each parsed document gets a ``_source_path`` entry holding the result of
    ``rel()`` for the file it was read from.
    """

    def _load(fixture_file: Path) -> dict[str, Any]:
        document = json.loads(fixture_file.read_text(encoding="utf-8"))
        document["_source_path"] = rel(fixture_file)
        return document

    return [_load(fixture_file) for fixture_file in sorted(CORE_FIXTURE_DIR.glob("*.json"))]
def benchmark_input_contract(fixtures: list[dict[str, Any]]) -> dict[str, Any]:
    """Return the benchmark-owned Letta input contract.

    Corpus items whose kind is in ``CORE_KINDS`` become core blocks. Every
    other item, except ``stale_claim`` and ``unsupported_claim`` kinds, becomes
    an archival passage. Block values and passage texts both start with a
    ``Source ID: <evidence_id>`` line.

    Returns:
        A dict with ``core_blocks``, ``archival_passages``, ``source_id_count``
        (number of distinct evidence ids in the corpus) and
        ``required_evidence_ids``.
    """

    # Flatten the corpus once; it feeds both the records and the id count.
    items = source_items(fixtures)
    core_blocks = []
    archival_passages = []
    for item in items:
        tagged_text = f"Source ID: {item['evidence_id']}\n{item.get('text', '')}"
        record = {
            "source_id": item["evidence_id"],
            "job_id": item["job_id"],
            "kind": item.get("kind"),
            "text": item.get("text", ""),
            "fixture_source": item["fixture_source"],
        }
        if item.get("kind") in CORE_KINDS:
            core_blocks.append(
                {
                    "label": slug(item["evidence_id"])[:48],
                    "value": tagged_text,
                    **record,
                }
            )
        elif item.get("kind") not in {"stale_claim", "unsupported_claim"}:
            # "text" must come after **record: in a dict display the later key
            # wins, and the passage text has to carry the source id.
            archival_passages.append({**record, "text": tagged_text})

    return {
        "core_blocks": core_blocks,
        "archival_passages": archival_passages,
        "source_id_count": len({item["evidence_id"] for item in items}),
        "required_evidence_ids": all_required_evidence_ids(fixtures),
    }
def evidence_mapping(
    fixtures: list[dict[str, Any]],
    live_export: dict[str, Any] | None,
    status: StatusState,
) -> dict[str, Any]:
    """Relate the Letta export/readback payload to required fixture source ids.

    Produces an overall verdict plus one entry per fixture. When
    ``status.result`` is blocked, incomplete, or not_encoded, every job
    inherits that status and ``status.failure_reason``; otherwise a job passes
    only when all of its expected ids were found in ``live_export``.
    """

    unscored_states = {"blocked", "incomplete", "not_encoded"}
    required_ids = all_required_evidence_ids(fixtures)
    mapped_ids: list[str] = [] if live_export is None else ids_in_payload(live_export, required_ids)
    missing_ids = [evidence_id for evidence_id in required_ids if evidence_id not in mapped_ids]

    def job_entry(fixture: dict[str, Any]) -> dict[str, Any]:
        expected = evidence_ids_for_fixture(fixture)
        mapped = [evidence_id for evidence_id in expected if evidence_id in mapped_ids]
        missing = [evidence_id for evidence_id in expected if evidence_id not in mapped]

        if status.result in unscored_states:
            job_status, reason = status.result, status.failure_reason
        elif not missing:
            job_status = "pass"
            reason = "Letta core block export and archival readback/search mapped all required source ids."
        else:
            job_status = "wrong_result"
            reason = f"Letta export/readback missed required evidence ids: {', '.join(missing)}."

        return {
            "job_id": fixture["job_id"],
            "source_fixture": fixture["_source_path"],
            "expected_evidence_ids": expected,
            "mapped_evidence_ids": mapped,
            "missing_evidence_ids": missing,
            "status": job_status,
            "reason": reason,
        }

    jobs = [job_entry(fixture) for fixture in fixtures]

    # Overall verdict: pass only with a live export that covers every id.
    if live_export is None:
        overall_status = status.result
        overall_reason = status.failure_reason
    elif missing_ids:
        overall_status = status.result
        overall_reason = f"Letta export/readback missed required evidence ids: {', '.join(missing_ids)}."
    else:
        overall_status = "pass"
        overall_reason = "Letta export/readback mapped all required fixture source ids."

    return {
        "status": overall_status,
        "reason": overall_reason,
        "expected_evidence_ids": required_ids,
        "mapped_evidence_ids": mapped_ids,
        "missing_evidence_ids": missing_ids,
        "jobs": jobs,
    }
@dataclass
class StatusState:
    """Typed status for generated Letta smoke artifacts.

    The defaults describe a run with the live path disabled. Being a
    dataclass, instances support keyword construction, ``repr`` and equality.
    """

    # Stage statuses copied into the generated artifacts.
    setup: str = "blocked"
    run: str = "not_encoded"
    result: str = "blocked"
    overall: str = "blocked"
    # Evidence class recorded in the generated artifacts.
    evidence_class: str = "research_gate"
    # Failure identifier and explanation; the defaults describe the disabled live path.
    failure_class: str = "letta_live_run_disabled"
    failure_reason: str = (
        "Letta live export/readback is disabled by default; run "
        "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make "
        "smoke-letta-core-archive-export-readback with explicit Docker/provider configuration."
    )
+ elif not RUN_LIVE: + pass + elif not wait_for_letta(command_records): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "letta_server_unreachable" + status.failure_reason = "Docker-local Letta server did not become reachable for export/readback." + elif not init_letta_client(command_records): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "letta_client_setup_failed" + status.failure_reason = "Letta Python client installation or import failed inside the Docker runner." + else: + status.setup = "pass" + live_export = run_letta(fixtures, command_records) + if live_export is None: + status.run = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "letta_export_readback_failed" + status.failure_reason = "Letta did not produce core block export plus archival readback/search output." + else: + status.run = "pass" + status.evidence_class = "live_real_world" + mapping = evidence_mapping(fixtures, live_export, status) + if not mapping["missing_evidence_ids"]: + status.result = "pass" + status.overall = "pass" + status.failure_class = "" + status.failure_reason = "" + else: + status.result = "wrong_result" + status.overall = "wrong_result" + status.failure_class = "letta_source_id_mapping_failed" + status.failure_reason = mapping["reason"] + + mapping = evidence_mapping(fixtures, live_export, status) + fixture_path = write_fixture_outputs(fixtures, status, mapping) + write_materialization( + status, + fixtures, + fixture_path, + command_records, + live_export, + mapping, + started_at, + ) + manifest = write_manifest(status) + report = run_scored_report(fixture_path, MANIFEST_OUT, status) + materialization = write_materialization( + status, + fixtures, + fixture_path, + command_records, + live_export, + mapping, + started_at, + report, + ) + write_summary(materialization, manifest, report) 
def init_letta_client(command_records: list[CommandRecord]) -> bool:
    """Install (when enabled) and import-check the Letta Python client.

    Each executed command is appended to ``command_records``.

    Returns:
        ``True`` when the ``import letta_client`` check passes; ``False`` when
        the optional pip install or the import check does not pass.
    """

    # Local import: this module's import block does not bring `sys` into
    # scope, so `sys.executable` would otherwise raise NameError.
    import sys

    if INSTALL_CLIENT:
        record = run_command(
            "letta-client-install",
            [sys.executable, "-m", "pip", "install", LETTA_CLIENT_PACKAGE],
            WORK_DIR,
        )
        command_records.append(record)
        if record.status != "pass":
            return False

    record = run_command("letta-client-import", [sys.executable, "-c", "import letta_client"], WORK_DIR)
    command_records.append(record)

    return record.status == "pass"
item["label"], + "block": as_dict( + client.agents.blocks.retrieve(agent_id=agent.id, block_label=item["label"]) + ), + } + ) + +listed_passages = as_dict(client.agents.passages.list(agent_id=agent.id)) +search_results = [] +for source_id in data["required_evidence_ids"]: + search_results.append( + { + "query": source_id, + "response": as_dict( + client.agents.passages.search(agent_id=agent.id, query=source_id, top_k=5) + ), + } + ) + +output_path.write_text( + json.dumps( + { + "agent": as_dict(agent), + "core_block_export": core_block_export, + "created_passages": created_passages, + "archival_readback": listed_passages, + "archival_search": search_results, + }, + indent=2, + sort_keys=True, + ) + + "\\n" +) +""".lstrip(), + encoding="utf-8", + ) + + return runner + +def run_letta(fixtures: list[dict[str, Any]], command_records: list[CommandRecord]) -> dict[str, Any] | None: + """Create the Letta benchmark agent and export readback/search data.""" + + runner = write_live_runner(fixtures) + output_path = WORK_DIR / "letta-live-output.json" + env = { + "ELF_LETTA_BASE_URL": LETTA_BASE_URL, + "ELF_LETTA_MODEL": LETTA_MODEL, + "ELF_LETTA_EMBEDDING": LETTA_EMBEDDING, + "ELF_LETTA_LIVE_INPUT": str(WORK_DIR / "letta-live-input.json"), + "ELF_LETTA_LIVE_OUTPUT": str(output_path), + "ELF_LETTA_AGENT_NAME": f"elf-core-archive-smoke-{RUN_ID}", + } + record = run_command("letta-live-export-readback", [sys.executable, str(runner)], WORK_DIR, extra_env=env) + command_records.append(record) + if record.status != "pass" or not output_path.exists(): + return None + + return json.loads(output_path.read_text(encoding="utf-8")) diff --git a/scripts/ragflow-docker-evidence-smoke.sh b/scripts/ragflow-docker-evidence-smoke.sh index 17dd572f..aa9da8f8 100755 --- a/scripts/ragflow-docker-evidence-smoke.sh +++ b/scripts/ragflow-docker-evidence-smoke.sh @@ -104,1083 +104,15 @@ DOCUMENT_STEP_STATUS="not_encoded" CHUNK_STEP_STATUS="not_encoded" RETRIEVAL_STEP_STATUS="not_encoded" 
-required_command() { - local cmd="$1" - if ! command -v "${cmd}" >/dev/null 2>&1; then - echo "Missing ${cmd}; cannot write RAGFlow smoke artifacts." >&2 - exit 1 - fi -} - -optional_command_status() { - local cmd="$1" - if command -v "${cmd}" >/dev/null 2>&1; then - printf 'available' - else - printf 'missing' - fi -} - -relative_path() { - local path="$1" - if [[ "${path}" == "${ROOT_DIR}/"* ]]; then - printf '%s' "${path#"${ROOT_DIR}/"}" - else - printf '%s' "${path}" - fi -} - -json_status() { - local status="$1" - case "${status}" in - real | mocked | unsupported | blocked | incomplete | wrong_result | lifecycle_fail | pass | not_encoded) - printf '%s' "${status}" - ;; - *) - printf 'incomplete' - ;; - esac -} - -capture_docker_info() { - if docker info --format '{{json .}}' >"${DOCKER_INFO}" 2>"${ARTIFACT_DIR}/docker-info.stderr"; then - return 0 - fi - - jq -n --rawfile stderr "${ARTIFACT_DIR}/docker-info.stderr" '{ - error: "docker_info_failed", - stderr: $stderr - }' >"${DOCKER_INFO}" - return 1 -} - -capture_disk_info() { - docker system df >"${DOCKER_DF}" 2>/dev/null || true -} - -capture_vm_max_map_count() { - if VM_MAX_MAP_COUNT="$(sysctl -n vm.max_map_count 2>/dev/null)"; then - if [[ "${VM_MAX_MAP_COUNT}" =~ ^[0-9]+$ ]] && [[ "${VM_MAX_MAP_COUNT}" -ge 262144 ]]; then - VM_MAX_MAP_COUNT_STATUS="pass" - elif [[ "${VM_MAX_MAP_COUNT}" =~ ^[0-9]+$ ]]; then - VM_MAX_MAP_COUNT_STATUS="blocked" - else - VM_MAX_MAP_COUNT_STATUS="not_observed" - fi - else - VM_MAX_MAP_COUNT="" - VM_MAX_MAP_COUNT_STATUS="not_observed" - fi -} - -capture_image_info() { - if [[ "${PULL_IMAGE}" == "1" && "${ACCEPT_RESOURCE_ENVELOPE}" == "1" ]]; then - docker pull "${RAGFLOW_IMAGE}" >"${ARTIFACT_DIR}/docker-pull.log" 2>&1 || true - fi - - if docker image inspect "${RAGFLOW_IMAGE}" >"${IMAGE_INSPECT}" 2>/dev/null; then - IMAGE_PRESENT="true" - IMAGE_SIZE_BYTES="$(jq -r '.[0].Size // ""' "${IMAGE_INSPECT}")" - else - printf '[]\n' >"${IMAGE_INSPECT}" - fi -} - -update_env_var() { - 
local file="$1" - local key="$2" - local value="$3" - - if grep -q "^${key}=" "${file}"; then - sed -i.bak "s|^${key}=.*|${key}=${value}|" "${file}" - else - printf '\n%s=%s\n' "${key}" "${value}" >>"${file}" - fi -} - -prepare_official_ragflow_repo() { - local repo_dir="${WORK_DIR}/ragflow" - - if [[ ! -d "${repo_dir}/.git" ]]; then - rm -rf "${repo_dir}" - git clone --depth 1 --branch "${RAGFLOW_REF}" "${RAGFLOW_REPO_URL}" "${repo_dir}" \ - >"${ARTIFACT_DIR}/ragflow-git-clone.log" 2>&1 - else - git -C "${repo_dir}" fetch --depth 1 origin "${RAGFLOW_REF}" \ - >"${ARTIFACT_DIR}/ragflow-git-fetch.log" 2>&1 - git -C "${repo_dir}" checkout -f FETCH_HEAD \ - >"${ARTIFACT_DIR}/ragflow-git-checkout.log" 2>&1 - fi - - update_env_var "${repo_dir}/docker/.env" "DEVICE" "${CPU_GPU_MODE}" - update_env_var "${repo_dir}/docker/.env" "SVR_WEB_HTTP_PORT" "${ELF_RAGFLOW_WEB_HTTP_PORT:-18080}" - update_env_var "${repo_dir}/docker/.env" "SVR_WEB_HTTPS_PORT" "${ELF_RAGFLOW_WEB_HTTPS_PORT:-18443}" - update_env_var "${repo_dir}/docker/.env" "SVR_HTTP_PORT" "${API_PORT}" - update_env_var "${repo_dir}/docker/.env" "ADMIN_SVR_HTTP_PORT" "${ELF_RAGFLOW_ADMIN_PORT:-19381}" - update_env_var "${repo_dir}/docker/.env" "SVR_MCP_PORT" "${ELF_RAGFLOW_MCP_PORT:-19382}" - update_env_var "${repo_dir}/docker/.env" "GO_HTTP_PORT" "${ELF_RAGFLOW_GO_HTTP_PORT:-19384}" - update_env_var "${repo_dir}/docker/.env" "GO_ADMIN_PORT" "${ELF_RAGFLOW_GO_ADMIN_PORT:-19383}" - update_env_var "${repo_dir}/docker/.env" "EXPOSE_MYSQL_PORT" "${ELF_RAGFLOW_MYSQL_PORT:-13306}" - update_env_var "${repo_dir}/docker/.env" "MINIO_CONSOLE_PORT" "${ELF_RAGFLOW_MINIO_CONSOLE_PORT:-19001}" - update_env_var "${repo_dir}/docker/.env" "MINIO_PORT" "${ELF_RAGFLOW_MINIO_PORT:-19000}" - update_env_var "${repo_dir}/docker/.env" "REDIS_PORT" "${ELF_RAGFLOW_REDIS_PORT:-16379}" - update_env_var "${repo_dir}/docker/.env" "ES_PORT" "${ELF_RAGFLOW_ES_PORT:-11200}" - update_env_var "${repo_dir}/docker/.env" "OS_PORT" 
"${ELF_RAGFLOW_OS_PORT:-11201}" - update_env_var "${repo_dir}/docker/.env" "RAGFLOW_IMAGE" "${RAGFLOW_IMAGE}" - - printf '%s' "${repo_dir}" -} - -run_with_timeout_if_available() { - local seconds="$1" - shift - - if command -v timeout >/dev/null 2>&1; then - timeout "${seconds}" "$@" - else - "$@" - fi -} - -start_ragflow_stack() { - local repo_dir="$1" - local started_at ended_at - started_at="$(date +%s)" - - if ( - cd "${repo_dir}/docker" - run_with_timeout_if_available "${COMPOSE_TIMEOUT_SECONDS}" \ - docker compose -p "${COMPOSE_PROJECT}" -f docker-compose.yml up -d - ) >"${COMPOSE_UP_LOG}" 2>&1; then - STARTED="true" - SETUP_STATUS="pass" - FAILURE_CLASS="" - FAILURE_REASON="" - else - SETUP_STATUS="incomplete" - OVERALL_STATUS="incomplete" - RESULT_STATUS="incomplete" - FAILURE_CLASS="ragflow_compose_start_failed" - FAILURE_REASON="Official RAGFlow Docker Compose did not start successfully; see compose-up.log in the artifact directory." - fi - - ended_at="$(date +%s)" - STARTUP_TIME_MS="$(((ended_at - started_at) * 1000))" -} - -wait_for_ragflow_api() { - local attempt code - - for attempt in $(seq 1 "${STARTUP_ATTEMPTS}"); do - code="$(curl -sS -o /dev/null -w '%{http_code}' "${API_BASE}/api/v1/system/healthz" 2>/dev/null || true)" - jq -nc --argjson attempt "${attempt}" --arg code "${code}" --arg url "${API_BASE}/api/v1/system/healthz" '{ - attempt: $attempt, - url: $url, - http_code: $code - }' >>"${STARTUP_ATTEMPTS_JSONL}" - - if [[ "${code}" == "200" ]]; then - return 0 - fi - - sleep "${STARTUP_INTERVAL_SECONDS}" - done - - return 1 -} - -api_json_request() { - local method="$1" - local path="$2" - local request_file="$3" - local response_file="$4" - local stderr_file="${response_file}.stderr" - local code - - code="$(curl -sS -X "${method}" \ - -o "${response_file}" \ - -w '%{http_code}' \ - -H 'Content-Type: application/json' \ - -H "Authorization: Bearer ${API_KEY}" \ - --data-binary @"${request_file}" \ - "${API_BASE}${path}" 2>"${stderr_file}" || 
true)" - - jq -n --arg code "${code}" --rawfile stderr "${stderr_file}" '{ - http_code: $code, - stderr: $stderr - }' >"${response_file}.meta.json" - - [[ "${code}" =~ ^2 ]] -} - -response_code_ok() { - local response_file="$1" - - jq -e '(.code? == 0) or (.id? != null) or (.data? != null)' "${response_file}" >/dev/null 2>&1 -} - -extract_id() { - local response_file="$1" - jq -r ' - .data.id - // .data[0].id - // .data.document_id - // .data.chunk_id - // .id - // empty - ' "${response_file}" -} - -run_api_smoke() { - local dataset_name="${RUN_ID}" - - jq -n --arg name "${dataset_name}" '{ - name: $name, - description: "Generated public ELF RAGFlow Docker evidence smoke corpus.", - permission: "me", - chunk_method: "manual", - parser_config: {"raptor": {"use_raptor": false}} - }' >"${DATASET_REQUEST}" - - if api_json_request POST "/api/v1/datasets" "${DATASET_REQUEST}" "${DATASET_RESPONSE}" \ - && response_code_ok "${DATASET_RESPONSE}"; then - DATASET_STEP_STATUS="pass" - DATASET_ID="$(extract_id "${DATASET_RESPONSE}")" - else - DATASET_STEP_STATUS="incomplete" - RUN_STATUS="incomplete" - RESULT_STATUS="incomplete" - OVERALL_STATUS="incomplete" - FAILURE_CLASS="ragflow_dataset_create_failed" - FAILURE_REASON="RAGFlow dataset creation did not return a successful response." - return 0 - fi - - if [[ -z "${DATASET_ID}" ]]; then - DATASET_STEP_STATUS="incomplete" - RUN_STATUS="incomplete" - RESULT_STATUS="incomplete" - OVERALL_STATUS="incomplete" - FAILURE_CLASS="ragflow_dataset_id_missing" - FAILURE_REASON="RAGFlow dataset creation succeeded but no dataset id was found in the response." 
- return 0 - fi - - jq -n --arg name "${DOCUMENT_NAME}" '{name: $name}' >"${DOCUMENT_REQUEST}" - - if api_json_request POST "/api/v1/datasets/${DATASET_ID}/documents?type=empty" \ - "${DOCUMENT_REQUEST}" "${DOCUMENT_RESPONSE}" \ - && response_code_ok "${DOCUMENT_RESPONSE}"; then - DOCUMENT_STEP_STATUS="pass" - DOCUMENT_ID="$(extract_id "${DOCUMENT_RESPONSE}")" - else - DOCUMENT_STEP_STATUS="incomplete" - RUN_STATUS="incomplete" - RESULT_STATUS="incomplete" - OVERALL_STATUS="incomplete" - FAILURE_CLASS="ragflow_document_create_failed" - FAILURE_REASON="RAGFlow empty document creation did not return a successful response." - return 0 - fi - - if [[ -z "${DOCUMENT_ID}" ]]; then - DOCUMENT_STEP_STATUS="incomplete" - RUN_STATUS="incomplete" - RESULT_STATUS="incomplete" - OVERALL_STATUS="incomplete" - FAILURE_CLASS="ragflow_document_id_missing" - FAILURE_REASON="RAGFlow empty document creation succeeded but no document id was found in the response." - return 0 - fi - - jq -n \ - --arg content "${CORPUS_TEXT}" \ - --arg token "${EVIDENCE_TOKEN}" \ - '{ - content: $content, - important_keywords: [$token], - questions: ["Which evidence token should map to ragflow-smoke-anchor?"] - }' >"${CHUNK_REQUEST}" - - if api_json_request POST "/api/v1/datasets/${DATASET_ID}/documents/${DOCUMENT_ID}/chunks" \ - "${CHUNK_REQUEST}" "${CHUNK_RESPONSE}" \ - && response_code_ok "${CHUNK_RESPONSE}"; then - CHUNK_STEP_STATUS="pass" - CHUNK_ID="$(extract_id "${CHUNK_RESPONSE}")" - else - CHUNK_STEP_STATUS="incomplete" - RUN_STATUS="incomplete" - RESULT_STATUS="incomplete" - OVERALL_STATUS="incomplete" - FAILURE_CLASS="ragflow_chunk_create_failed" - FAILURE_REASON="RAGFlow chunk creation did not return a successful response." - return 0 - fi - - jq -n \ - --arg question "Which RAGFlow smoke evidence token maps to ragflow-smoke-anchor?" 
\ - --arg dataset_id "${DATASET_ID}" \ - --arg document_id "${DOCUMENT_ID}" \ - '{ - question: $question, - dataset_ids: [$dataset_id], - document_ids: [$document_id], - page: 1, - page_size: 5, - similarity_threshold: 0.0, - vector_similarity_weight: 0.0, - top_k: 5, - keyword: true, - highlight: false - }' >"${RETRIEVAL_REQUEST}" - - if api_json_request POST "/api/v1/retrieval" "${RETRIEVAL_REQUEST}" "${RETRIEVAL_RESPONSE}" \ - && response_code_ok "${RETRIEVAL_RESPONSE}"; then - RETRIEVAL_STEP_STATUS="pass" - else - RETRIEVAL_STEP_STATUS="incomplete" - RUN_STATUS="incomplete" - RESULT_STATUS="incomplete" - OVERALL_STATUS="incomplete" - FAILURE_CLASS="ragflow_retrieval_failed" - FAILURE_REASON="RAGFlow retrieval did not return a successful response." - return 0 - fi - - jq \ - --arg evidence_id "${EVIDENCE_ID}" \ - --arg token "${EVIDENCE_TOKEN}" \ - --arg document_name "${DOCUMENT_NAME}" ' - def chunk_array: - if (.data.chunks? | type) == "array" then .data.chunks - elif (.reference.chunks? 
| type) == "array" then .reference.chunks - else [] end; - chunk_array - | map({ - chunk_id: (.id // .chunk_id // ""), - content: (.content // .content_with_weight // ""), - document_id: (.document_id // .doc_id // ""), - document_name: (.document_name // .document_keyword // .doc_name // .docnm_kwd // ""), - dataset_id: (.dataset_id // .kb_id // ""), - positions: (.positions // []), - similarity: (.similarity // null), - vector_similarity: (.vector_similarity // null), - term_similarity: (.term_similarity // null), - evidence_ids: ( - if (((.content // .content_with_weight // "") | contains($token)) - or ((.document_name // .document_keyword // .doc_name // .docnm_kwd // "") == $document_name)) - then [$evidence_id] - else [] - end - ), - mapping_status: ( - if ((.content // .content_with_weight // "") | contains($token)) then "matched_content" - elif ((.document_name // .document_keyword // .doc_name // .docnm_kwd // "") == $document_name) then "matched_document" - else "unmatched" - end - ) - })' "${RETRIEVAL_RESPONSE}" >"${REFERENCE_MAPPING}" - - RUN_STATUS="pass" - EVIDENCE_CLASS="live_real_world" - - if jq -e --arg evidence_id "${EVIDENCE_ID}" ' - length > 0 and any(.[]; (.evidence_ids // []) | index($evidence_id)) - ' "${REFERENCE_MAPPING}" >/dev/null; then - RESULT_STATUS="pass" - OVERALL_STATUS="pass" - FAILURE_CLASS="" - FAILURE_REASON="" - else - RESULT_STATUS="wrong_result" - OVERALL_STATUS="wrong_result" - FAILURE_CLASS="ragflow_reference_mapping_missing" - FAILURE_REASON="RAGFlow retrieval returned chunks but none mapped to the generated evidence id." - fi -} - -cleanup_stack() { - local repo_dir="${WORK_DIR}/ragflow" - - if [[ "${STARTED}" != "true" || "${CLEANUP}" != "1" || ! 
-d "${repo_dir}/docker" ]]; then - return 0 - fi - - ( - cd "${repo_dir}/docker" - docker compose -p "${COMPOSE_PROJECT}" -f docker-compose.yml down -v - ) >"${COMPOSE_DOWN_LOG}" 2>&1 || true -} - -write_scored_benchmark() { - if [[ -s "${REPORT_JSON}" ]]; then - jq 'def count($key): (.summary[$key] // 0); - def scored_status: - if count("wrong_result") > 0 then "wrong_result" - elif count("lifecycle_fail") > 0 then "lifecycle_fail" - elif count("incomplete") > 0 then "incomplete" - elif count("blocked") > 0 then "blocked" - elif count("not_encoded") > 0 then "not_encoded" - elif count("pass") > 0 then "pass" - else "not_encoded" - end; - { - schema: "elf.scored_benchmark_status/v1", - source: "real_world_job_benchmark", - status: scored_status, - counts: { - pass: count("pass"), - wrong_result: count("wrong_result"), - lifecycle_fail: count("lifecycle_fail"), - incomplete: count("incomplete"), - blocked: count("blocked"), - not_encoded: count("not_encoded") - }, - job_count: (.summary.job_count // 0), - mean_score: (.summary.mean_score // null), - evidence_coverage: (.summary.evidence_coverage // null) - }' "${REPORT_JSON}" >"${SCORED_BENCHMARK}" - else - jq -n '{ - schema: "elf.scored_benchmark_status/v1", - source: "real_world_job_benchmark", - status: "pending", - reason: "The smoke materialization was written before benchmark scoring completed." 
- }' >"${SCORED_BENCHMARK}" - fi -} - -write_artifact() { - local generated_at out_rel manifest_rel fixture_rel report_json_rel report_md_rel docker_status git_status curl_status jq_status - generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" - out_rel="$(relative_path "${OUT}")" - manifest_rel="$(relative_path "${MANIFEST_OUT}")" - fixture_rel="$(relative_path "${FIXTURE_PATH}")" - report_json_rel="$(relative_path "${REPORT_JSON}")" - report_md_rel="$(relative_path "${REPORT_MD}")" - docker_status="$(optional_command_status docker)" - git_status="$(optional_command_status git)" - curl_status="$(optional_command_status curl)" - jq_status="$(optional_command_status jq)" - - jq -n \ - --arg schema "elf.ragflow_docker_evidence_smoke/v1" \ - --arg run_id "${RUN_ID}" \ - --arg generated_at "${generated_at}" \ - --arg adapter_id "ragflow_docker_evidence_smoke" \ - --arg evidence_class "${EVIDENCE_CLASS}" \ - --arg overall_status "$(json_status "${OVERALL_STATUS}")" \ - --arg setup_status "$(json_status "${SETUP_STATUS}")" \ - --arg run_status "$(json_status "${RUN_STATUS}")" \ - --arg result_status "$(json_status "${RESULT_STATUS}")" \ - --arg failure_class "${FAILURE_CLASS}" \ - --arg failure_reason "${FAILURE_REASON}" \ - --arg out_rel "${out_rel}" \ - --arg manifest_rel "${manifest_rel}" \ - --arg fixture_rel "${fixture_rel}" \ - --arg report_json_rel "${report_json_rel}" \ - --arg report_md_rel "${report_md_rel}" \ - --arg artifact_dir "$(relative_path "${ARTIFACT_DIR}")" \ - --arg work_dir "$(relative_path "${WORK_DIR}")" \ - --arg repo_url "${RAGFLOW_REPO_URL}" \ - --arg ragflow_ref "${RAGFLOW_REF}" \ - --arg ragflow_image "${RAGFLOW_IMAGE}" \ - --arg compose_project "${COMPOSE_PROJECT}" \ - --arg cpu_gpu_mode "${CPU_GPU_MODE}" \ - --arg start_enabled "${START_RAGFLOW}" \ - --arg accept_resource_envelope "${ACCEPT_RESOURCE_ENVELOPE}" \ - --arg allow_arm "${ALLOW_ARM}" \ - --arg pull_image "${PULL_IMAGE}" \ - --arg cleanup "${CLEANUP}" \ - --arg api_base "${API_BASE}" \ 
- --arg api_key_provided "$([[ -n "${API_KEY}" ]] && printf true || printf false)" \ - --arg startup_time_ms "${STARTUP_TIME_MS}" \ - --arg started "${STARTED}" \ - --arg startup_attempt_count "${STARTUP_ATTEMPTS}" \ - --arg startup_interval_seconds "${STARTUP_INTERVAL_SECONDS}" \ - --arg compose_timeout_seconds "${COMPOSE_TIMEOUT_SECONDS}" \ - --arg evidence_id "${EVIDENCE_ID}" \ - --arg document_name "${DOCUMENT_NAME}" \ - --arg evidence_token "${EVIDENCE_TOKEN}" \ - --arg corpus_text "${CORPUS_TEXT}" \ - --arg dataset_id "${DATASET_ID}" \ - --arg document_id "${DOCUMENT_ID}" \ - --arg chunk_id "${CHUNK_ID}" \ - --arg vm_max_map_count "${VM_MAX_MAP_COUNT}" \ - --arg vm_max_map_count_status "${VM_MAX_MAP_COUNT_STATUS}" \ - --arg vm_max_map_count_action "${VM_MAX_MAP_COUNT_ACTION}" \ - --arg image_present "${IMAGE_PRESENT}" \ - --arg image_size_bytes "${IMAGE_SIZE_BYTES}" \ - --arg host_global_installs_required "${HOST_GLOBAL_INSTALLS_REQUIRED}" \ - --arg docker_status "${docker_status}" \ - --arg git_status "${git_status}" \ - --arg curl_status "${curl_status}" \ - --arg jq_status "${jq_status}" \ - --arg dataset_step_status "$(json_status "${DATASET_STEP_STATUS}")" \ - --arg document_step_status "$(json_status "${DOCUMENT_STEP_STATUS}")" \ - --arg chunk_step_status "$(json_status "${CHUNK_STEP_STATUS}")" \ - --arg retrieval_step_status "$(json_status "${RETRIEVAL_STEP_STATUS}")" \ - --slurpfile docker_info "${DOCKER_INFO}" \ - --slurpfile image_inspect "${IMAGE_INSPECT}" \ - --slurpfile reference_mapping "${REFERENCE_MAPPING}" \ - --rawfile docker_df "${DOCKER_DF}" \ - --rawfile compose_up_log "${COMPOSE_UP_LOG}" \ - --rawfile compose_down_log "${COMPOSE_DOWN_LOG}" \ - --slurpfile dataset_response "${DATASET_RESPONSE}" \ - --slurpfile document_response "${DOCUMENT_RESPONSE}" \ - --slurpfile chunk_response "${CHUNK_RESPONSE}" \ - --slurpfile retrieval_response "${RETRIEVAL_RESPONSE}" \ - --slurpfile scored_benchmark "${SCORED_BENCHMARK}" \ - --slurpfile 
startup_attempts <(jq -s '.' "${STARTUP_ATTEMPTS_JSONL}") \ - '{ - schema: $schema, - run_id: $run_id, - generated_at: $generated_at, - adapter_id: $adapter_id, - evidence_class: $evidence_class, - overall_status: $overall_status, - status_source: "smoke_materialization", - scored_benchmark: $scored_benchmark[0], - no_quality_claim: true, - failure: ( - if $failure_class == "" then null - else { - class: $failure_class, - reason: $failure_reason - } - end - ), - artifacts: { - smoke: $out_rel, - external_adapter_manifest: $manifest_rel, - generated_fixture: $fixture_rel, - scored_report_json: $report_json_rel, - scored_report_markdown: $report_md_rel, - artifact_dir: $artifact_dir, - work_dir: $work_dir - }, - upstream: { - repository: $repo_url, - ref: $ragflow_ref, - quickstart: "https://ragflow.io/docs/", - http_api_reference: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", - api_key_guide: "https://ragflow.io/docs/acquire_ragflow_api_key" - }, - docker_boundary: { - status: $setup_status, - official_compose_path: "ragflow/docker/docker-compose.yml", - compose_project: $compose_project, - image: $ragflow_image, - device: $cpu_gpu_mode, - start_enabled: ($start_enabled == "1"), - resource_envelope_accepted: ($accept_resource_envelope == "1"), - allow_arm: ($allow_arm == "1"), - pull_image_requested: ($pull_image == "1"), - cleanup_requested: ($cleanup == "1"), - host_global_installs_required: ($host_global_installs_required == "true"), - tooling: { - docker: $docker_status, - git: $git_status, - curl: $curl_status, - jq: $jq_status - } - }, - setup: { - status: $setup_status, - command: "cargo make smoke-ragflow-docker", - live_command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", - started: ($started == "true"), - startup_time_ms: (if $startup_time_ms == "" then null else ($startup_time_ms | tonumber) end), - vm_max_map_count: { - status: 
$vm_max_map_count_status, - observed: (if $vm_max_map_count == "" then null else $vm_max_map_count end), - required_min: 262144, - action: $vm_max_map_count_action - }, - image: { - present: ($image_present == "true"), - size_bytes: (if $image_size_bytes == "" then null else ($image_size_bytes | tonumber) end), - official_compressed_size_note: "RAGFlow quickstart lists the stable image at about 2 GB compressed.", - official_expanded_size_note: "RAGFlow quickstart says the image expands to about 7 GB once unpacked.", - inspect: ($image_inspect[0] // []) - }, - resource_envelope: { - official_min_cpu_cores: 4, - official_min_ram_gb: 16, - official_min_disk_gb: 50, - docker_info: ($docker_info[0] // {}), - docker_system_df: $docker_df - }, - provider_boundaries: { - ragflow_api_base: $api_base, - ragflow_api_key_provided: ($api_key_provided == "true"), - operator_owned_provider_credentials_used: false, - private_corpus_used: false, - generated_public_corpus_only: true, - external_llm_quality_scoring_claimed: false - }, - retry_behavior: { - startup_poll_attempts_configured: ($startup_attempt_count | tonumber), - startup_interval_seconds: ($startup_interval_seconds | tonumber), - compose_timeout_seconds: ($compose_timeout_seconds | tonumber), - startup_attempts: ($startup_attempts[0] // []) - }, - log_excerpt: { - compose_up: ($compose_up_log | split("\n") | .[0:40]), - compose_down: ($compose_down_log | split("\n") | .[0:20]) - } - }, - corpus: { - profile: "generated_public", - evidence_id: $evidence_id, - document_name: $document_name, - evidence_token: $evidence_token, - text: $corpus_text, - dataset_id: (if $dataset_id == "" then null else $dataset_id end), - document_id: (if $document_id == "" then null else $document_id end), - chunk_id: (if $chunk_id == "" then null else $chunk_id end) - }, - run: { - status: $run_status, - steps: { - dataset_creation: { - status: $dataset_step_status, - request_artifact: "dataset-create-request.json", - response_artifact: 
"dataset-create-response.json", - response: ($dataset_response[0] // null) - }, - document_creation: { - status: $document_step_status, - request_artifact: "document-create-request.json", - response_artifact: "document-create-response.json", - response: ($document_response[0] // null) - }, - chunk_ingest: { - status: $chunk_step_status, - request_artifact: "chunk-create-request.json", - response_artifact: "chunk-create-response.json", - response: ($chunk_response[0] // null) - }, - retrieval_query: { - status: $retrieval_step_status, - request_artifact: "retrieval-request.json", - response_artifact: "retrieval-response.json", - response: ($retrieval_response[0] // null) - } - } - }, - result: { - status: $result_status, - evidence: "RAGFlow retrieval reference chunks are mapped to real_world_job evidence ids when content or document metadata matches the generated public corpus.", - reference_chunk_count: (($reference_mapping[0] // []) | length), - mapped_reference_chunk_count: (($reference_mapping[0] // []) | map(select((.evidence_ids // []) | length > 0)) | length) - }, - evidence_mapping: { - expected_evidence_ids: [$evidence_id], - reference_chunks: ($reference_mapping[0] // []), - field_mapping: { - "id": "chunk_id", - "document_id": "document_id", - "document_name_or_document_keyword": "document_name", - "dataset_id_or_kb_id": "dataset_id", - "content_or_content_with_weight": "content", - "positions": "positions", - "similarity": "similarity", - "vector_similarity": "vector_similarity", - "term_similarity": "term_similarity" - } - } - }' >"${OUT}" -} - -write_manifest() { - local generated_at out_rel manifest_rel retrieval_suite_status production_ops_status capability_retrieval_status capability_setup_status - generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" - out_rel="$(relative_path "${OUT}")" - manifest_rel="$(relative_path "${MANIFEST_OUT}")" - retrieval_suite_status="$(json_status "${RESULT_STATUS}")" - capability_retrieval_status="$(json_status 
"${RESULT_STATUS}")" - capability_setup_status="$(json_status "${SETUP_STATUS}")" - production_ops_status="not_encoded" - - jq -n \ - --arg generated_at "${generated_at}" \ - --arg manifest_id "ragflow-docker-evidence-smoke-${RUN_ID}" \ - --arg out_rel "${out_rel}" \ - --arg manifest_rel "${manifest_rel}" \ - --arg evidence_class "${EVIDENCE_CLASS}" \ - --arg overall_status "$(json_status "${OVERALL_STATUS}")" \ - --arg setup_status "$(json_status "${SETUP_STATUS}")" \ - --arg run_status "$(json_status "${RUN_STATUS}")" \ - --arg result_status "$(json_status "${RESULT_STATUS}")" \ - --arg retrieval_suite_status "${retrieval_suite_status}" \ - --arg production_ops_status "${production_ops_status}" \ - --arg capability_setup_status "${capability_setup_status}" \ - --arg capability_retrieval_status "${capability_retrieval_status}" \ - --arg ragflow_image "${RAGFLOW_IMAGE}" \ - --arg cpu_gpu_mode "${CPU_GPU_MODE}" \ - --arg failure_reason "${FAILURE_REASON}" \ - --arg host_global_installs_required "${HOST_GLOBAL_INSTALLS_REQUIRED}" \ - '{ - schema: "elf.real_world_external_adapter_manifest/v1", - manifest_id: $manifest_id, - docker_isolation: { - default: true, - compose_file: "official RAGFlow docker/docker-compose.yml", - runner: "scripts/ragflow-docker-evidence-smoke.sh", - artifact_dir: "tmp/real-world-memory/ragflow-smoke", - host_global_installs_required: ($host_global_installs_required == "true"), - notes: [ - "Generated by the RAGFlow evidence-smoke script at " + $generated_at + ".", - "The smoke uses a generated public corpus and does not use private corpus or operator-owned provider credentials." 
- ] - }, - adapters: [ - { - adapter_id: "ragflow_docker_evidence_smoke", - project: "RAGFlow", - adapter_kind: "docker_service_evidence_smoke", - evidence_class: $evidence_class, - docker_default: true, - host_global_installs_required: ($host_global_installs_required == "true"), - overall_status: $overall_status, - setup: { - status: $setup_status, - evidence: "Official RAGFlow Docker Compose boundary and resource envelope were evaluated for the tiny evidence smoke.", - command: "cargo make smoke-ragflow-docker", - artifact: $out_rel - }, - run: { - status: $run_status, - evidence: "The smoke attempts dataset creation, empty-document corpus ingest, chunk insert, retrieval query, and reference chunk extraction.", - command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", - artifact: $out_rel - }, - result: { - status: $result_status, - evidence: ( - if $failure_reason == "" then "Returned RAGFlow reference chunks were mapped to generated real_world_job evidence ids for the smoke only." - else $failure_reason - end - ), - artifact: $out_rel - }, - capabilities: [ - { - capability: "official_docker_service_boundary", - status: $capability_setup_status, - evidence: "The script uses the official RAGFlow Docker Compose setup and records image, disk, startup, CPU/GPU, and vm.max_map_count evidence." - }, - { - capability: "dataset_or_chunk_ingest", - status: $run_status, - evidence: "The live path creates a generated public dataset, empty document, and chunk before querying." - }, - { - capability: "retrieval_reference_mapping", - status: $capability_retrieval_status, - evidence: "The script maps returned chunk id, document id, document name, dataset id, positions, and similarity fields to benchmark evidence ids." - }, - { - capability: "quality_or_scale_claim", - status: "not_encoded", - evidence: "The smoke does not run broad RAGFlow quality scoring, scale tests, private corpora, or comparative ranking claims." 
- } - ], - suites: [ - { - suite_id: "retrieval", - status: $retrieval_suite_status, - evidence: "Only the generated-public RAGFlow evidence-smoke retrieval path is represented." - }, - { - suite_id: "production_ops", - status: $production_ops_status, - evidence: "Resource envelope evidence is recorded, but no production-ops suite scoring is encoded." - }, - { - suite_id: "knowledge_compilation", - status: "not_encoded", - evidence: "RAGFlow page or knowledge-compilation behavior is not part of this smoke." - } - ], - evidence: [ - { - kind: "artifact", - ref: $out_rel, - status: $result_status - }, - { - kind: "manifest", - ref: $manifest_rel, - status: $overall_status - }, - { - kind: "source", - ref: "https://ragflow.io/docs/", - status: "real" - }, - { - kind: "source", - ref: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", - status: "real" - } - ], - execution_metadata: { - sources: [ - { - label: "RAGFlow quickstart", - url: "https://ragflow.io/docs/", - evidence: "Official Docker startup, resource envelope, vm.max_map_count, and provider configuration guidance." - }, - { - label: "RAGFlow HTTP API reference", - url: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", - evidence: "Official dataset, document, chunk, retrieval, and reference-chunk field contract." - } - ], - setup_path: "Run the official RAGFlow Docker Compose stack with generated public corpus only.", - runtime_boundary: "Official RAGFlow Docker Compose service boundary; no host-global RAGFlow install.", - resource_expectation: ( - "RAGFlow image " + $ragflow_image + ", CPU/GPU mode " + $cpu_gpu_mode + ", official minimums 4 CPU cores, 16 GB RAM, 50 GB disk, and vm.max_map_count >= 262144." 
- ), - retry_guidance: [ - "Default command records a typed blocked preflight unless resource-heavy startup is explicitly enabled.", - "Set ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 for a live Docker startup attempt.", - "Provide only a local self-hosted RAGFlow API key; do not use private corpora or operator-owned model provider credentials for this smoke." - ], - research_depth: "D2 feasibility plus XY-885 evidence-smoke implementation; generated artifact decides live evidence class." - }, - notes: [ - "This adapter record is generated by a smoke artifact and must not be generalized into broad RAGFlow quality evidence.", - "Failure before query output remains typed as blocked, incomplete, or not_encoded." - ] - } - ] - }' >"${MANIFEST_OUT}" -} - -write_fixture() { - local result_status reason - result_status="$(json_status "${RESULT_STATUS}")" - reason="${FAILURE_REASON}" - - jq -n \ - --arg run_id "${RUN_ID}" \ - --arg evidence_id "${EVIDENCE_ID}" \ - --arg evidence_token "${EVIDENCE_TOKEN}" \ - --arg corpus_text "${CORPUS_TEXT}" \ - --arg result_status "${result_status}" \ - --arg failure_reason "${reason}" \ - '{ - schema: "elf.real_world_job/v1", - job_id: "ragflow-evidence-smoke-001", - suite: "retrieval", - title: "Map RAGFlow reference chunks to generated evidence", - corpus: { - corpus_id: "ragflow-generated-public-smoke", - profile: "generated_public", - items: [ - { - evidence_id: $evidence_id, - kind: "document", - text: $corpus_text, - source_ref: { - schema: "source_ref/v1", - resolver: "ragflow_smoke/v1", - ref: { - run_id: $run_id, - evidence_token: $evidence_token - } - }, - created_at: "2026-06-10T00:00:00Z" - } - ], - adapter_response: { - adapter_id: "ragflow_docker_evidence_smoke", - answer: { - content: ( - if $result_status == "pass" then - "RAGFlow returned reference chunks that map to the generated ragflow-smoke-anchor evidence id." 
- else - "" - end - ), - claims: ( - if $result_status == "pass" then - [ - { - claim_id: "ragflow_reference_mapping", - text: "RAGFlow reference chunks map to the generated ragflow-smoke-anchor evidence id.", - evidence_ids: [$evidence_id], - confidence: "derived_from_ragflow_reference_chunk_mapping" - } - ] - else - [] - end - ), - evidence_ids: (if $result_status == "pass" then [$evidence_id] else [] end), - latency_ms: 0.0, - cost: { - currency: "USD", - amount: 0.0, - input_tokens: 0, - output_tokens: 0 - } - } - } - }, - timeline: [ - { - event_id: "ragflow-smoke-corpus-generated", - ts: "2026-06-10T00:00:00Z", - actor: "system", - action: "generated_public_corpus", - evidence_ids: [$evidence_id], - summary: "The RAGFlow smoke generated a tiny public corpus for reference chunk mapping." - } - ], - prompt: { - role: "user", - content: "Which RAGFlow smoke evidence token maps to the generated reference chunk?", - job_mode: "answer", - constraints: ["cite_evidence", "avoid_broad_quality_claims"] - }, - expected_answer: { - must_include: [ - { - claim_id: "ragflow_reference_mapping", - text: "RAGFlow reference chunks map to the generated ragflow-smoke-anchor evidence id." - } - ], - must_not_include: ["RAGFlow passed a broad graph/RAG quality benchmark."], - evidence_links: { - ragflow_reference_mapping: [$evidence_id] - }, - answer_type: "direct_answer", - accepted_alternates: [], - requires_caveat: true, - requires_refusal: false - }, - required_evidence: [ - { - evidence_id: $evidence_id, - claim_id: "ragflow_reference_mapping", - requirement: "cite", - quote: "ragflow-smoke-anchor evidence id" - } - ], - negative_traps: [], - scoring_rubric: { - dimensions: { - answer_correctness: { - weight: 0.3, - max_points: 1.0, - criteria: "States the generated evidence mapping without broad quality claims." - }, - evidence_grounding: { - weight: 0.45, - max_points: 1.0, - criteria: "Maps returned RAGFlow reference chunks to the generated evidence id." 
- }, - trap_avoidance: { - weight: 0.15, - max_points: 1.0, - criteria: "Does not claim broad RAGFlow quality from the tiny smoke." - }, - latency_resource: { - weight: 0.1, - max_points: 1.0, - criteria: "Records setup, resource, provider, and reference-mapping boundaries." - } - }, - pass_threshold: 0.75, - hard_fail_rules: [] - }, - allowed_uncertainty: { - can_answer_unknown: false, - acceptable_phrases: ["tiny generated corpus", "reference chunk smoke only"], - fallback_action: "state_blocker" - }, - operator_debug: null, - encoding: {}, - memory_evolution: null, - tags: ["external_adapter", "generated_public", "ragflow", "no_live_claim"] - } - | if ["blocked", "incomplete", "not_encoded"] | index($result_status) then - .encoding = {status: $result_status, reason: $failure_reason} - else - . - end' >"${FIXTURE_PATH}" -} - -write_scored_report() { - ( - cd "${ROOT_DIR}" - cargo run -p elf-eval --bin real_world_job_benchmark -- run \ - --fixtures "${FIXTURE_PATH}" \ - --out "${REPORT_JSON}" \ - --run-id real-world-memory-live-ragflow \ - --adapter-id ragflow_docker_evidence_smoke \ - --adapter-name "RAGFlow Docker evidence smoke adapter" \ - --adapter-behavior docker_service_evidence_smoke \ - --adapter-storage-status "$(json_status "${SETUP_STATUS}")" \ - --adapter-runtime-status "$(json_status "${OVERALL_STATUS}")" \ - --adapter-notes "Generated by the RAGFlow Docker evidence smoke; pass or wrong_result requires reference chunks mapped to generated evidence ids, while resource/setup/API-key limits remain typed." 
\ - --external-adapter-manifest "${MANIFEST_OUT}" - cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ - --report "${REPORT_JSON}" \ - --out "${REPORT_MD}" - ) -} - -write_summary() { - jq -n \ - --slurpfile materialization "${OUT}" \ - --slurpfile manifest "${MANIFEST_OUT}" \ - --slurpfile report "${REPORT_JSON}" \ - '{ - schema: "elf.ragflow_docker_smoke_summary/v1", - generated_at: (now | todateiso8601), - adapter_id: "ragflow_docker_evidence_smoke", - evidence_class: $materialization[0].evidence_class, - status_boundary: { - materialization: "setup/run/evidence-mapping state emitted by the smoke runner", - manifest: "external adapter declaration consumed by the scorer", - scored_benchmark: "post-score real_world_job outcome; use this for quality status" - }, - scored_benchmark: $materialization[0].scored_benchmark, - materialization: $materialization[0], - manifest: { - json: ($materialization[0].artifacts.external_adapter_manifest // "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json"), - status_source: "external_adapter_manifest_pre_score", - summary: $manifest[0].adapters[0].overall_status, - suites: $manifest[0].adapters[0].suites - }, - report: { - json: ($materialization[0].artifacts.scored_report_json // "tmp/real-world-memory/ragflow-smoke/ragflow-report.json"), - markdown: ($materialization[0].artifacts.scored_report_markdown // "tmp/real-world-memory/ragflow-smoke/ragflow-report.md"), - summary: $report[0].summary, - suites: $report[0].suites - } - }' >"${SUMMARY_OUT}" -} -write_outputs() { - write_scored_benchmark - write_artifact - write_manifest - write_fixture - write_scored_report - write_scored_benchmark - write_artifact - write_summary - echo "RAGFlow smoke artifact: ${OUT}" - echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" - echo "RAGFlow smoke report: ${REPORT_JSON}" - echo "RAGFlow smoke summary: ${SUMMARY_OUT}" -} +source "${ROOT_DIR}/scripts/ragflow_smoke/common.sh" +source 
"${ROOT_DIR}/scripts/ragflow_smoke/docker.sh" +source "${ROOT_DIR}/scripts/ragflow_smoke/api.sh" +source "${ROOT_DIR}/scripts/ragflow_smoke/scoring.sh" +source "${ROOT_DIR}/scripts/ragflow_smoke/materialization.sh" +source "${ROOT_DIR}/scripts/ragflow_smoke/manifest.sh" +source "${ROOT_DIR}/scripts/ragflow_smoke/fixture.sh" +source "${ROOT_DIR}/scripts/ragflow_smoke/summary.sh" for cmd in jq curl; do required_command "${cmd}" diff --git a/scripts/ragflow_smoke/api.sh b/scripts/ragflow_smoke/api.sh new file mode 100644 index 00000000..fb15cad3 --- /dev/null +++ b/scripts/ragflow_smoke/api.sh @@ -0,0 +1,183 @@ +# RAGFlow Docker evidence smoke helper functions. +# Sourced by scripts/ragflow-docker-evidence-smoke.sh. + +run_api_smoke() { + local dataset_name="${RUN_ID}" + + jq -n --arg name "${dataset_name}" '{ + name: $name, + description: "Generated public ELF RAGFlow Docker evidence smoke corpus.", + permission: "me", + chunk_method: "manual", + parser_config: {"raptor": {"use_raptor": false}} + }' >"${DATASET_REQUEST}" + + if api_json_request POST "/api/v1/datasets" "${DATASET_REQUEST}" "${DATASET_RESPONSE}" \ + && response_code_ok "${DATASET_RESPONSE}"; then + DATASET_STEP_STATUS="pass" + DATASET_ID="$(extract_id "${DATASET_RESPONSE}")" + else + DATASET_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_dataset_create_failed" + FAILURE_REASON="RAGFlow dataset creation did not return a successful response." + return 0 + fi + + if [[ -z "${DATASET_ID}" ]]; then + DATASET_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_dataset_id_missing" + FAILURE_REASON="RAGFlow dataset creation succeeded but no dataset id was found in the response." 
+ return 0 + fi + + jq -n --arg name "${DOCUMENT_NAME}" '{name: $name}' >"${DOCUMENT_REQUEST}" + + if api_json_request POST "/api/v1/datasets/${DATASET_ID}/documents?type=empty" \ + "${DOCUMENT_REQUEST}" "${DOCUMENT_RESPONSE}" \ + && response_code_ok "${DOCUMENT_RESPONSE}"; then + DOCUMENT_STEP_STATUS="pass" + DOCUMENT_ID="$(extract_id "${DOCUMENT_RESPONSE}")" + else + DOCUMENT_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_document_create_failed" + FAILURE_REASON="RAGFlow empty document creation did not return a successful response." + return 0 + fi + + if [[ -z "${DOCUMENT_ID}" ]]; then + DOCUMENT_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_document_id_missing" + FAILURE_REASON="RAGFlow empty document creation succeeded but no document id was found in the response." + return 0 + fi + + jq -n \ + --arg content "${CORPUS_TEXT}" \ + --arg token "${EVIDENCE_TOKEN}" \ + '{ + content: $content, + important_keywords: [$token], + questions: ["Which evidence token should map to ragflow-smoke-anchor?"] + }' >"${CHUNK_REQUEST}" + + if api_json_request POST "/api/v1/datasets/${DATASET_ID}/documents/${DOCUMENT_ID}/chunks" \ + "${CHUNK_REQUEST}" "${CHUNK_RESPONSE}" \ + && response_code_ok "${CHUNK_RESPONSE}"; then + CHUNK_STEP_STATUS="pass" + CHUNK_ID="$(extract_id "${CHUNK_RESPONSE}")" + else + CHUNK_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_chunk_create_failed" + FAILURE_REASON="RAGFlow chunk creation did not return a successful response." + return 0 + fi + + jq -n \ + --arg question "Which RAGFlow smoke evidence token maps to ragflow-smoke-anchor?" 
\ + --arg dataset_id "${DATASET_ID}" \ + --arg document_id "${DOCUMENT_ID}" \ + '{ + question: $question, + dataset_ids: [$dataset_id], + document_ids: [$document_id], + page: 1, + page_size: 5, + similarity_threshold: 0.0, + vector_similarity_weight: 0.0, + top_k: 5, + keyword: true, + highlight: false + }' >"${RETRIEVAL_REQUEST}" + + if api_json_request POST "/api/v1/retrieval" "${RETRIEVAL_REQUEST}" "${RETRIEVAL_RESPONSE}" \ + && response_code_ok "${RETRIEVAL_RESPONSE}"; then + RETRIEVAL_STEP_STATUS="pass" + else + RETRIEVAL_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_retrieval_failed" + FAILURE_REASON="RAGFlow retrieval did not return a successful response." + return 0 + fi + + jq \ + --arg evidence_id "${EVIDENCE_ID}" \ + --arg token "${EVIDENCE_TOKEN}" \ + --arg document_name "${DOCUMENT_NAME}" ' + def chunk_array: + if (.data.chunks? | type) == "array" then .data.chunks + elif (.reference.chunks? 
| type) == "array" then .reference.chunks + else [] end; + chunk_array + | map({ + chunk_id: (.id // .chunk_id // ""), + content: (.content // .content_with_weight // ""), + document_id: (.document_id // .doc_id // ""), + document_name: (.document_name // .document_keyword // .doc_name // .docnm_kwd // ""), + dataset_id: (.dataset_id // .kb_id // ""), + positions: (.positions // []), + similarity: (.similarity // null), + vector_similarity: (.vector_similarity // null), + term_similarity: (.term_similarity // null), + evidence_ids: ( + if (((.content // .content_with_weight // "") | contains($token)) + or ((.document_name // .document_keyword // .doc_name // .docnm_kwd // "") == $document_name)) + then [$evidence_id] + else [] + end + ), + mapping_status: ( + if ((.content // .content_with_weight // "") | contains($token)) then "matched_content" + elif ((.document_name // .document_keyword // .doc_name // .docnm_kwd // "") == $document_name) then "matched_document" + else "unmatched" + end + ) + })' "${RETRIEVAL_RESPONSE}" >"${REFERENCE_MAPPING}" + + RUN_STATUS="pass" + EVIDENCE_CLASS="live_real_world" + + if jq -e --arg evidence_id "${EVIDENCE_ID}" ' + length > 0 and any(.[]; (.evidence_ids // []) | index($evidence_id)) + ' "${REFERENCE_MAPPING}" >/dev/null; then + RESULT_STATUS="pass" + OVERALL_STATUS="pass" + FAILURE_CLASS="" + FAILURE_REASON="" + else + RESULT_STATUS="wrong_result" + OVERALL_STATUS="wrong_result" + FAILURE_CLASS="ragflow_reference_mapping_missing" + FAILURE_REASON="RAGFlow retrieval returned chunks but none mapped to the generated evidence id." + fi +} + +cleanup_stack() { + local repo_dir="${WORK_DIR}/ragflow" + + if [[ "${STARTED}" != "true" || "${CLEANUP}" != "1" || ! 
-d "${repo_dir}/docker" ]]; then + return 0 + fi + + ( + cd "${repo_dir}/docker" + docker compose -p "${COMPOSE_PROJECT}" -f docker-compose.yml down -v + ) >"${COMPOSE_DOWN_LOG}" 2>&1 || true +} diff --git a/scripts/ragflow_smoke/common.sh b/scripts/ragflow_smoke/common.sh new file mode 100644 index 00000000..405af356 --- /dev/null +++ b/scripts/ragflow_smoke/common.sh @@ -0,0 +1,96 @@ +# RAGFlow Docker evidence smoke helper functions. +# Sourced by scripts/ragflow-docker-evidence-smoke.sh. + +required_command() { + local cmd="$1" + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd}; cannot write RAGFlow smoke artifacts." >&2 + exit 1 + fi +} + +optional_command_status() { + local cmd="$1" + if command -v "${cmd}" >/dev/null 2>&1; then + printf 'available' + else + printf 'missing' + fi +} + +relative_path() { + local path="$1" + if [[ "${path}" == "${ROOT_DIR}/"* ]]; then + printf '%s' "${path#"${ROOT_DIR}/"}" + else + printf '%s' "${path}" + fi +} + +json_status() { + local status="$1" + case "${status}" in + real | mocked | unsupported | blocked | incomplete | wrong_result | lifecycle_fail | pass | not_encoded) + printf '%s' "${status}" + ;; + *) + printf 'incomplete' + ;; + esac +} + +capture_docker_info() { + if docker info --format '{{json .}}' >"${DOCKER_INFO}" 2>"${ARTIFACT_DIR}/docker-info.stderr"; then + return 0 + fi + + jq -n --rawfile stderr "${ARTIFACT_DIR}/docker-info.stderr" '{ + error: "docker_info_failed", + stderr: $stderr + }' >"${DOCKER_INFO}" + return 1 +} + +capture_disk_info() { + docker system df >"${DOCKER_DF}" 2>/dev/null || true +} + +capture_vm_max_map_count() { + if VM_MAX_MAP_COUNT="$(sysctl -n vm.max_map_count 2>/dev/null)"; then + if [[ "${VM_MAX_MAP_COUNT}" =~ ^[0-9]+$ ]] && [[ "${VM_MAX_MAP_COUNT}" -ge 262144 ]]; then + VM_MAX_MAP_COUNT_STATUS="pass" + elif [[ "${VM_MAX_MAP_COUNT}" =~ ^[0-9]+$ ]]; then + VM_MAX_MAP_COUNT_STATUS="blocked" + else + VM_MAX_MAP_COUNT_STATUS="not_observed" + fi + else + 
VM_MAX_MAP_COUNT="" + VM_MAX_MAP_COUNT_STATUS="not_observed" + fi +} + +capture_image_info() { + if [[ "${PULL_IMAGE}" == "1" && "${ACCEPT_RESOURCE_ENVELOPE}" == "1" ]]; then + docker pull "${RAGFLOW_IMAGE}" >"${ARTIFACT_DIR}/docker-pull.log" 2>&1 || true + fi + + if docker image inspect "${RAGFLOW_IMAGE}" >"${IMAGE_INSPECT}" 2>/dev/null; then + IMAGE_PRESENT="true" + IMAGE_SIZE_BYTES="$(jq -r '.[0].Size // ""' "${IMAGE_INSPECT}")" + else + printf '[]\n' >"${IMAGE_INSPECT}" + fi +} + +update_env_var() { + local file="$1" + local key="$2" + local value="$3" + + if grep -q "^${key}=" "${file}"; then + sed -i.bak "s|^${key}=.*|${key}=${value}|" "${file}" + else + printf '\n%s=%s\n' "${key}" "${value}" >>"${file}" + fi +} diff --git a/scripts/ragflow_smoke/docker.sh b/scripts/ragflow_smoke/docker.sh new file mode 100644 index 00000000..6fd98136 --- /dev/null +++ b/scripts/ragflow_smoke/docker.sh @@ -0,0 +1,135 @@ +# RAGFlow Docker evidence smoke helper functions. +# Sourced by scripts/ragflow-docker-evidence-smoke.sh. + +prepare_official_ragflow_repo() { + local repo_dir="${WORK_DIR}/ragflow" + + if [[ ! 
-d "${repo_dir}/.git" ]]; then + rm -rf "${repo_dir}" + git clone --depth 1 --branch "${RAGFLOW_REF}" "${RAGFLOW_REPO_URL}" "${repo_dir}" \ + >"${ARTIFACT_DIR}/ragflow-git-clone.log" 2>&1 + else + git -C "${repo_dir}" fetch --depth 1 origin "${RAGFLOW_REF}" \ + >"${ARTIFACT_DIR}/ragflow-git-fetch.log" 2>&1 + git -C "${repo_dir}" checkout -f FETCH_HEAD \ + >"${ARTIFACT_DIR}/ragflow-git-checkout.log" 2>&1 + fi + + update_env_var "${repo_dir}/docker/.env" "DEVICE" "${CPU_GPU_MODE}" + update_env_var "${repo_dir}/docker/.env" "SVR_WEB_HTTP_PORT" "${ELF_RAGFLOW_WEB_HTTP_PORT:-18080}" + update_env_var "${repo_dir}/docker/.env" "SVR_WEB_HTTPS_PORT" "${ELF_RAGFLOW_WEB_HTTPS_PORT:-18443}" + update_env_var "${repo_dir}/docker/.env" "SVR_HTTP_PORT" "${API_PORT}" + update_env_var "${repo_dir}/docker/.env" "ADMIN_SVR_HTTP_PORT" "${ELF_RAGFLOW_ADMIN_PORT:-19381}" + update_env_var "${repo_dir}/docker/.env" "SVR_MCP_PORT" "${ELF_RAGFLOW_MCP_PORT:-19382}" + update_env_var "${repo_dir}/docker/.env" "GO_HTTP_PORT" "${ELF_RAGFLOW_GO_HTTP_PORT:-19384}" + update_env_var "${repo_dir}/docker/.env" "GO_ADMIN_PORT" "${ELF_RAGFLOW_GO_ADMIN_PORT:-19383}" + update_env_var "${repo_dir}/docker/.env" "EXPOSE_MYSQL_PORT" "${ELF_RAGFLOW_MYSQL_PORT:-13306}" + update_env_var "${repo_dir}/docker/.env" "MINIO_CONSOLE_PORT" "${ELF_RAGFLOW_MINIO_CONSOLE_PORT:-19001}" + update_env_var "${repo_dir}/docker/.env" "MINIO_PORT" "${ELF_RAGFLOW_MINIO_PORT:-19000}" + update_env_var "${repo_dir}/docker/.env" "REDIS_PORT" "${ELF_RAGFLOW_REDIS_PORT:-16379}" + update_env_var "${repo_dir}/docker/.env" "ES_PORT" "${ELF_RAGFLOW_ES_PORT:-11200}" + update_env_var "${repo_dir}/docker/.env" "OS_PORT" "${ELF_RAGFLOW_OS_PORT:-11201}" + update_env_var "${repo_dir}/docker/.env" "RAGFLOW_IMAGE" "${RAGFLOW_IMAGE}" + + printf '%s' "${repo_dir}" +} + +run_with_timeout_if_available() { + local seconds="$1" + shift + + if command -v timeout >/dev/null 2>&1; then + timeout "${seconds}" "$@" + else + "$@" + fi +} + 
+start_ragflow_stack() { # Run "docker compose up -d" in <repo>/docker; record setup status and STARTUP_TIME_MS.
+  local repo_dir="$1"
+  local started_at ended_at
+  started_at="$(date +%s)"
+
+  if (
+    cd "${repo_dir}/docker"
+    run_with_timeout_if_available "${COMPOSE_TIMEOUT_SECONDS}" \
+      docker compose -p "${COMPOSE_PROJECT}" -f docker-compose.yml up -d
+  ) >"${COMPOSE_UP_LOG}" 2>&1; then
+    STARTED="true"
+    SETUP_STATUS="pass"
+    FAILURE_CLASS=""
+    FAILURE_REASON=""
+  else
+    SETUP_STATUS="incomplete"
+    OVERALL_STATUS="incomplete"
+    RESULT_STATUS="incomplete"
+    FAILURE_CLASS="ragflow_compose_start_failed"
+    FAILURE_REASON="Official RAGFlow Docker Compose did not start successfully; see compose-up.log in the artifact directory."
+  fi
+
+  ended_at="$(date +%s)"
+  STARTUP_TIME_MS="$(((ended_at - started_at) * 1000))" # whole-second resolution, expressed in ms
+}
+
+wait_for_ragflow_api() { # Poll the healthz endpoint up to STARTUP_ATTEMPTS times; 0 on HTTP 200, else 1.
+  local attempt code
+
+  for attempt in $(seq 1 "${STARTUP_ATTEMPTS}"); do
+    code="$(curl -sS --max-time 10 -o /dev/null -w '%{http_code}' "${API_BASE}/api/v1/system/healthz" 2>/dev/null || true)" # bounded so a hung socket cannot stall the loop
+    jq -nc --argjson attempt "${attempt}" --arg code "${code}" --arg url "${API_BASE}/api/v1/system/healthz" '{
+      attempt: $attempt,
+      url: $url,
+      http_code: $code
+    }' >>"${STARTUP_ATTEMPTS_JSONL}"
+
+    if [[ "${code}" == "200" ]]; then
+      return 0
+    fi
+
+    sleep "${STARTUP_INTERVAL_SECONDS}"
+  done
+
+  return 1
+}
+
+api_json_request() { # Args: method, path, request file, response file; succeeds on a 2xx HTTP status.
+  local method="$1"
+  local path="$2"
+  local request_file="$3"
+  local response_file="$4"
+  local stderr_file="${response_file}.stderr"
+  local code # HTTP status printed by curl; a timeout or transport failure leaves a non-2xx value
+
+  code="$(curl -sS --max-time "${ELF_RAGFLOW_SMOKE_API_TIMEOUT_SECONDS:-120}" -X "${method}" \
+    -o "${response_file}" \
+    -w '%{http_code}' \
+    -H 'Content-Type: application/json' \
+    -H "Authorization: Bearer ${API_KEY}" \
+    --data-binary @"${request_file}" \
+    "${API_BASE}${path}" 2>"${stderr_file}" || true)"
+
+  jq -n --arg code "${code}" --rawfile stderr "${stderr_file}" '{
+    http_code: $code,
+    stderr: $stderr
+  }' >"${response_file}.meta.json"
+
+  [[ "${code}" =~ ^2 ]]
+}
+
+response_code_ok() { # True when the body has code == 0, or a non-null id or data field.
+  local response_file="$1"
+
+  jq -e '(.code? == 0) or (.id? != null) or (.data? != null)' "${response_file}" >/dev/null 2>&1
+}
+
+extract_id() { # Print the first id found among the known response shapes, or nothing.
+  local response_file="$1"
+  jq -r '
+    .data.id
+    // .data[0].id
+    // .data.document_id
+    // .data.chunk_id
+    // .id
+    // empty
+  ' "${response_file}"
+}
diff --git a/scripts/ragflow_smoke/fixture.sh b/scripts/ragflow_smoke/fixture.sh
new file mode 100644
index 00000000..1b98e6c8
--- /dev/null
+++ b/scripts/ragflow_smoke/fixture.sh
@@ -0,0 +1,161 @@
+# RAGFlow Docker evidence smoke helpers: generated real_world_job fixture.
+# Sourced by scripts/ragflow-docker-evidence-smoke.sh.
+
+# Write the elf.real_world_job/v1 job fixture to FIXTURE_PATH.
+# The adapter answer, claims, and evidence ids are only filled in when
+# RESULT_STATUS is "pass"; for blocked, incomplete, or not_encoded results the
+# encoding field records the status and FAILURE_REASON instead.
+write_fixture() {
+  local result_status reason
+  result_status="$(json_status "${RESULT_STATUS}")"
+  reason="${FAILURE_REASON}"
+
+  jq -n \
+    --arg run_id "${RUN_ID}" \
+    --arg evidence_id "${EVIDENCE_ID}" \
+    --arg evidence_token "${EVIDENCE_TOKEN}" \
+    --arg corpus_text "${CORPUS_TEXT}" \
+    --arg result_status "${result_status}" \
+    --arg failure_reason "${reason}" \
+    '{
+      schema: "elf.real_world_job/v1",
+      job_id: "ragflow-evidence-smoke-001",
+      suite: "retrieval",
+      title: "Map RAGFlow reference chunks to generated evidence",
+      corpus: {
+        corpus_id: "ragflow-generated-public-smoke",
+        profile: "generated_public",
+        items: [
+          {
+            evidence_id: $evidence_id,
+            kind: "document",
+            text: $corpus_text,
+            source_ref: {
+              schema: "source_ref/v1",
+              resolver: "ragflow_smoke/v1",
+              ref: {
+                run_id: $run_id,
+                evidence_token: $evidence_token
+              }
+            },
+            created_at: "2026-06-10T00:00:00Z"
+          }
+        ],
+        adapter_response: {
+          adapter_id: "ragflow_docker_evidence_smoke",
+          answer: {
+            content: (
+              if $result_status == "pass" then
+                "RAGFlow returned reference chunks that map to the generated ragflow-smoke-anchor evidence id."
+              else
+                ""
+              end
+            ),
+            claims: (
+              if $result_status == "pass" then
+                [
+                  {
+                    claim_id: "ragflow_reference_mapping",
+                    text: "RAGFlow reference chunks map to the generated ragflow-smoke-anchor evidence id.",
+                    evidence_ids: [$evidence_id],
+                    confidence: "derived_from_ragflow_reference_chunk_mapping"
+                  }
+                ]
+              else
+                []
+              end
+            ),
+            evidence_ids: (if $result_status == "pass" then [$evidence_id] else [] end),
+            latency_ms: 0.0,
+            cost: {
+              currency: "USD",
+              amount: 0.0,
+              input_tokens: 0,
+              output_tokens: 0
+            }
+          }
+        }
+      },
+      timeline: [
+        {
+          event_id: "ragflow-smoke-corpus-generated",
+          ts: "2026-06-10T00:00:00Z",
+          actor: "system",
+          action: "generated_public_corpus",
+          evidence_ids: [$evidence_id],
+          summary: "The RAGFlow smoke generated a tiny public corpus for reference chunk mapping."
+        }
+      ],
+      prompt: {
+        role: "user",
+        content: "Which RAGFlow smoke evidence token maps to the generated reference chunk?",
+        job_mode: "answer",
+        constraints: ["cite_evidence", "avoid_broad_quality_claims"]
+      },
+      expected_answer: {
+        must_include: [
+          {
+            claim_id: "ragflow_reference_mapping",
+            text: "RAGFlow reference chunks map to the generated ragflow-smoke-anchor evidence id."
+          }
+        ],
+        must_not_include: ["RAGFlow passed a broad graph/RAG quality benchmark."],
+        evidence_links: {
+          ragflow_reference_mapping: [$evidence_id]
+        },
+        answer_type: "direct_answer",
+        accepted_alternates: [],
+        requires_caveat: true,
+        requires_refusal: false
+      },
+      required_evidence: [
+        {
+          evidence_id: $evidence_id,
+          claim_id: "ragflow_reference_mapping",
+          requirement: "cite",
+          quote: "ragflow-smoke-anchor evidence id"
+        }
+      ],
+      negative_traps: [],
+      scoring_rubric: {
+        dimensions: {
+          answer_correctness: {
+            weight: 0.3,
+            max_points: 1.0,
+            criteria: "States the generated evidence mapping without broad quality claims."
+          },
+          evidence_grounding: {
+            weight: 0.45,
+            max_points: 1.0,
+            criteria: "Maps returned RAGFlow reference chunks to the generated evidence id."
+          },
+          trap_avoidance: {
+            weight: 0.15,
+            max_points: 1.0,
+            criteria: "Does not claim broad RAGFlow quality from the tiny smoke."
+          },
+          latency_resource: {
+            weight: 0.1,
+            max_points: 1.0,
+            criteria: "Records setup, resource, provider, and reference-mapping boundaries."
+          }
+        },
+        pass_threshold: 0.75,
+        hard_fail_rules: []
+      },
+      allowed_uncertainty: {
+        can_answer_unknown: false,
+        acceptable_phrases: ["tiny generated corpus", "reference chunk smoke only"],
+        fallback_action: "state_blocker"
+      },
+      operator_debug: null,
+      encoding: {},
+      memory_evolution: null,
+      tags: ["external_adapter", "generated_public", "ragflow", "no_live_claim"]
+    }
+    | if ["blocked", "incomplete", "not_encoded"] | index($result_status) then
+      .encoding = {status: $result_status, reason: $failure_reason}
+    else
+      .
+    end' >"${FIXTURE_PATH}"
+}
diff --git a/scripts/ragflow_smoke/manifest.sh b/scripts/ragflow_smoke/manifest.sh
new file mode 100644
index 00000000..ecbb9b61
--- /dev/null
+++ b/scripts/ragflow_smoke/manifest.sh
@@ -0,0 +1,169 @@
+# RAGFlow Docker evidence smoke helpers: external adapter manifest.
+# Sourced by scripts/ragflow-docker-evidence-smoke.sh.
+# write_manifest: render the elf.real_world_external_adapter_manifest/v1 JSON for this run into MANIFEST_OUT.
+write_manifest() {
+  local generated_at out_rel manifest_rel retrieval_suite_status production_ops_status capability_retrieval_status capability_setup_status
+  generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" # UTC timestamp embedded in the docker_isolation notes
+  out_rel="$(relative_path "${OUT}")"
+  manifest_rel="$(relative_path "${MANIFEST_OUT}")"
+  retrieval_suite_status="$(json_status "${RESULT_STATUS}")" # the retrieval suite mirrors RESULT_STATUS
+  capability_retrieval_status="$(json_status "${RESULT_STATUS}")" # as does the retrieval_reference_mapping capability
+  capability_setup_status="$(json_status "${SETUP_STATUS}")"
+  production_ops_status="not_encoded" # fixed value: this function never derives a production_ops result
+
+  jq -n \
+    --arg generated_at "${generated_at}" \
+    --arg manifest_id "ragflow-docker-evidence-smoke-${RUN_ID}" \
+    --arg out_rel "${out_rel}" \
+    --arg manifest_rel "${manifest_rel}" \
+    --arg evidence_class "${EVIDENCE_CLASS}" \
+    --arg overall_status "$(json_status "${OVERALL_STATUS}")" \
+    --arg setup_status "$(json_status "${SETUP_STATUS}")" \
+    --arg run_status "$(json_status "${RUN_STATUS}")" \
+    --arg result_status "$(json_status "${RESULT_STATUS}")" \
+    --arg retrieval_suite_status "${retrieval_suite_status}" \
+    --arg production_ops_status "${production_ops_status}" \
+    --arg capability_setup_status "${capability_setup_status}" \
+    --arg capability_retrieval_status "${capability_retrieval_status}" \
+    --arg ragflow_image "${RAGFLOW_IMAGE}" \
+    --arg cpu_gpu_mode "${CPU_GPU_MODE}" \
+    --arg failure_reason "${FAILURE_REASON}" \
+    --arg host_global_installs_required "${HOST_GLOBAL_INSTALLS_REQUIRED}" \
+    '{
+      schema: "elf.real_world_external_adapter_manifest/v1",
+      manifest_id: $manifest_id,
+      docker_isolation: {
+        default: true,
+        compose_file: "official RAGFlow docker/docker-compose.yml",
+        runner: "scripts/ragflow-docker-evidence-smoke.sh",
+        artifact_dir: "tmp/real-world-memory/ragflow-smoke",
+        host_global_installs_required: ($host_global_installs_required == "true"),
+        notes: [
+          "Generated by the RAGFlow evidence-smoke script at " + $generated_at + ".",
+          "The smoke uses a generated public corpus and does not use private corpus or operator-owned provider credentials."
+        ]
+      },
+      adapters: [
+        {
+          adapter_id: "ragflow_docker_evidence_smoke",
+          project: "RAGFlow",
+          adapter_kind: "docker_service_evidence_smoke",
+          evidence_class: $evidence_class,
+          docker_default: true,
+          host_global_installs_required: ($host_global_installs_required == "true"),
+          overall_status: $overall_status,
+          setup: {
+            status: $setup_status,
+            evidence: "Official RAGFlow Docker Compose boundary and resource envelope were evaluated for the tiny evidence smoke.",
+            command: "cargo make smoke-ragflow-docker",
+            artifact: $out_rel
+          },
+          run: {
+            status: $run_status,
+            evidence: "The smoke attempts dataset creation, empty-document corpus ingest, chunk insert, retrieval query, and reference chunk extraction.",
+            command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker",
+            artifact: $out_rel
+          },
+          result: {
+            status: $result_status,
+            evidence: (
+              if $failure_reason == "" then "Returned RAGFlow reference chunks were mapped to generated real_world_job evidence ids for the smoke only."
+              else $failure_reason
+              end
+            ),
+            artifact: $out_rel
+          },
+          capabilities: [
+            {
+              capability: "official_docker_service_boundary",
+              status: $capability_setup_status,
+              evidence: "The script uses the official RAGFlow Docker Compose setup and records image, disk, startup, CPU/GPU, and vm.max_map_count evidence."
+            },
+            {
+              capability: "dataset_or_chunk_ingest",
+              status: $run_status,
+              evidence: "The live path creates a generated public dataset, empty document, and chunk before querying."
+            },
+            {
+              capability: "retrieval_reference_mapping",
+              status: $capability_retrieval_status,
+              evidence: "The script maps returned chunk id, document id, document name, dataset id, positions, and similarity fields to benchmark evidence ids."
+            },
+            {
+              capability: "quality_or_scale_claim",
+              status: "not_encoded",
+              evidence: "The smoke does not run broad RAGFlow quality scoring, scale tests, private corpora, or comparative ranking claims."
+            }
+          ],
+          suites: [
+            {
+              suite_id: "retrieval",
+              status: $retrieval_suite_status,
+              evidence: "Only the generated-public RAGFlow evidence-smoke retrieval path is represented."
+            },
+            {
+              suite_id: "production_ops",
+              status: $production_ops_status,
+              evidence: "Resource envelope evidence is recorded, but no production-ops suite scoring is encoded."
+            },
+            {
+              suite_id: "knowledge_compilation",
+              status: "not_encoded",
+              evidence: "RAGFlow page or knowledge-compilation behavior is not part of this smoke."
+            }
+          ],
+          evidence: [
+            {
+              kind: "artifact",
+              ref: $out_rel,
+              status: $result_status
+            },
+            {
+              kind: "manifest",
+              ref: $manifest_rel,
+              status: $overall_status
+            },
+            {
+              kind: "source",
+              ref: "https://ragflow.io/docs/",
+              status: "real"
+            },
+            {
+              kind: "source",
+              ref: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md",
+              status: "real"
+            }
+          ],
+          execution_metadata: {
+            sources: [
+              {
+                label: "RAGFlow quickstart",
+                url: "https://ragflow.io/docs/",
+                evidence: "Official Docker startup, resource envelope, vm.max_map_count, and provider configuration guidance."
+              },
+              {
+                label: "RAGFlow HTTP API reference",
+                url: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md",
+                evidence: "Official dataset, document, chunk, retrieval, and reference-chunk field contract."
+              }
+            ],
+            setup_path: "Run the official RAGFlow Docker Compose stack with generated public corpus only.",
+            runtime_boundary: "Official RAGFlow Docker Compose service boundary; no host-global RAGFlow install.",
+            resource_expectation: (
+              "RAGFlow image " + $ragflow_image + ", CPU/GPU mode " + $cpu_gpu_mode + ", official minimums 4 CPU cores, 16 GB RAM, 50 GB disk, and vm.max_map_count >= 262144."
+            ),
+            retry_guidance: [
+              "Default command records a typed blocked preflight unless resource-heavy startup is explicitly enabled.",
+              "Set ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 for a live Docker startup attempt.",
+              "Provide only a local self-hosted RAGFlow API key; do not use private corpora or operator-owned model provider credentials for this smoke."
+            ],
+            research_depth: "D2 feasibility plus XY-885 evidence-smoke implementation; generated artifact decides live evidence class."
+          },
+          notes: [
+            "This adapter record is generated by a smoke artifact and must not be generalized into broad RAGFlow quality evidence.",
+            "Failure before query output remains typed as blocked, incomplete, or not_encoded."
+          ]
+        }
+      ]
+    }' >"${MANIFEST_OUT}" # overwrites any manifest left by an earlier run
+}
diff --git a/scripts/ragflow_smoke/materialization.sh b/scripts/ragflow_smoke/materialization.sh
new file mode 100644
index 00000000..78725d9d
--- /dev/null
+++ b/scripts/ragflow_smoke/materialization.sh
@@ -0,0 +1,245 @@
+# RAGFlow Docker evidence smoke helper functions.
+# Sourced by scripts/ragflow-docker-evidence-smoke.sh.
+
+write_artifact() { # Write the smoke materialization JSON (statuses, Docker/setup facts, API step responses, evidence mapping) to ${OUT}.
+  local generated_at out_rel manifest_rel fixture_rel report_json_rel report_md_rel docker_status git_status curl_status jq_status
+  generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" # UTC timestamp, second resolution
+  out_rel="$(relative_path "${OUT}")" # artifact paths are recorded as returned by relative_path (helper defined elsewhere)
+  manifest_rel="$(relative_path "${MANIFEST_OUT}")"
+  fixture_rel="$(relative_path "${FIXTURE_PATH}")"
+  report_json_rel="$(relative_path "${REPORT_JSON}")"
+  report_md_rel="$(relative_path "${REPORT_MD}")"
+  docker_status="$(optional_command_status docker)" # per-tool status strings; emitted under docker_boundary.tooling
+  git_status="$(optional_command_status git)"
+  curl_status="$(optional_command_status curl)"
+  jq_status="$(optional_command_status jq)"
+
+  jq -n \
+    --arg schema "elf.ragflow_docker_evidence_smoke/v1" \
+    --arg run_id "${RUN_ID}" \
+    --arg generated_at "${generated_at}" \
+    --arg adapter_id "ragflow_docker_evidence_smoke" \
+    --arg evidence_class "${EVIDENCE_CLASS}" \
+    --arg overall_status "$(json_status "${OVERALL_STATUS}")" \
+    --arg setup_status "$(json_status "${SETUP_STATUS}")" \
+    --arg run_status "$(json_status "${RUN_STATUS}")" \
+    --arg result_status "$(json_status "${RESULT_STATUS}")" \
+    --arg failure_class "${FAILURE_CLASS}" \
+    --arg failure_reason "${FAILURE_REASON}" \
+    --arg out_rel "${out_rel}" \
+    --arg manifest_rel "${manifest_rel}" \
+    --arg fixture_rel "${fixture_rel}" \
+    --arg report_json_rel "${report_json_rel}" \
+    --arg report_md_rel "${report_md_rel}" \
+    --arg artifact_dir "$(relative_path "${ARTIFACT_DIR}")" \
+    --arg work_dir "$(relative_path "${WORK_DIR}")" \
+    --arg repo_url "${RAGFLOW_REPO_URL}" \
+    --arg ragflow_ref "${RAGFLOW_REF}" \
+    --arg ragflow_image "${RAGFLOW_IMAGE}" \
+    --arg compose_project "${COMPOSE_PROJECT}" \
+    --arg cpu_gpu_mode "${CPU_GPU_MODE}" \
+    --arg start_enabled "${START_RAGFLOW}" \
+    --arg accept_resource_envelope "${ACCEPT_RESOURCE_ENVELOPE}" \
+    --arg allow_arm "${ALLOW_ARM}" \
+    --arg pull_image "${PULL_IMAGE}" \
+    --arg cleanup "${CLEANUP}" \
+    --arg api_base "${API_BASE}" \
+    --arg api_key_provided "$([[ -n "${API_KEY}" ]] && printf true || printf false)" \
+    --arg startup_time_ms "${STARTUP_TIME_MS}" \
+    --arg started "${STARTED}" \
+    --arg startup_attempt_count "${STARTUP_ATTEMPTS}" \
+    --arg startup_interval_seconds "${STARTUP_INTERVAL_SECONDS}" \
+    --arg compose_timeout_seconds "${COMPOSE_TIMEOUT_SECONDS}" \
+    --arg evidence_id "${EVIDENCE_ID}" \
+    --arg document_name "${DOCUMENT_NAME}" \
+    --arg evidence_token "${EVIDENCE_TOKEN}" \
+    --arg corpus_text "${CORPUS_TEXT}" \
+    --arg dataset_id "${DATASET_ID}" \
+    --arg document_id "${DOCUMENT_ID}" \
+    --arg chunk_id "${CHUNK_ID}" \
+    --arg vm_max_map_count "${VM_MAX_MAP_COUNT}" \
+    --arg vm_max_map_count_status "${VM_MAX_MAP_COUNT_STATUS}" \
+    --arg vm_max_map_count_action "${VM_MAX_MAP_COUNT_ACTION}" \
+    --arg image_present "${IMAGE_PRESENT}" \
+    --arg image_size_bytes "${IMAGE_SIZE_BYTES}" \
+    --arg host_global_installs_required "${HOST_GLOBAL_INSTALLS_REQUIRED}" \
+    --arg docker_status "${docker_status}" \
+    --arg git_status "${git_status}" \
+    --arg curl_status "${curl_status}" \
+    --arg jq_status "${jq_status}" \
+    --arg dataset_step_status "$(json_status "${DATASET_STEP_STATUS}")" \
+    --arg document_step_status "$(json_status "${DOCUMENT_STEP_STATUS}")" \
+    --arg chunk_step_status "$(json_status "${CHUNK_STEP_STATUS}")" \
+    --arg retrieval_step_status "$(json_status "${RETRIEVAL_STEP_STATUS}")" \
+    --slurpfile docker_info "${DOCKER_INFO}" \
+    --slurpfile image_inspect "${IMAGE_INSPECT}" \
+    --slurpfile reference_mapping "${REFERENCE_MAPPING}" \
+    --rawfile docker_df "${DOCKER_DF}" \
+    --rawfile compose_up_log "${COMPOSE_UP_LOG}" \
+    --rawfile compose_down_log "${COMPOSE_DOWN_LOG}" \
+    --slurpfile dataset_response "${DATASET_RESPONSE}" \
+    --slurpfile document_response "${DOCUMENT_RESPONSE}" \
+    --slurpfile chunk_response "${CHUNK_RESPONSE}" \
+    --slurpfile retrieval_response "${RETRIEVAL_RESPONSE}" \
+    --slurpfile scored_benchmark "${SCORED_BENCHMARK}" \
+    --slurpfile startup_attempts <(jq -s '.' "${STARTUP_ATTEMPTS_JSONL}") \
+    '{
+      schema: $schema,
+      run_id: $run_id,
+      generated_at: $generated_at,
+      adapter_id: $adapter_id,
+      evidence_class: $evidence_class,
+      overall_status: $overall_status,
+      status_source: "smoke_materialization",
+      scored_benchmark: $scored_benchmark[0],
+      no_quality_claim: true,
+      failure: (
+        if $failure_class == "" then null
+        else {
+          class: $failure_class,
+          reason: $failure_reason
+        }
+        end
+      ),
+      artifacts: {
+        smoke: $out_rel,
+        external_adapter_manifest: $manifest_rel,
+        generated_fixture: $fixture_rel,
+        scored_report_json: $report_json_rel,
+        scored_report_markdown: $report_md_rel,
+        artifact_dir: $artifact_dir,
+        work_dir: $work_dir
+      },
+      upstream: {
+        repository: $repo_url,
+        ref: $ragflow_ref,
+        quickstart: "https://ragflow.io/docs/",
+        http_api_reference: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md",
+        api_key_guide: "https://ragflow.io/docs/acquire_ragflow_api_key"
+      },
+      docker_boundary: {
+        status: $setup_status,
+        official_compose_path: "ragflow/docker/docker-compose.yml",
+        compose_project: $compose_project,
+        image: $ragflow_image,
+        device: $cpu_gpu_mode,
+        start_enabled: ($start_enabled == "1"),
+        resource_envelope_accepted: ($accept_resource_envelope == "1"),
+        allow_arm: ($allow_arm == "1"),
+        pull_image_requested: ($pull_image == "1"),
+        cleanup_requested: ($cleanup == "1"),
+        host_global_installs_required: ($host_global_installs_required == "true"),
+        tooling: {
+          docker: $docker_status,
+          git: $git_status,
+          curl: $curl_status,
+          jq: $jq_status
+        }
+      },
+      setup: {
+        status: $setup_status,
+        command: "cargo make smoke-ragflow-docker",
+        live_command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker",
+        started: ($started == "true"),
+        startup_time_ms: (if $startup_time_ms == "" then null else ($startup_time_ms | tonumber) end),
+        vm_max_map_count: {
+          status: $vm_max_map_count_status,
+          observed: (if $vm_max_map_count == "" then null else $vm_max_map_count end),
+          required_min: 262144,
+          action: $vm_max_map_count_action
+        },
+        image: {
+          present: ($image_present == "true"),
+          size_bytes: (if $image_size_bytes == "" then null else ($image_size_bytes | tonumber) end),
+          official_compressed_size_note: "RAGFlow quickstart lists the stable image at about 2 GB compressed.",
+          official_expanded_size_note: "RAGFlow quickstart says the image expands to about 7 GB once unpacked.",
+          inspect: ($image_inspect[0] // [])
+        },
+        resource_envelope: {
+          official_min_cpu_cores: 4,
+          official_min_ram_gb: 16,
+          official_min_disk_gb: 50,
+          docker_info: ($docker_info[0] // {}),
+          docker_system_df: $docker_df
+        },
+        provider_boundaries: {
+          ragflow_api_base: $api_base,
+          ragflow_api_key_provided: ($api_key_provided == "true"),
+          operator_owned_provider_credentials_used: false,
+          private_corpus_used: false,
+          generated_public_corpus_only: true,
+          external_llm_quality_scoring_claimed: false
+        },
+        retry_behavior: {
+          startup_poll_attempts_configured: ($startup_attempt_count | tonumber),
+          startup_interval_seconds: ($startup_interval_seconds | tonumber),
+          compose_timeout_seconds: ($compose_timeout_seconds | tonumber),
+          startup_attempts: ($startup_attempts[0] // [])
+        },
+        log_excerpt: {
+          compose_up: ($compose_up_log | split("\n") | .[0:40]),
+          compose_down: ($compose_down_log | split("\n") | .[0:20])
+        }
+      },
+      corpus: {
+        profile: "generated_public",
+        evidence_id: $evidence_id,
+        document_name: $document_name,
+        evidence_token: $evidence_token,
+        text: $corpus_text,
+        dataset_id: (if $dataset_id == "" then null else $dataset_id end),
+        document_id: (if $document_id == "" then null else $document_id end),
+        chunk_id: (if $chunk_id == "" then null else $chunk_id end)
+      },
+      run: {
+        status: $run_status,
+        steps: {
+          dataset_creation: {
+            status: $dataset_step_status,
+            request_artifact: "dataset-create-request.json",
+            response_artifact: "dataset-create-response.json",
+            response: ($dataset_response[0] // null)
+          },
+          document_creation: {
+            status: $document_step_status,
+            request_artifact: "document-create-request.json",
+            response_artifact: "document-create-response.json",
+            response: ($document_response[0] // null)
+          },
+          chunk_ingest: {
+            status: $chunk_step_status,
+            request_artifact: "chunk-create-request.json",
+            response_artifact: "chunk-create-response.json",
+            response: ($chunk_response[0] // null)
+          },
+          retrieval_query: {
+            status: $retrieval_step_status,
+            request_artifact: "retrieval-request.json",
+            response_artifact: "retrieval-response.json",
+            response: ($retrieval_response[0] // null)
+          }
+        }
+      },
+      result: {
+        status: $result_status,
+        evidence: "RAGFlow retrieval reference chunks are mapped to real_world_job evidence ids when content or document metadata matches the generated public corpus.",
+        reference_chunk_count: (($reference_mapping[0] // []) | length),
+        mapped_reference_chunk_count: (($reference_mapping[0] // []) | map(select((.evidence_ids // []) | length > 0)) | length)
+      },
+      evidence_mapping: {
+        expected_evidence_ids: [$evidence_id],
+        reference_chunks: ($reference_mapping[0] // []),
+        field_mapping: {
+          "id": "chunk_id",
+          "document_id": "document_id",
+          "document_name_or_document_keyword": "document_name",
+          "dataset_id_or_kb_id": "dataset_id",
+          "content_or_content_with_weight": "content",
+          "positions": "positions",
+          "similarity": "similarity",
+          "vector_similarity": "vector_similarity",
+          "term_similarity": "term_similarity"
+        }
+      }
+    }' >"${OUT}" # truncates and rewrites ${OUT} on every call
+}
diff --git a/scripts/ragflow_smoke/scoring.sh b/scripts/ragflow_smoke/scoring.sh
new file mode 100644
index 00000000..367ef776
--- /dev/null
+++ b/scripts/ragflow_smoke/scoring.sh
@@ -0,0 +1,60 @@
+# RAGFlow Docker evidence smoke helper functions.
+# Sourced by scripts/ragflow-docker-evidence-smoke.sh.
+
+write_scored_benchmark() { # Write ${SCORED_BENCHMARK}: a status roll-up of ${REPORT_JSON} when available, otherwise a "pending" placeholder.
+  if [[ -s "${REPORT_JSON}" ]]; then # -s: report exists and is non-empty
+    jq 'def count($key): (.summary[$key] // 0);
+      def scored_status:
+        if count("wrong_result") > 0 then "wrong_result"
+        elif count("lifecycle_fail") > 0 then "lifecycle_fail"
+        elif count("incomplete") > 0 then "incomplete"
+        elif count("blocked") > 0 then "blocked"
+        elif count("not_encoded") > 0 then "not_encoded"
+        elif count("pass") > 0 then "pass"
+        else "not_encoded"
+        end;
+      {
+        schema: "elf.scored_benchmark_status/v1",
+        source: "real_world_job_benchmark",
+        status: scored_status,
+        counts: {
+          pass: count("pass"),
+          wrong_result: count("wrong_result"),
+          lifecycle_fail: count("lifecycle_fail"),
+          incomplete: count("incomplete"),
+          blocked: count("blocked"),
+          not_encoded: count("not_encoded")
+        },
+        job_count: (.summary.job_count // 0),
+        mean_score: (.summary.mean_score // null),
+        evidence_coverage: (.summary.evidence_coverage // null)
+      }' "${REPORT_JSON}" >"${SCORED_BENCHMARK}"
+  else # no usable report yet: record an explicit "pending" status instead of guessing
+    jq -n '{
+      schema: "elf.scored_benchmark_status/v1",
+      source: "real_world_job_benchmark",
+      status: "pending",
+      reason: "The smoke materialization was written before benchmark scoring completed."
+    }' >"${SCORED_BENCHMARK}"
+  fi
+}
+
+write_scored_report() { # Run the elf-eval real_world_job_benchmark binary: "run" writes ${REPORT_JSON}, then "publish" writes ${REPORT_MD}.
+  ( # subshell: the cd below does not change the caller's working directory
+    cd "${ROOT_DIR}"
+    cargo run -p elf-eval --bin real_world_job_benchmark -- run \
+      --fixtures "${FIXTURE_PATH}" \
+      --out "${REPORT_JSON}" \
+      --run-id real-world-memory-live-ragflow \
+      --adapter-id ragflow_docker_evidence_smoke \
+      --adapter-name "RAGFlow Docker evidence smoke adapter" \
+      --adapter-behavior docker_service_evidence_smoke \
+      --adapter-storage-status "$(json_status "${SETUP_STATUS}")" \
+      --adapter-runtime-status "$(json_status "${OVERALL_STATUS}")" \
+      --adapter-notes "Generated by the RAGFlow Docker evidence smoke; pass or wrong_result requires reference chunks mapped to generated evidence ids, while resource/setup/API-key limits remain typed." \
+      --external-adapter-manifest "${MANIFEST_OUT}"
+    cargo run -p elf-eval --bin real_world_job_benchmark -- publish \
+      --report "${REPORT_JSON}" \
+      --out "${REPORT_MD}"
+  )
+}
diff --git a/scripts/ragflow_smoke/summary.sh b/scripts/ragflow_smoke/summary.sh
new file mode 100644
index 00000000..7cb99214
--- /dev/null
+++ b/scripts/ragflow_smoke/summary.sh
@@ -0,0 +1,49 @@
+# RAGFlow Docker evidence smoke helper functions.
+# Sourced by scripts/ragflow-docker-evidence-smoke.sh.
+
+write_summary() { # Combine the materialization (${OUT}), manifest (${MANIFEST_OUT}) and report (${REPORT_JSON}) JSON files into ${SUMMARY_OUT}.
+  jq -n \
+    --slurpfile materialization "${OUT}" \
+    --slurpfile manifest "${MANIFEST_OUT}" \
+    --slurpfile report "${REPORT_JSON}" \
+    '{
+      schema: "elf.ragflow_docker_smoke_summary/v1",
+      generated_at: (now | todateiso8601),
+      adapter_id: "ragflow_docker_evidence_smoke",
+      evidence_class: $materialization[0].evidence_class,
+      status_boundary: {
+        materialization: "setup/run/evidence-mapping state emitted by the smoke runner",
+        manifest: "external adapter declaration consumed by the scorer",
+        scored_benchmark: "post-score real_world_job outcome; use this for quality status"
+      },
+      scored_benchmark: $materialization[0].scored_benchmark,
+      materialization: $materialization[0],
+      manifest: {
+        json: ($materialization[0].artifacts.external_adapter_manifest // "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json"),
+        status_source: "external_adapter_manifest_pre_score",
+        summary: $manifest[0].adapters[0].overall_status,
+        suites: $manifest[0].adapters[0].suites
+      },
+      report: {
+        json: ($materialization[0].artifacts.scored_report_json // "tmp/real-world-memory/ragflow-smoke/ragflow-report.json"),
+        markdown: ($materialization[0].artifacts.scored_report_markdown // "tmp/real-world-memory/ragflow-smoke/ragflow-report.md"),
+        summary: $report[0].summary,
+        suites: $report[0].suites
+      }
+    }' >"${SUMMARY_OUT}"
+}
+
+write_outputs() { # Emit every smoke artifact in order; the scored-benchmark status and the artifact are each written twice (before and after scoring).
+  write_scored_benchmark # pre-score pass: yields "pending" unless a non-empty ${REPORT_JSON} is already present
+  write_artifact
+  write_manifest
+  write_fixture
+  write_scored_report
+  write_scored_benchmark # post-score pass: status is now derived from the report written just above
+  write_artifact # rewritten so the artifact embeds the post-score ${SCORED_BENCHMARK}
+  write_summary
+  echo "RAGFlow smoke artifact: ${OUT}"
+  echo "RAGFlow smoke manifest: ${MANIFEST_OUT}"
+  echo "RAGFlow smoke report: ${REPORT_JSON}"
+  echo "RAGFlow smoke summary: ${SUMMARY_OUT}"
+}
From 2076b24d18f6b4fbe311c792b9b11de943e87e35 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 05:50:18 -0400 Subject: [PATCH 2/4] {"schema":"decodex/commit/1","summary":"Clean Letta smoke context whitespace","authority":"manual"} --- scripts/letta_core_archive_smoke/context.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/letta_core_archive_smoke/context.py b/scripts/letta_core_archive_smoke/context.py index 813eab1f..73e277e9 100644 --- a/scripts/letta_core_archive_smoke/context.py +++ b/scripts/letta_core_archive_smoke/context.py @@ -49,4 +49,3 @@ STARTUP_INTERVAL_SECONDS = float(os.environ.get("ELF_LETTA_STARTUP_INTERVAL_SECONDS", "2")) CORE_KINDS = {"core_block", "core_block_contract", "core_block_event"} - From e4f9f320893fd3a6bc9da633c91964a2373baa7e Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 05:59:48 -0400 Subject: [PATCH 3/4] {"schema":"decodex/commit/1","summary":"Address smoke runner review comments","authority":"manual"} --- scripts/graphiti_temporal_smoke/context.py | 2 -- scripts/graphiti_temporal_smoke/manifest.py | 3 +-- .../materialization.py | 23 ++++++++++++++++++- scripts/graphiti_temporal_smoke/runner.py | 10 -------- scripts/graphiti_temporal_smoke/runtime.py | 21 +++++++++++++++-- scripts/letta_core_archive_smoke/artifacts.py | 16 ++++++++++++- scripts/letta_core_archive_smoke/context.py | 2 -- scripts/letta_core_archive_smoke/runtime.py | 14 +++++++++-- 8 files changed, 69 insertions(+), 22 deletions(-) diff --git a/scripts/graphiti_temporal_smoke/context.py b/scripts/graphiti_temporal_smoke/context.py index 442836e0..6884619d 100644 --- a/scripts/graphiti_temporal_smoke/context.py +++ b/scripts/graphiti_temporal_smoke/context.py @@ -6,8 +6,6 @@ 
from datetime import datetime, timezone from pathlib import Path -from typing import Any - SCRIPT_DIR = Path(__file__).resolve().parent.parent ROOT_DIR = SCRIPT_DIR.parent diff --git a/scripts/graphiti_temporal_smoke/manifest.py b/scripts/graphiti_temporal_smoke/manifest.py index b8b66bd1..e67a81c2 100644 --- a/scripts/graphiti_temporal_smoke/manifest.py +++ b/scripts/graphiti_temporal_smoke/manifest.py @@ -2,11 +2,10 @@ from __future__ import annotations -from pathlib import Path from typing import Any from .common import rel, utc_now, write_json -from .context import * # noqa: F403 +from .context import FALKORDB_HOST, FALKORDB_PORT, GRAPHITI_REF, MANIFEST_OUT, OUT, RUN_ID, TIMEOUT_SECONDS from .models import StatusState def write_manifest(status: StatusState) -> dict[str, Any]: diff --git a/scripts/graphiti_temporal_smoke/materialization.py b/scripts/graphiti_temporal_smoke/materialization.py index f96fd4f0..1658789c 100644 --- a/scripts/graphiti_temporal_smoke/materialization.py +++ b/scripts/graphiti_temporal_smoke/materialization.py @@ -8,7 +8,28 @@ from .benchmark import scored_benchmark from .common import command_to_json, dir_size, file_count, rel, utc_now, write_json -from .context import * # noqa: F403 +from .context import ( + API_BASE, + API_KEY, + EMBEDDING_MODEL, + FALKORDB_DATABASE, + FALKORDB_HOST, + FALKORDB_PASSWORD, + FALKORDB_PORT, + FALKORDB_USERNAME, + GRAPHITI_PACKAGE, + GRAPHITI_REF, + LLM_MODEL, + MANIFEST_OUT, + OUT, + REPORT_JSON, + REPORT_MD, + RUN_ID, + RUN_LIVE, + SUMMARY_OUT, + TIMEOUT_SECONDS, + WORK_DIR, +) from .models import CommandRecord, StatusState def write_materialization( diff --git a/scripts/graphiti_temporal_smoke/runner.py b/scripts/graphiti_temporal_smoke/runner.py index 16c20989..66ed3e40 100644 --- a/scripts/graphiti_temporal_smoke/runner.py +++ b/scripts/graphiti_temporal_smoke/runner.py @@ -120,16 +120,6 @@ def main() -> int: status.failure_reason = mapping["reason"] fixture_path = write_fixture(facts, status, 
mapping) - materialization = write_materialization( - status, - facts, - fixture_path, - command_records, - inserted, - search_results, - mapping, - started_at, - ) manifest = write_manifest(status) report = run_scored_report(fixture_path, MANIFEST_OUT, status) materialization = write_materialization( diff --git a/scripts/graphiti_temporal_smoke/runtime.py b/scripts/graphiti_temporal_smoke/runtime.py index ffe6dbab..ab224fee 100644 --- a/scripts/graphiti_temporal_smoke/runtime.py +++ b/scripts/graphiti_temporal_smoke/runtime.py @@ -10,8 +10,25 @@ from pathlib import Path from typing import Any -from .common import run_command, write_json -from .context import * # noqa: F403 +from .common import rel, run_command, write_json +from .context import ( + API_BASE, + API_KEY, + EMBEDDING_MODEL, + FALKORDB_DATABASE, + FALKORDB_HOST, + FALKORDB_PASSWORD, + FALKORDB_PORT, + FALKORDB_USERNAME, + GRAPHITI_PACKAGE, + INSTALL_GRAPHITI, + LLM_MODEL, + LOG_DIR, + RUN_ID, + STARTUP_ATTEMPTS, + STARTUP_INTERVAL_SECONDS, + WORK_DIR, +) from .corpus import temporal_facts from .models import CommandRecord diff --git a/scripts/letta_core_archive_smoke/artifacts.py b/scripts/letta_core_archive_smoke/artifacts.py index e4e12276..c631b31e 100644 --- a/scripts/letta_core_archive_smoke/artifacts.py +++ b/scripts/letta_core_archive_smoke/artifacts.py @@ -8,7 +8,21 @@ from .benchmark import scored_benchmark from .common import command_to_json, rel, utc_now, write_json -from .context import * # noqa: F403 +from .context import ( + LETTA_BASE_URL, + LETTA_CLIENT_REF, + LETTA_EMBEDDING, + LETTA_MODEL, + MANIFEST_OUT, + OUT, + REPORT_JSON, + REPORT_MD, + RUN_ID, + RUN_LIVE, + SUMMARY_OUT, + TIMEOUT_SECONDS, + WORK_DIR, +) from .fixtures import benchmark_input_contract from .models import CommandRecord, StatusState diff --git a/scripts/letta_core_archive_smoke/context.py b/scripts/letta_core_archive_smoke/context.py index 73e277e9..63152153 100644 --- a/scripts/letta_core_archive_smoke/context.py 
+++ b/scripts/letta_core_archive_smoke/context.py @@ -6,8 +6,6 @@ from datetime import datetime, timezone from pathlib import Path -from typing import Any - SCRIPT_DIR = Path(__file__).resolve().parent.parent ROOT_DIR = SCRIPT_DIR.parent diff --git a/scripts/letta_core_archive_smoke/runtime.py b/scripts/letta_core_archive_smoke/runtime.py index 5eddd2b7..fa9c04be 100644 --- a/scripts/letta_core_archive_smoke/runtime.py +++ b/scripts/letta_core_archive_smoke/runtime.py @@ -3,7 +3,7 @@ from __future__ import annotations import json -import textwrap +import sys import time import urllib.error import urllib.request @@ -11,7 +11,17 @@ from typing import Any from .common import run_command, write_json -from .context import * # noqa: F403 +from .context import ( + INSTALL_CLIENT, + LETTA_BASE_URL, + LETTA_CLIENT_PACKAGE, + LETTA_EMBEDDING, + LETTA_MODEL, + RUN_ID, + STARTUP_ATTEMPTS, + STARTUP_INTERVAL_SECONDS, + WORK_DIR, +) from .fixtures import benchmark_input_contract, slug from .models import CommandRecord From 842b8f927e034aaab53225e94d2410d65a862ed4 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 06:05:56 -0400 Subject: [PATCH 4/4] {"schema":"decodex/commit/1","summary":"Remove unused Letta runtime import","authority":"manual"} --- scripts/letta_core_archive_smoke/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/letta_core_archive_smoke/runtime.py b/scripts/letta_core_archive_smoke/runtime.py index fa9c04be..d3959cc5 100644 --- a/scripts/letta_core_archive_smoke/runtime.py +++ b/scripts/letta_core_archive_smoke/runtime.py @@ -22,7 +22,7 @@ STARTUP_INTERVAL_SECONDS, WORK_DIR, ) -from .fixtures import benchmark_input_contract, slug +from .fixtures import benchmark_input_contract from .models import CommandRecord def wait_for_letta(command_records: list[CommandRecord]) -> bool: