Skip to content

Commit 03a0568

Browse files
authored
Evaluation: Added type for dataset (#641)
1 parent a0c1f24 commit 03a0568

4 files changed

Lines changed: 355 additions & 1 deletion

File tree

backend/app/crud/evaluations/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from app.crud.evaluations.score import EvaluationScore
1212
from app.models import EvaluationRun
1313
from app.models.llm.request import ConfigBlob, LLMCallConfig
14+
from app.models.stt_evaluation import EvaluationType
1415
from app.services.llm.jobs import resolve_config_blob
1516

1617
from app.core.db import engine
@@ -80,6 +81,7 @@ def create_evaluation_run(
8081
run_name=run_name,
8182
dataset_name=dataset_name,
8283
dataset_id=dataset_id,
84+
type=EvaluationType.TEXT.value,
8385
config_id=config_id,
8486
config_version=config_version,
8587
status="pending",
@@ -129,6 +131,7 @@ def list_evaluation_runs(
129131
select(EvaluationRun)
130132
.where(EvaluationRun.organization_id == organization_id)
131133
.where(EvaluationRun.project_id == project_id)
134+
.where(EvaluationRun.type == EvaluationType.TEXT.value)
132135
.order_by(EvaluationRun.inserted_at.desc())
133136
.limit(limit)
134137
.offset(offset)
@@ -167,6 +170,7 @@ def get_evaluation_run_by_id(
167170
.where(EvaluationRun.id == evaluation_id)
168171
.where(EvaluationRun.organization_id == organization_id)
169172
.where(EvaluationRun.project_id == project_id)
173+
.where(EvaluationRun.type == EvaluationType.TEXT.value)
170174
)
171175

172176
eval_run = session.exec(statement).first()

backend/app/crud/evaluations/dataset.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
)
2323
from app.core.util import now
2424
from app.models import EvaluationDataset, EvaluationRun
25+
from app.models.stt_evaluation import EvaluationType
2526

2627
logger = logging.getLogger(__name__)
2728

@@ -60,6 +61,7 @@ def create_evaluation_dataset(
6061
dataset = EvaluationDataset(
6162
name=name,
6263
description=description,
64+
type=EvaluationType.TEXT.value,
6365
dataset_metadata=dataset_metadata,
6466
object_store_url=object_store_url,
6567
langfuse_dataset_id=langfuse_dataset_id,
@@ -122,6 +124,7 @@ def get_dataset_by_id(
122124
.where(EvaluationDataset.id == dataset_id)
123125
.where(EvaluationDataset.organization_id == organization_id)
124126
.where(EvaluationDataset.project_id == project_id)
127+
.where(EvaluationDataset.type == EvaluationType.TEXT.value)
125128
)
126129

127130
dataset = session.exec(statement).first()
@@ -158,6 +161,7 @@ def get_dataset_by_name(
158161
.where(EvaluationDataset.name == name)
159162
.where(EvaluationDataset.organization_id == organization_id)
160163
.where(EvaluationDataset.project_id == project_id)
164+
.where(EvaluationDataset.type == EvaluationType.TEXT.value)
161165
)
162166

163167
dataset = session.exec(statement).first()
@@ -194,6 +198,7 @@ def list_datasets(
194198
select(EvaluationDataset)
195199
.where(EvaluationDataset.organization_id == organization_id)
196200
.where(EvaluationDataset.project_id == project_id)
201+
.where(EvaluationDataset.type == EvaluationType.TEXT.value)
197202
.order_by(EvaluationDataset.inserted_at.desc())
198203
.limit(limit)
199204
.offset(offset)
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
from uuid import uuid4
2+
3+
from sqlmodel import Session, select
4+
5+
from app.core.util import now
6+
from app.crud.evaluations.core import (
7+
create_evaluation_run,
8+
get_evaluation_run_by_id,
9+
list_evaluation_runs,
10+
)
11+
from app.crud.evaluations.dataset import create_evaluation_dataset
12+
from app.models import EvaluationRun, Organization, Project
13+
from app.models.stt_evaluation import EvaluationType
14+
15+
16+
def _create_config(db: Session, project_id: int) -> tuple[int, int]:
    """Create a Config with one ConfigVersion for use by evaluation-run tests.

    Args:
        db: Active database session (pytest fixture).
        project_id: Project the config belongs to.

    Returns:
        A ``(config_id, config_version)`` pair suitable for passing straight
        to ``create_evaluation_run``.
    """
    # Imported locally so this helper module stays importable even if
    # app.models.config has heavy import-time dependencies.
    from app.models.config import Config, ConfigVersion

    config = Config(
        name="test_config",
        project_id=project_id,
        inserted_at=now(),
        updated_at=now(),
    )
    db.add(config)
    db.commit()
    # Refresh to populate the DB-generated primary key before we reference it.
    db.refresh(config)

    config_version = ConfigVersion(
        config_id=config.id,
        version=1,
        config_blob={"completion": {"params": {"model": "gpt-4o"}}},
        inserted_at=now(),
        updated_at=now(),
    )
    db.add(config_version)
    db.commit()
    db.refresh(config_version)

    return config.id, config_version.version
44+
class TestCreateEvaluationRun:
    """Tests for ``create_evaluation_run``."""

    def test_create_evaluation_run_sets_text_type(self, db: Session) -> None:
        """A freshly created evaluation run is tagged with the TEXT type."""
        # Reuse the organization/project already seeded in the test database.
        org = db.exec(select(Organization)).first()
        project = db.exec(
            select(Project).where(Project.organization_id == org.id)
        ).first()

        dataset = create_evaluation_dataset(
            session=db,
            name="test_dataset_run_type",
            dataset_metadata={"original_items_count": 10},
            organization_id=org.id,
            project_id=project.id,
        )
        config_id, config_version = _create_config(db, project.id)

        run = create_evaluation_run(
            session=db,
            run_name="test_run",
            dataset_name=dataset.name,
            dataset_id=dataset.id,
            config_id=config_id,
            config_version=config_version,
            organization_id=org.id,
            project_id=project.id,
        )

        assert run.id is not None
        assert run.type == EvaluationType.TEXT.value
        assert run.status == "pending"
        assert run.run_name == "test_run"
class TestGetEvaluationRunById:
    """Tests for ``get_evaluation_run_by_id``."""

    @staticmethod
    def _org_and_project(db: Session):
        """Return the seeded ``(organization, project)`` pair."""
        org = db.exec(select(Organization)).first()
        project = db.exec(
            select(Project).where(Project.organization_id == org.id)
        ).first()
        return org, project

    def test_get_evaluation_run_by_id_success(self, db: Session) -> None:
        """An existing TEXT run is retrievable by its ID."""
        org, project = self._org_and_project(db)

        dataset = create_evaluation_dataset(
            session=db,
            name="test_dataset_get_run",
            dataset_metadata={"original_items_count": 10},
            organization_id=org.id,
            project_id=project.id,
        )
        config_id, config_version = _create_config(db, project.id)
        created = create_evaluation_run(
            session=db,
            run_name="test_get_run",
            dataset_name=dataset.name,
            dataset_id=dataset.id,
            config_id=config_id,
            config_version=config_version,
            organization_id=org.id,
            project_id=project.id,
        )

        fetched = get_evaluation_run_by_id(
            session=db,
            evaluation_id=created.id,
            organization_id=org.id,
            project_id=project.id,
        )

        assert fetched is not None
        assert fetched.id == created.id
        assert fetched.run_name == "test_get_run"

    def test_get_evaluation_run_by_id_not_found(self, db: Session) -> None:
        """An unknown ID yields None."""
        org, project = self._org_and_project(db)

        missing = get_evaluation_run_by_id(
            session=db,
            evaluation_id=99999,
            organization_id=org.id,
            project_id=project.id,
        )

        assert missing is None

    def test_get_evaluation_run_by_id_excludes_non_text_type(self, db: Session) -> None:
        """Runs whose type is not TEXT are invisible to the getter."""
        org, project = self._org_and_project(db)

        dataset = create_evaluation_dataset(
            session=db,
            name="test_dataset_exclude_run",
            dataset_metadata={"original_items_count": 10},
            organization_id=org.id,
            project_id=project.id,
        )
        config_id, config_version = _create_config(db, project.id)
        run = create_evaluation_run(
            session=db,
            run_name="test_stt_run",
            dataset_name=dataset.name,
            dataset_id=dataset.id,
            config_id=config_id,
            config_version=config_version,
            organization_id=org.id,
            project_id=project.id,
        )

        # Flip the stored type to STT after creation to simulate a non-text run.
        run.type = EvaluationType.STT.value
        db.add(run)
        db.commit()

        assert (
            get_evaluation_run_by_id(
                session=db,
                evaluation_id=run.id,
                organization_id=org.id,
                project_id=project.id,
            )
            is None
        )
class TestListEvaluationRuns:
    """Tests for ``list_evaluation_runs``."""

    def test_list_evaluation_runs_empty(self, db: Session) -> None:
        """With no runs persisted, the listing comes back empty."""
        org = db.exec(select(Organization)).first()
        project = db.exec(
            select(Project).where(Project.organization_id == org.id)
        ).first()

        runs = list_evaluation_runs(
            session=db, organization_id=org.id, project_id=project.id
        )

        assert len(runs) == 0

    def test_list_evaluation_runs_excludes_non_text_type(self, db: Session) -> None:
        """Only TEXT-typed runs appear in the listing."""
        org = db.exec(select(Organization)).first()
        project = db.exec(
            select(Project).where(Project.organization_id == org.id)
        ).first()

        dataset = create_evaluation_dataset(
            session=db,
            name="test_dataset_list_runs",
            dataset_metadata={"original_items_count": 10},
            organization_id=org.id,
            project_id=project.id,
        )
        config_id, config_version = _create_config(db, project.id)

        # Keyword arguments shared by every run created below.
        shared = dict(
            session=db,
            dataset_name=dataset.name,
            dataset_id=dataset.id,
            config_id=config_id,
            config_version=config_version,
            organization_id=org.id,
            project_id=project.id,
        )

        # Three TEXT runs that should all be returned.
        for idx in range(3):
            create_evaluation_run(run_name=f"text_run_{idx}", **shared)

        # One run mutated to STT after creation; it must be filtered out.
        other = create_evaluation_run(run_name="stt_run", **shared)
        other.type = EvaluationType.STT.value
        db.add(other)
        db.commit()

        runs = list_evaluation_runs(
            session=db, organization_id=org.id, project_id=project.id
        )

        assert len(runs) == 3
        assert all(r.type == EvaluationType.TEXT.value for r in runs)

0 commit comments

Comments (0)