Skip to content
57 changes: 18 additions & 39 deletions backend/app/api/routes/stt_evaluations/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@

import logging

from asgi_correlation_id import correlation_id
from fastapi import APIRouter, Body, Depends, HTTPException, Query

from app.api.deps import AuthContextDep, SessionDep
from app.api.permissions import Permission, require_permission
from app.celery.utils import start_low_priority_job
from app.crud.stt_evaluations import (
create_stt_run,
create_stt_results,
get_results_by_run_id,
get_samples_by_dataset_id,
get_stt_dataset_by_id,
get_stt_run_by_id,
list_stt_runs,
start_stt_evaluation_batch,
update_stt_run,
)
from app.models.stt_evaluation import (
Expand Down Expand Up @@ -80,56 +79,36 @@ def start_stt_evaluation(
total_items=sample_count * len(run_create.models),
)

# Get samples for the dataset
samples = get_samples_by_dataset_id(
session=_session,
dataset_id=run_create.dataset_id,
org_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
)

# Create result records for each sample and model
create_stt_results(
session=_session,
samples=samples,
evaluation_run_id=run.id,
org_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
models=run_create.models,
)

# Offload batch submission (signed URLs, JSONL, Gemini upload) to Celery worker
trace_id = correlation_id.get() or "N/A"
try:
batch_result = start_stt_evaluation_batch(
session=_session,
run=run,
samples=samples,
org_id=auth_context.organization_.id,
celery_task_id = start_low_priority_job(
function_path="app.services.stt_evaluations.batch_job.execute_batch_submission",
project_id=auth_context.project_.id,
job_id=str(run.id),
trace_id=trace_id,
organization_id=auth_context.organization_.id,
dataset_id=run_create.dataset_id,
)
logger.info(
f"[start_stt_evaluation] STT evaluation batch submitted | "
f"run_id: {run.id}, batch_jobs: {list(batch_result.get('batch_jobs', {}).keys())}"
f"[start_stt_evaluation] Batch submission queued | "
f"run_id: {run.id}, celery_task_id: {celery_task_id}"
)
except Exception as e:
logger.error(
f"[start_stt_evaluation] Batch submission failed | "
f"[start_stt_evaluation] Failed to queue batch submission | "
f"run_id: {run.id}, error: {str(e)}"
)
update_stt_run(
session=_session,
run_id=run.id,
status="failed",
error_message=str(e),
error_message=f"Failed to queue batch submission: {str(e)}",
)
raise HTTPException(
status_code=500,
detail=f"Failed to queue batch submission: {e}",
)
Comment on lines +108 to 111
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Avoid leaking internal exception details in the HTTP response.

detail=f"Failed to queue batch submission: {e}" exposes internal error information to the API client. Use a generic message instead; the detailed error is already logged on line 99.

Suggested fix
         raise HTTPException(
             status_code=500,
-            detail=f"Failed to queue batch submission: {e}",
+            detail="Failed to start evaluation. Please try again later.",
         )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
raise HTTPException(
status_code=500,
detail=f"Failed to queue batch submission: {e}",
)
raise HTTPException(
status_code=500,
detail="Failed to start evaluation. Please try again later.",
)
🤖 Prompt for AI Agents
In `@backend/app/api/routes/stt_evaluations/evaluation.py` around lines 108 - 111,
The HTTPException currently returns internal exception text via detail=f"Failed
to queue batch submission: {e}"; change this to the generic message
detail="Failed to start evaluation. Please try again later." (matching the
committable suggestion above) so internal error details are not
leaked to clients, keep the status_code=500, and ensure the original exception
is still logged (the existing logger call earlier in the surrounding function in
evaluation.py should remain unchanged); locate the raise HTTPException(...) in
the batch submission handler in evaluation.py and remove the interpolated
exception from the detail string.

raise HTTPException(status_code=500, detail=f"Batch submission failed: {e}")

# Refresh run to get updated status
run = get_stt_run_by_id(
session=_session,
run_id=run.id,
org_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
)

return APIResponse.success_response(
data=STTEvaluationRunPublic(
Expand Down
12 changes: 0 additions & 12 deletions backend/app/api/routes/stt_evaluations/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,6 @@ def update_result_feedback(
f"result_id: {result_id}, is_correct: {feedback.is_correct}"
)

# Verify result exists and belongs to this project
existing = get_stt_result_by_id(
session=_session,
result_id=result_id,
org_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
)

if not existing:
raise HTTPException(status_code=404, detail="Result not found")

# Update feedback
result = update_human_feedback(
session=_session,
result_id=result_id,
Expand Down
3 changes: 2 additions & 1 deletion backend/app/core/batch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Batch processing infrastructure for LLM providers."""

from .base import BatchProvider
from .base import BATCH_KEY, BatchProvider
from .gemini import BatchJobState, GeminiBatchProvider, create_stt_batch_requests
from .openai import OpenAIBatchProvider
from .operations import (
Expand All @@ -12,6 +12,7 @@
from .polling import poll_batch_status

__all__ = [
"BATCH_KEY",
"BatchProvider",
"BatchJobState",
"GeminiBatchProvider",
Expand Down
6 changes: 5 additions & 1 deletion backend/app/core/batch/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
from abc import ABC, abstractmethod
from typing import Any

# Unified key used across all batch providers to identify individual requests/responses.
# OpenAI uses "custom_id" natively; Gemini uses "key" but we normalize to this constant.
BATCH_KEY = "custom_id"


class BatchProvider(ABC):
"""Abstract base class for LLM batch providers (OpenAI, Anthropic, etc.)."""
Expand Down Expand Up @@ -61,7 +65,7 @@ def download_batch_results(self, output_file_id: str) -> list[dict[str, Any]]:

Returns:
List of result dictionaries, each containing:
- custom_id: Item identifier from input
- BATCH_KEY: Item identifier from input
- response: Provider's response data
- error: Error info (if item failed)
- Any other provider-specific result data
Expand Down
8 changes: 4 additions & 4 deletions backend/app/core/batch/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from app.core.storage_utils import get_mime_from_url

from .base import BatchProvider
from .base import BATCH_KEY, BatchProvider

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -188,7 +188,7 @@ def download_batch_results(

Returns:
List of result dictionaries, each containing:
- custom_id: Item key from input
- BATCH_KEY: Item key from input
- response: Dict with "text" key containing the generated text
- error: Error info (if item failed), None otherwise
"""
Expand Down Expand Up @@ -225,15 +225,15 @@ def download_batch_results(
text = self._extract_text_from_response_dict(response_obj)
results.append(
{
"custom_id": custom_id,
BATCH_KEY: custom_id,
"response": {"text": text},
"error": None,
}
)
elif parsed.get("error"):
results.append(
{
"custom_id": custom_id,
BATCH_KEY: custom_id,
"response": None,
"error": str(parsed["error"]),
}
Expand Down
2 changes: 1 addition & 1 deletion backend/app/core/batch/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def download_batch_results(self, output_file_id: str) -> list[dict[str, Any]]:

Returns:
List of result dictionaries, each containing:
- custom_id: Item identifier from input
- BATCH_KEY: Item identifier from input
- response: OpenAI response data (body, status_code, request_id)
- error: Error info (if item failed)

Expand Down
6 changes: 4 additions & 2 deletions backend/app/crud/evaluations/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from openai import OpenAI
from sqlmodel import Session

from app.core.batch.base import BATCH_KEY

from app.core.batch import OpenAIBatchProvider, start_batch_job
from app.models import EvaluationRun
from app.models.llm.request import KaapiLLMParams
Expand Down Expand Up @@ -66,7 +68,7 @@ def build_evaluation_jsonl(
Build JSONL data for evaluation batch using OpenAI Responses API.

Each line is a dict with:
- custom_id: Unique identifier for the request (dataset item ID)
- BATCH_KEY: Unique identifier for the request (dataset item ID)
- method: POST
- url: /v1/responses
- body: Response request using config as-is with input from dataset
Expand Down Expand Up @@ -124,7 +126,7 @@ def build_evaluation_jsonl(
]

batch_request = {
"custom_id": item["id"],
BATCH_KEY: item["id"],
"method": "POST",
"url": "/v1/responses",
"body": body,
Expand Down
13 changes: 7 additions & 6 deletions backend/app/crud/evaluations/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from sqlmodel import Session

from app.core.batch import OpenAIBatchProvider, start_batch_job
from app.core.batch.base import BATCH_KEY
from app.core.util import now
from app.models import EvaluationRun

Expand Down Expand Up @@ -58,7 +59,7 @@ def build_embedding_jsonl(
Build JSONL data for embedding batch using OpenAI Embeddings API.

Each line is a dict with:
- custom_id: Langfuse trace_id (for direct score updates)
- BATCH_KEY: Langfuse trace_id (for direct score updates)
- method: POST
- url: /v1/embeddings
- body: Embedding request with input array [output, ground_truth]
Expand Down Expand Up @@ -110,9 +111,9 @@ def build_embedding_jsonl(
continue

# Build the batch request object for Embeddings API
# Use trace_id as custom_id for direct score updates
# Use trace_id as BATCH_KEY for direct score updates
batch_request = {
"custom_id": trace_id,
BATCH_KEY: trace_id,
"method": "POST",
"url": "/v1/embeddings",
"body": {
Expand Down Expand Up @@ -155,10 +156,10 @@ def parse_embedding_results(raw_results: list[dict[str, Any]]) -> list[dict[str,

for line_num, response in enumerate(raw_results, 1):
try:
# Extract custom_id (which is now the Langfuse trace_id)
trace_id = response.get("custom_id")
# Extract BATCH_KEY (which is now the Langfuse trace_id)
trace_id = response.get(BATCH_KEY)
if not trace_id:
logger.warning(f"Line {line_num}: No custom_id found, skipping")
logger.warning(f"Line {line_num}: No {BATCH_KEY} found, skipping")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Missing [function_name] log prefix — violates coding guideline.

The changed log message at line 162 is missing the required [parse_embedding_results] prefix.

🛠️ Proposed fix
-                logger.warning(f"Line {line_num}: No {BATCH_KEY} found, skipping")
+                logger.warning(f"[parse_embedding_results] Line {line_num}: No {BATCH_KEY} found, skipping")

As per coding guidelines: "Prefix all log messages with the function name in square brackets: logger.info(f"[function_name] Message ...")".

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
logger.warning(f"Line {line_num}: No {BATCH_KEY} found, skipping")
logger.warning(f"[parse_embedding_results] Line {line_num}: No {BATCH_KEY} found, skipping")
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/crud/evaluations/embeddings.py` at line 162, The log call in
parse_embedding_results currently uses logger.warning(f"Line {line_num}: No
{BATCH_KEY} found, skipping") and violates the guideline requiring a
function-name prefix; update that call to include the prefix
"[parse_embedding_results]" at the start of the message so it becomes
logger.warning(f"[parse_embedding_results] Line {line_num}: No {BATCH_KEY}
found, skipping"), preserving the existing variables (line_num, BATCH_KEY) and
message text.

continue

# Handle errors in batch processing
Expand Down
7 changes: 4 additions & 3 deletions backend/app/crud/evaluations/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
poll_batch_status,
upload_batch_results_to_object_store,
)
from app.core.batch.base import BATCH_KEY
from app.crud.evaluations.batch import fetch_dataset_items
from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config
from app.crud.evaluations.embeddings import (
Expand Down Expand Up @@ -81,11 +82,11 @@ def parse_evaluation_output(

for line_num, response in enumerate(raw_results, 1):
try:
# Extract custom_id (which is our dataset item ID)
item_id = response.get("custom_id")
# Extract BATCH_KEY (which is our dataset item ID)
item_id = response.get(BATCH_KEY)
if not item_id:
logger.warning(
f"[parse_evaluation_output] No custom_id found, skipping | line={line_num}"
f"[parse_evaluation_output] No {BATCH_KEY} found, skipping | line={line_num}"
)
continue

Expand Down
2 changes: 0 additions & 2 deletions backend/app/crud/stt_evaluations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
update_stt_run,
)
from .result import (
create_stt_results,
get_stt_result_by_id,
get_results_by_run_id,
update_human_feedback,
Expand All @@ -39,7 +38,6 @@
"list_stt_runs",
"update_stt_run",
# Result
"create_stt_results",
"get_stt_result_by_id",
"get_results_by_run_id",
"update_human_feedback",
Expand Down
Loading