ProjectTech4DevAI · Prajna1999 · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026 · Jan 21, 2026
diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -13,6 +13,7 @@ WORKDIR /app/
 RUN apt-get update && apt-get install -y \
     curl \
     poppler-utils \
+    ffmpeg \
  && rm -rf /var/lib/apt/lists/*
 
 # Install uv package manager

diff --git a/backend/app/core/audio_utils.py b/backend/app/core/audio_utils.py
@@ -0,0 +1,54 @@
+"""
+Audio processing utilities for format conversion.
+
+This module provides utilities for converting audio between different formats,
+particularly for TTS output post-processing.
+"""
+import io
+import logging
+from pydub import AudioSegment
+
+
+logger = logging.getLogger(__name__)
+
+
+def _export_pcm(
+    pcm_bytes: bytes, sample_rate: int, export_format: str, **export_options: object
+) -> tuple[bytes | None, str | None]:
+    """Wrap raw PCM in an AudioSegment and export it in the requested format.
+
+    The PCM is passed to AudioSegment as mono (channels=1) with a 2-byte
+    sample width. Returns ``(encoded_bytes, None)`` on success, or
+    ``(None, error_message)`` if building or exporting the segment raises.
+    """
+    try:
+        audio = AudioSegment(
+            data=pcm_bytes, sample_width=2, frame_rate=sample_rate, channels=1
+        )
+        output_buffer = io.BytesIO()
+        audio.export(output_buffer, format=export_format, **export_options)
+        return output_buffer.getvalue(), None
+    except Exception as e:
+        # Best-effort by design: the caller receives the message, and the
+        # traceback is logged here so the failure is not silently dropped.
+        logger.error(
+            f"[_export_pcm] Failed to convert PCM to {export_format}: {e}",
+            exc_info=True,
+        )
+        return None, str(e)
+
+
+def convert_pcm_to_mp3(
+    pcm_bytes: bytes, sample_rate: int = 24000
+) -> tuple[bytes | None, str | None]:
+    """Convert raw PCM to MP3 at a 192k bitrate."""
+    return _export_pcm(pcm_bytes, sample_rate, "mp3", bitrate="192k")
+
+
+def convert_pcm_to_ogg(
+    pcm_bytes: bytes, sample_rate: int = 24000
+) -> tuple[bytes | None, str | None]:
+    """Convert raw PCM to OGG with Opus codec."""
+    return _export_pcm(
+        pcm_bytes, sample_rate, "ogg", codec="libopus", parameters=["-b:a", "64k"]
+    )
diff --git a/backend/app/crud/llm.py b/backend/app/crud/llm.py
@@ -1,18 +1,10 @@
-"""
-CRUD operations for LLM calls.
-
-This module handles database operations for LLM calls including:
-1. Creating new LLM call records
-2. Updating LLM call responses
-3. Fetching LLM calls by ID
-"""
-
 import logging
 from typing import Any, Literal
 
 from uuid import UUID
 from sqlmodel import Session, select
 from app.core.util import now
+import base64
 import json
 from app.models.llm import LlmCall, LLMCallRequest, ConfigBlob
 from app.models.llm.request import (
@@ -41,7 +33,8 @@ def serialize_input(query_input: QueryInput | str) -> str:
                 "type": "audio",
                 "format": query_input.content.format,
                 "mime_type": query_input.content.mime_type,
-                "size_bytes": len(query_input.content.value),
+                # approximate decoded byte size of the base64 value (ignores "=" padding)
+                "size_bytes": len(query_input.content.value) * 3 // 4,
             }
         )
     else:
@@ -74,8 +67,10 @@ def create_llm_call(
     """
     # Determine input/output types based on completion config type
     completion_config = resolved_config.completion
-    completion_type = completion_config.type or getattr(
-        completion_config.params, "type", "text"
+    completion_type = completion_config.type or (
+        completion_config.params.get("type", "text")
+        if isinstance(completion_config.params, dict)
+        else getattr(completion_config.params, "type", "text")
     )
 
     input_type: Literal["text", "audio", "image"]
@@ -92,9 +87,9 @@ def create_llm_call(
         output_type = "text"
 
     model = (
-        completion_config.params.model
-        if hasattr(completion_config.params, "model")
-        else completion_config.params.get("model", "")
+        completion_config.params.get("model", "")
+        if isinstance(completion_config.params, dict)
+        else getattr(completion_config.params, "model", "")
     )
 
     # Build config dict for storage
@@ -174,8 +169,29 @@ def update_llm_call_response(
 
     if provider_response_id is not None:
         db_llm_call.provider_response_id = provider_response_id
+
     if content is not None:
+        # For audio outputs (AudioOutput model): calculate size metadata from base64 content
+        # AudioOutput serializes as: {"type": "audio", "content": {"format": "base64", "value": "...", "mime_type": "..."}}
+        if content.get("type") == "audio":
+            # Guard the nested lookup: an explicit `"content": None` (or any
+            # non-dict payload) must not raise outside the try block below.
+            audio_payload = content.get("content")
+            audio_value = (
+                audio_payload.get("value") if isinstance(audio_payload, dict) else None
+            )
+            if audio_value:
+                try:
+                    audio_data = base64.b64decode(audio_value)
+                    content["audio_size_bytes"] = len(audio_data)
+                except (ValueError, TypeError) as e:
+                    # binascii.Error (malformed base64) is a ValueError subclass.
+                    logger.warning(
+                        f"[update_llm_call_response] Failed to calculate audio size: {e}"
+                    )
+
         db_llm_call.content = content
+
     if usage is not None:
         db_llm_call.usage = usage
     if conversation_id is not None:

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
@@ -1,14 +1,11 @@
+import sqlalchemy as sa
 from typing import Annotated, Any, Literal, Union
-
 from uuid import UUID, uuid4
-from sqlmodel import Field, SQLModel
-from pydantic import Discriminator, model_validator, HttpUrl
+from pydantic import model_validator, HttpUrl
 from datetime import datetime
-from app.core.util import now
-
-import sqlalchemy as sa
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlmodel import Field, SQLModel, Index, text
+from app.core.util import now
 
 
 class TextLLMParams(SQLModel):
@@ -70,8 +67,8 @@ class TextContent(SQLModel):
 
 class AudioContent(SQLModel):
     format: Literal["base64"] = "base64"
-    value: str = Field(..., min_length=1, description="Base64 encoded audio")
-    # keeping the mime_type liberal here, since does not affect transcription type
+    value: str = Field(..., description="Base64 encoded audio")
+    # keeping the mime_type liberal here, since it does not affect base64 encoding
     mime_type: str | None = Field(
         None,
         description="MIME type of the audio (e.g., audio/wav, audio/mp3, audio/ogg)",
@@ -487,8 +484,13 @@ class LlmCall(SQLModel, table=True):
 
     updated_at: datetime = Field(
         default_factory=now,
-        nullable=False,
-        sa_column_kwargs={"comment": "Timestamp when the LLM call was last updated"},
+        sa_column=sa.Column(
+            sa.DateTime,
+            default=now,
+            nullable=False,
+            onupdate=now,
+            comment="Timestamp when the LLM call was last updated",
+        ),
     )
 
     deleted_at: datetime | None = Field(

diff --git a/backend/app/models/llm/response.py b/backend/app/models/llm/response.py
@@ -3,7 +3,6 @@
 
 This module contains structured response models for LLM API calls.
 """
-
 from sqlmodel import SQLModel, Field
 from typing import Literal, Annotated
 from app.models.llm.request import AudioContent, TextContent
@@ -27,7 +26,7 @@ class AudioOutput(SQLModel):
 
 
 # Type alias for LLM output (discriminated union)
-LLMOutput = Annotated[TextOutput | AudioOutput | None, Field(discriminator="type")]
+LLMOutput = Annotated[TextOutput | AudioOutput, Field(discriminator="type")]
 
 
 class LLMResponse(SQLModel):
@@ -45,7 +44,7 @@ class LLMResponse(SQLModel):
     model: str = Field(
         ..., description="Model used by the provider (e.g., gpt-4-turbo)."
     )
-    output: LLMOutput = Field(
+    output: LLMOutput | None = Field(
         ...,
         description="Structured output containing text and optional additional data.",
     )

diff --git a/backend/app/services/llm/input_resolver.py b/backend/app/services/llm/input_resolver.py