Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scripts/code_hallucination/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
HALLUCINATION_RATIO = 0.4 # 40% hallucinated, 60% clean
MAX_FILE_CHARS = 12000 # Cap individual source file size
MAX_CONTEXT7_CHARS = 4000 # Documentation fetch limit
MAX_PROMPT_CHARS = 24000 # ~6K tokens, leaves room for answer within 8K model context

# === LLM Config ===
RETRY_DELAY = 2
Expand Down
2 changes: 1 addition & 1 deletion scripts/code_hallucination/context7_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def run(instances: list[dict]):

if processed % 100 == 0:
print(
f" Progress: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
f" Phase 4: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
)

print(
Expand Down
260 changes: 211 additions & 49 deletions scripts/code_hallucination/format_builder.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
"""Phase 5: Assign answer format to each instance."""
"""Phase 5: Assign answer format to each instance.

Supports both sequential (remote API) and async batch (local vLLM) modes.
Set BATCH_SIZE>1 env var for parallel requests to local vLLM.
"""

import asyncio
import json
import random
import textwrap
import time

from openai import OpenAI
from openai import AsyncOpenAI, OpenAI

from .config import (
API_BASE_URL,
API_KEY,
BATCH_SIZE,
FORMAT_TYPES,
FORMAT_WEIGHTS,
FORMATS_PATH,
Expand All @@ -26,27 +32,24 @@
that a developer would receive from an AI assistant.

Your response MUST:
- Start with a brief explanation (1-3 sentences) of what the issue is and how to fix it
- Start with 1-2 sentences explaining what was wrong and how to fix it
- Include the code in a properly formatted code block (```python)
- Optionally end with a short note about what changed or why
- Do NOT add anything after the code block

Your response must NOT:
- Include phrases like "Here's the fix" or "I'll help you with that" — just explain directly
- Be longer than necessary — keep it concise
- Include phrases like "Here's the fix" or "I'll help you with that"
- Be longer than 2 sentences of explanation + the code block
- Change the code in any way — use it exactly as provided
- Add any imports or code not in the original

Example style:
The issue is that `process_data` uses `dict.items()` instead of iterating
over the sorted keys, which causes non-deterministic output.
Example:
The `process_data` function uses `dict.items()` instead of iterating over sorted keys, causing non-deterministic output.

```python
def process_data(data):
for key in sorted(data.keys()):
yield key, data[key]
```

This ensures consistent ordering regardless of insertion order.
""")


Expand Down Expand Up @@ -75,7 +78,7 @@ def _generate_explanation(
{"role": "user", "content": user_msg},
],
temperature=LLM_TEMPERATURE,
max_tokens=2000,
max_tokens=200,
)
result = response.choices[0].message.content.strip()
# Verify the code is actually in the response
Expand All @@ -94,6 +97,47 @@ def _generate_explanation(
return None


async def _generate_explanation_async(
    aclient: AsyncOpenAI, model: str, code: str, query: str, context: str
) -> str | None:
    """Async counterpart of _generate_explanation, used by the batched path.

    Asks the model to wrap the given code fix in a natural assistant-style
    reply.  Retries up to MAX_RETRIES times, sleeping RETRY_DELAY * (attempt+1)
    seconds after an API error.  Returns the reply text, or None when no
    acceptable reply was produced.
    """
    prompt = f"""User's question: {query}

Context (relevant source code):
{context[:3000]}

Correct code fix:
```python
{code}
```

Write a natural AI assistant response that includes this exact code."""

    final_attempt = MAX_RETRIES - 1
    for attempt in range(MAX_RETRIES):
        try:
            completion = await aclient.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": EXPLANATION_SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=LLM_TEMPERATURE,
                max_tokens=200,
            )
            text = completion.choices[0].message.content.strip()
            # Accept the reply only if it appears to contain the provided code
            # (prefix match) or at least a fenced code block.
            if code[:50] in text or "```" in text:
                return text
            if attempt == final_attempt:
                return None
            # Unusable reply on a non-final attempt: retry immediately.
            continue
        except Exception:
            if attempt == final_attempt:
                return None
            # Linear backoff before the next attempt.
            await asyncio.sleep(RETRY_DELAY * (attempt + 1))
    return None


def assign_format(source_data: dict) -> tuple[str, str]:
"""Assign a format type and build the answer for an instance.

Expand Down Expand Up @@ -169,7 +213,8 @@ def run(
):
"""Run Phase 5: Assign formats and build answers.

Returns list of dicts with instance_id, format_type, answer.
Uses async batch processing when BATCH_SIZE > 1 (for local vLLM).
Falls back to sequential processing for remote APIs (BATCH_SIZE=1).
"""
print("=" * 60)
print("Phase 5: Answer Format Building")
Expand All @@ -180,70 +225,187 @@ def run(
if queries is None:
queries = {}

# Only init LLM client if we'll need it (lazy)
client = None

results = []
format_counts = {fmt: 0 for fmt in FORMAT_TYPES}
skipped = 0
explanation_failures = 0

for inst in instances:
# Load existing for resumability
existing = {}
if FORMATS_PATH.exists():
with open(FORMATS_PATH) as f:
for line in f:
try:
entry = json.loads(line)
existing[entry["instance_id"]] = entry
except (json.JSONDecodeError, KeyError):
continue
print(f"Already processed: {len(existing)} formats")

to_process = [inst for inst in instances if inst["instance_id"] not in existing]
print(f"Remaining: {len(to_process)} instances to process")
print(f"Batch size: {BATCH_SIZE}")

# First pass: assign formats for all instances (no LLM needed)
# Collect which ones need explanation generation
needs_explanation = [] # (instance_id, code, query, context)
entries_no_llm = [] # entries that don't need LLM

for inst in to_process:
instance_id = inst["instance_id"]

# Load source data from cache
cache_path = source_cache_dir / f"{instance_id}.json"
if not cache_path.exists():
skipped += 1
continue

with open(cache_path) as f:
source_data = json.load(f)
with open(cache_path) as fp:
source_data = json.load(fp)

fmt, answer = assign_format(source_data)
if fmt is None:
skipped += 1
continue

# Generate explanation wrapper for code_with_explanation format
if fmt == "code_with_explanation":
if client is None:
client = OpenAI(api_key=api_key, base_url=base_url)
print(f" LLM client initialized for code_with_explanation ({base_url})")

query = queries.get(instance_id, inst.get("problem_statement", "")[:500])
context = source_data.get("patch_code", "")
explained = _generate_explanation(client, model, answer, query, context)
needs_explanation.append((instance_id, answer, query, context, fmt))
else:
entries_no_llm.append(
{
"instance_id": instance_id,
"format_type": fmt,
"answer": answer,
}
)

# Write non-LLM entries immediately
results = list(existing.values())
format_counts = {fmt: 0 for fmt in FORMAT_TYPES}
for entry in results:
fmt = entry.get("format_type")
if fmt in format_counts:
format_counts[fmt] += 1

processed = 0
explanation_failures = 0

with open(FORMATS_PATH, "a") as f:
for entry in entries_no_llm:
f.write(json.dumps(entry) + "\n")
results.append(entry)
format_counts[entry["format_type"]] += 1
processed += 1
f.flush()

print(f" Assigned {len(entries_no_llm)} non-LLM formats")
print(f" Need LLM explanation: {len(needs_explanation)} instances")

# Second pass: generate explanations (batched or sequential)
if needs_explanation:
if BATCH_SIZE > 1:
explanation_failures = _run_explanations_batched(
needs_explanation, format_counts, results, api_key, base_url, model
)
else:
explanation_failures = _run_explanations_sequential(
needs_explanation, format_counts, results, api_key, base_url, model
)

processed += len(needs_explanation)

print(f"\nAssigned formats for {len(results)} instances")
if explanation_failures:
print(f" Explanation generation failures (fell back to fragment): {explanation_failures}")
for fmt, count in format_counts.items():
pct = count * 100 // max(len(results), 1)
print(f" {fmt}: {count} ({pct}%)")

return results


def _run_explanations_sequential(
    needs_explanation, format_counts, results, api_key, base_url, model
):
    """Generate explanations one request at a time (for remote APIs).

    Args:
        needs_explanation: list of (instance_id, code, query, context, fmt)
            tuples awaiting an LLM-written explanation.
        format_counts: per-format counters, updated in place.
        results: accumulated entry dicts, appended to in place.
        api_key / base_url / model: LLM connection settings.

    Returns:
        Number of instances whose explanation generation failed and which
        fell back to the plain "fragment" format.

    Side effects:
        Appends one JSON line per processed instance to FORMATS_PATH
        (append mode preserves resumability — earlier runs' entries stay).
    """
    client = OpenAI(api_key=api_key, base_url=base_url)
    explanation_failures = 0
    processed = 0

    with open(FORMATS_PATH, "a") as f:
        for instance_id, code, query, context, _ in needs_explanation:
            explained = _generate_explanation(client, model, code, query, context)

            if explained is None:
                # Fallback: use the raw code as a plain fragment answer.
                fmt = "fragment"
                answer = code
                explanation_failures += 1
            else:
                fmt = "code_with_explanation"
                answer = explained

            entry = {
                "instance_id": instance_id,
                "format_type": fmt,
                "answer": answer,
            }
            f.write(json.dumps(entry) + "\n")
            # Flush per entry so progress survives an interrupted run.
            f.flush()
            results.append(entry)
            format_counts[fmt] += 1
            processed += 1

            if processed % 100 == 0:
                print(
                    f"  Phase 5 (explanations): {processed}/{len(needs_explanation)} "
                    f"(failures: {explanation_failures})"
                )

    return explanation_failures


def _run_explanations_batched(needs_explanation, format_counts, results, api_key, base_url, model):
    """Generate explanations with async batching (for local vLLM).

    Fires BATCH_SIZE concurrent requests per batch, then writes each batch's
    entries to FORMATS_PATH (append mode) before starting the next batch.
    `format_counts` and `results` are mutated in place; returns the number of
    instances that fell back to the "fragment" format.
    """
    aclient = AsyncOpenAI(api_key=api_key, base_url=base_url)
    explanation_failures = 0
    processed = 0

    async def process_batches():
        # Counters live in the enclosing (sync) scope so the caller can
        # report them after asyncio.run() returns.
        nonlocal explanation_failures, processed

        with open(FORMATS_PATH, "a") as f:
            for batch_start in range(0, len(needs_explanation), BATCH_SIZE):
                batch = needs_explanation[batch_start : batch_start + BATCH_SIZE]

                tasks = []
                for instance_id, code, query, context, _ in batch:
                    tasks.append(_generate_explanation_async(aclient, model, code, query, context))

                # return_exceptions=True: one failed request must not abort
                # the whole batch — failures are handled per item below.
                batch_results = await asyncio.gather(*tasks, return_exceptions=True)

                for (instance_id, code, query, context, _), explained in zip(batch, batch_results):
                    if isinstance(explained, Exception) or explained is None:
                        # Fallback: emit the raw code as a plain fragment.
                        fmt = "fragment"
                        answer = code
                        explanation_failures += 1
                    else:
                        fmt = "code_with_explanation"
                        answer = explained

                    entry = {
                        "instance_id": instance_id,
                        "format_type": fmt,
                        "answer": answer,
                    }
                    f.write(json.dumps(entry) + "\n")
                    results.append(entry)
                    format_counts[fmt] += 1
                    processed += 1

                # Flush once per batch so progress survives interruption.
                f.flush()

                # Progress log every ~100 items and always on the final batch.
                if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(needs_explanation):
                    print(
                        f"  Phase 5 (explanations): {processed}/{len(needs_explanation)} "
                        f"(failures: {explanation_failures})"
                    )

    asyncio.run(process_batches())
    return explanation_failures


if __name__ == "__main__":
Expand Down
Loading