Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scripts/code_hallucination/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
HALLUCINATION_RATIO = 0.4 # 40% hallucinated, 60% clean
MAX_FILE_CHARS = 12000 # Cap individual source file size
MAX_CONTEXT7_CHARS = 4000 # Documentation fetch limit
MAX_PROMPT_CHARS = 24000 # ~6K tokens, leaves room for answer within 8K model context

# === LLM Config ===
RETRY_DELAY = 2
Expand Down
2 changes: 1 addition & 1 deletion scripts/code_hallucination/context7_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def run(instances: list[dict]):

if processed % 100 == 0:
print(
f" Progress: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
f" Phase 4: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
)

print(
Expand Down
260 changes: 211 additions & 49 deletions scripts/code_hallucination/format_builder.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
"""Phase 5: Assign answer format to each instance."""
"""Phase 5: Assign answer format to each instance.

Supports both sequential (remote API) and async batch (local vLLM) modes.
Set BATCH_SIZE>1 env var for parallel requests to local vLLM.
"""

import asyncio
import json
import random
import textwrap
import time

from openai import OpenAI
from openai import AsyncOpenAI, OpenAI

from .config import (
API_BASE_URL,
API_KEY,
BATCH_SIZE,
FORMAT_TYPES,
FORMAT_WEIGHTS,
FORMATS_PATH,
Expand All @@ -26,27 +32,24 @@
that a developer would receive from an AI assistant.

Your response MUST:
- Start with a brief explanation (1-3 sentences) of what the issue is and how to fix it
- Start with 1-2 sentences explaining what was wrong and how to fix it
- Include the code in a properly formatted code block (```python)
- Optionally end with a short note about what changed or why
- Do NOT add anything after the code block

Your response must NOT:
- Include phrases like "Here's the fix" or "I'll help you with that" — just explain directly
- Be longer than necessary — keep it concise
- Include phrases like "Here's the fix" or "I'll help you with that"
- Be longer than 2 sentences of explanation + the code block
- Change the code in any way — use it exactly as provided
- Add any imports or code not in the original

Example style:
The issue is that `process_data` uses `dict.items()` instead of iterating
over the sorted keys, which causes non-deterministic output.
Example:
The `process_data` function uses `dict.items()` instead of iterating over sorted keys, causing non-deterministic output.

```python
def process_data(data):
for key in sorted(data.keys()):
yield key, data[key]
```

This ensures consistent ordering regardless of insertion order.
""")


Expand Down Expand Up @@ -75,7 +78,7 @@ def _generate_explanation(
{"role": "user", "content": user_msg},
],
temperature=LLM_TEMPERATURE,
max_tokens=2000,
max_tokens=200,
)
result = response.choices[0].message.content.strip()
# Verify the code is actually in the response
Expand All @@ -94,6 +97,47 @@ def _generate_explanation(
return None


async def _generate_explanation_async(
    aclient: AsyncOpenAI, model: str, code: str, query: str, context: str
) -> str | None:
    """Async counterpart of _generate_explanation, used by the batched path.

    Asks the model to wrap the given code fix in a natural assistant-style
    reply.  Retries up to MAX_RETRIES times, sleeping RETRY_DELAY * (attempt+1)
    seconds after an API error.  Returns the reply text, or None when no
    acceptable reply was produced.
    """
    prompt = f"""User's question: {query}

Context (relevant source code):
{context[:3000]}

Correct code fix:
```python
{code}
```

Write a natural AI assistant response that includes this exact code."""

    final_attempt = MAX_RETRIES - 1
    for attempt in range(MAX_RETRIES):
        try:
            completion = await aclient.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": EXPLANATION_SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=LLM_TEMPERATURE,
                max_tokens=200,
            )
            text = completion.choices[0].message.content.strip()
            # Accept the reply only if it appears to contain the provided code
            # (prefix match) or at least a fenced code block.
            if code[:50] in text or "```" in text:
                return text
            if attempt == final_attempt:
                return None
            # Unusable reply on a non-final attempt: retry immediately.
            continue
        except Exception:
            if attempt == final_attempt:
                return None
            # Linear backoff before the next attempt.
            await asyncio.sleep(RETRY_DELAY * (attempt + 1))
    return None


def assign_format(source_data: dict) -> tuple[str, str]:
"""Assign a format type and build the answer for an instance.

Expand Down Expand Up @@ -169,7 +213,8 @@ def run(
):
"""Run Phase 5: Assign formats and build answers.

Returns list of dicts with instance_id, format_type, answer.
Uses async batch processing when BATCH_SIZE > 1 (for local vLLM).
Falls back to sequential processing for remote APIs (BATCH_SIZE=1).
"""
print("=" * 60)
print("Phase 5: Answer Format Building")
Expand All @@ -180,70 +225,187 @@ def run(
if queries is None:
queries = {}

# Only init LLM client if we'll need it (lazy)
client = None

results = []
format_counts = {fmt: 0 for fmt in FORMAT_TYPES}
skipped = 0
explanation_failures = 0

for inst in instances:
# Load existing for resumability
existing = {}
if FORMATS_PATH.exists():
with open(FORMATS_PATH) as f:
for line in f:
try:
entry = json.loads(line)
existing[entry["instance_id"]] = entry
except (json.JSONDecodeError, KeyError):
continue
print(f"Already processed: {len(existing)} formats")

to_process = [inst for inst in instances if inst["instance_id"] not in existing]
print(f"Remaining: {len(to_process)} instances to process")
print(f"Batch size: {BATCH_SIZE}")

# First pass: assign formats for all instances (no LLM needed)
# Collect which ones need explanation generation
needs_explanation = [] # (instance_id, code, query, context)
entries_no_llm = [] # entries that don't need LLM

for inst in to_process:
instance_id = inst["instance_id"]

# Load source data from cache
cache_path = source_cache_dir / f"{instance_id}.json"
if not cache_path.exists():
skipped += 1
continue

with open(cache_path) as f:
source_data = json.load(f)
with open(cache_path) as fp:
source_data = json.load(fp)

fmt, answer = assign_format(source_data)
if fmt is None:
skipped += 1
continue

# Generate explanation wrapper for code_with_explanation format
if fmt == "code_with_explanation":
if client is None:
client = OpenAI(api_key=api_key, base_url=base_url)
print(f" LLM client initialized for code_with_explanation ({base_url})")

query = queries.get(instance_id, inst.get("problem_statement", "")[:500])
context = source_data.get("patch_code", "")
explained = _generate_explanation(client, model, answer, query, context)
needs_explanation.append((instance_id, answer, query, context, fmt))
else:
entries_no_llm.append(
{
"instance_id": instance_id,
"format_type": fmt,
"answer": answer,
}
)

# Write non-LLM entries immediately
results = list(existing.values())
format_counts = {fmt: 0 for fmt in FORMAT_TYPES}
for entry in results:
fmt = entry.get("format_type")
if fmt in format_counts:
format_counts[fmt] += 1

processed = 0
explanation_failures = 0

with open(FORMATS_PATH, "a") as f:
for entry in entries_no_llm:
f.write(json.dumps(entry) + "\n")
results.append(entry)
format_counts[entry["format_type"]] += 1
processed += 1
f.flush()

print(f" Assigned {len(entries_no_llm)} non-LLM formats")
print(f" Need LLM explanation: {len(needs_explanation)} instances")

# Second pass: generate explanations (batched or sequential)
if needs_explanation:
if BATCH_SIZE > 1:
explanation_failures = _run_explanations_batched(
needs_explanation, format_counts, results, api_key, base_url, model
)
else:
explanation_failures = _run_explanations_sequential(
needs_explanation, format_counts, results, api_key, base_url, model
)

processed += len(needs_explanation)

print(f"\nAssigned formats for {len(results)} instances")
if explanation_failures:
print(f" Explanation generation failures (fell back to fragment): {explanation_failures}")
for fmt, count in format_counts.items():
pct = count * 100 // max(len(results), 1)
print(f" {fmt}: {count} ({pct}%)")

return results


def _run_explanations_sequential(
    needs_explanation, format_counts, results, api_key, base_url, model
):
    """Generate explanations one request at a time (for remote APIs).

    Args:
        needs_explanation: list of (instance_id, code, query, context, fmt)
            tuples awaiting an LLM-written explanation.
        format_counts: per-format counters, updated in place.
        results: accumulated entry dicts, appended to in place.
        api_key / base_url / model: LLM connection settings.

    Returns:
        Number of instances whose explanation generation failed and which
        fell back to the plain "fragment" format.

    Side effects:
        Appends one JSON line per processed instance to FORMATS_PATH
        (append mode preserves resumability — earlier runs' entries stay).
    """
    client = OpenAI(api_key=api_key, base_url=base_url)
    explanation_failures = 0
    processed = 0

    with open(FORMATS_PATH, "a") as f:
        for instance_id, code, query, context, _ in needs_explanation:
            explained = _generate_explanation(client, model, code, query, context)

            if explained is None:
                # Fallback: use the raw code as a plain fragment answer.
                fmt = "fragment"
                answer = code
                explanation_failures += 1
            else:
                fmt = "code_with_explanation"
                answer = explained

            entry = {
                "instance_id": instance_id,
                "format_type": fmt,
                "answer": answer,
            }
            f.write(json.dumps(entry) + "\n")
            # Flush per entry so progress survives an interrupted run.
            f.flush()
            results.append(entry)
            format_counts[fmt] += 1
            processed += 1

            if processed % 100 == 0:
                print(
                    f"  Phase 5 (explanations): {processed}/{len(needs_explanation)} "
                    f"(failures: {explanation_failures})"
                )

    return explanation_failures


def _run_explanations_batched(needs_explanation, format_counts, results, api_key, base_url, model):
    """Generate explanations with async batching (for local vLLM).

    Fires BATCH_SIZE concurrent requests per batch, then writes each batch's
    entries to FORMATS_PATH (append mode) before starting the next batch.
    `format_counts` and `results` are mutated in place; returns the number of
    instances that fell back to the "fragment" format.
    """
    aclient = AsyncOpenAI(api_key=api_key, base_url=base_url)
    explanation_failures = 0
    processed = 0

    async def process_batches():
        # Counters live in the enclosing (sync) scope so the caller can
        # report them after asyncio.run() returns.
        nonlocal explanation_failures, processed

        with open(FORMATS_PATH, "a") as f:
            for batch_start in range(0, len(needs_explanation), BATCH_SIZE):
                batch = needs_explanation[batch_start : batch_start + BATCH_SIZE]

                tasks = []
                for instance_id, code, query, context, _ in batch:
                    tasks.append(_generate_explanation_async(aclient, model, code, query, context))

                # return_exceptions=True: one failed request must not abort
                # the whole batch — failures are handled per item below.
                batch_results = await asyncio.gather(*tasks, return_exceptions=True)

                for (instance_id, code, query, context, _), explained in zip(batch, batch_results):
                    if isinstance(explained, Exception) or explained is None:
                        # Fallback: emit the raw code as a plain fragment.
                        fmt = "fragment"
                        answer = code
                        explanation_failures += 1
                    else:
                        fmt = "code_with_explanation"
                        answer = explained

                    entry = {
                        "instance_id": instance_id,
                        "format_type": fmt,
                        "answer": answer,
                    }
                    f.write(json.dumps(entry) + "\n")
                    results.append(entry)
                    format_counts[fmt] += 1
                    processed += 1

                # Flush once per batch so progress survives interruption.
                f.flush()

                # Progress log every ~100 items and always on the final batch.
                if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(needs_explanation):
                    print(
                        f"  Phase 5 (explanations): {processed}/{len(needs_explanation)} "
                        f"(failures: {explanation_failures})"
                    )

    asyncio.run(process_batches())
    return explanation_failures


if __name__ == "__main__":
Expand Down
Loading