self-evolving-codegen/config.py at main · tathadn/self-evolving-codegen · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""Centralized configuration for model names, cost controls, and evolution settings.

All agent files must import their model name and max_tokens from here.
To override a model for a single run, set the env var before invoking:
    ORCHESTRATOR_MODEL=claude-opus-4-6 python run_evolution.py --experiment final_run
"""

from __future__ import annotations

import os

from dotenv import load_dotenv

load_dotenv()

# ── API credentials ────────────────────────────────────────────────────────────
# None when ANTHROPIC_API_KEY is absent from the environment.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")

# ── LangSmith tracing ──────────────────────────────────────────────────────────
# Tracing is opt-in: put LANGCHAIN_TRACING_V2=true and LANGCHAIN_API_KEY in .env.
# LangGraph runs are then reported under the LangSmith project named below.
_tracing_flag = os.environ.get("LANGCHAIN_TRACING_V2", "false")
# Only the literal word "true" (any letter case) switches tracing on.
LANGCHAIN_TRACING_V2 = _tracing_flag.lower() == "true"
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY", "")
LANGCHAIN_PROJECT = os.environ.get("LANGCHAIN_PROJECT", "self-evolving-codegen")

# ── Model assignments ──────────────────────────────────────────────────────────
# Each agent reads its model from an env var named after the constant and falls
# back to the default below. Development (Phases 4-5) uses Sonnet everywhere.
# For the final showcase run (Phase 6), override via env var — do NOT edit this file:
#   ORCHESTRATOR_MODEL=claude-opus-4-6 python run_evolution.py --experiment final_run
_SONNET_DEFAULT = "claude-sonnet-4-6"
_HAIKU_DEFAULT = "claude-haiku-4-5-20251001"

ORCHESTRATOR_MODEL = os.environ.get("ORCHESTRATOR_MODEL", _SONNET_DEFAULT)
PLANNER_MODEL = os.environ.get("PLANNER_MODEL", _SONNET_DEFAULT)
CODER_MODEL = os.environ.get("CODER_MODEL", _SONNET_DEFAULT)
REVIEWER_MODEL = os.environ.get("REVIEWER_MODEL", _SONNET_DEFAULT)
TESTER_MODEL = os.environ.get("TESTER_MODEL", _SONNET_DEFAULT)
ANALYZER_MODEL = os.environ.get("ANALYZER_MODEL", _SONNET_DEFAULT)
EVOLVER_MODEL = os.environ.get("EVOLVER_MODEL", _SONNET_DEFAULT)

# The evaluator should ALWAYS stay on Haiku: it runs on every task in every
# generation, and Sonnet here would cost ~12x more with minimal quality gain.
# NOTE: the env var can still override this default, so leave EVALUATOR_MODEL unset.
EVALUATOR_MODEL = os.environ.get("EVALUATOR_MODEL", _HAIKU_DEFAULT)

# ── Max tokens per component (controls output cost) ───────────────────────────
# Keyed by lowercase component name; each value is that component's output-token cap.
MAX_TOKENS: dict[str, int] = {
    "orchestrator": 1_500,
    "planner": 2_500,
    "coder": 4_000,
    "reviewer": 1_500,
    "tester": 4_000,
    # Evaluator output is structured JSON, so its budget is kept short.
    "evaluator": 1_500,
    "analyzer": 2_000,
    "evolver": 2_000,
}

# ── Evolution settings ─────────────────────────────────────────────────────────
DEFAULT_GENERATIONS: int = 10
DEFAULT_BATCH_SIZE: int = 5
# Upper bound on prompt size, counted in words, to prevent prompt bloat.
MAX_PROMPT_LENGTH: int = 1_000

# ── Cost controls ──────────────────────────────────────────────────────────────
# Revert when generation N+1 is more than 15% worse than generation N.
ROLLBACK_THRESHOLD: float = 0.15
# Pause, in seconds, between API calls (rate limit protection on Pro plan).
API_CALL_DELAY: float = 1.5
# NEVER set to False during development.
ENABLE_CACHE: bool = True
CACHE_DIR: str = ".cache/pipeline_runs"

# ── Scoring weights ────────────────────────────────────────────────────────────
# Relative weight of each metric in the score.
# false_failure_rate and redundancy_rate are inverted metrics: lower is better.
WEIGHTS: dict[str, float] = {
    "bug_detection_rate": 0.3,
    "false_failure_rate": 0.25,
    "coverage_quality": 0.2,
    "edge_case_coverage": 0.15,
    "redundancy_rate": 0.1,
}

# ── Sample tasks for evolution ─────────────────────────────────────────────────
# One natural-language task description per entry. Descriptions too long for one
# line are wrapped in parentheses so the adjacent string literals read as a single
# list element rather than looking like a missing comma.
SAMPLE_TASKS: list[str] = [
    (
        "A Python calculator that supports add, subtract, multiply, divide"
        " with error handling for division by zero"
    ),
    "A Python FastAPI server with /health and /echo POST endpoints",
    "A Python linked list implementation with insert, delete, search, and reverse methods",
    "A Python file-based todo list manager with add, remove, list, and mark-complete operations",
    (
        "A Python password validator that checks length, uppercase, lowercase,"
        " digits, and special characters"
    ),
    "A Python CSV parser that reads a file and computes column statistics (mean, median, min, max)",
    "A Python rate limiter class using the token bucket algorithm",
    "A Python LRU cache implementation with get and put operations",
    "A Python Markdown-to-HTML converter supporting headers, bold, italic, and links",
    "A Python binary search tree with insert, delete, search, and in-order traversal",
]