# plot_evaluation.py — from the CodeDocGen repository (Kartik-Burele/CodeDocGen, master branch)

import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np

# -------------- Load Data -----------------

def load_results(path: str = "evaluation_results.csv") -> pd.DataFrame:
    """Read the evaluation results CSV at ``path`` and return it as a DataFrame."""
    return pd.read_csv(path)

# -------------- Plot 1: Response Time vs File Size -----------------

def plot_response_time_vs_size(df: pd.DataFrame):
    """Scatter response time against file size and save the chart as a PNG.

    Reads the ``file_size_kb`` and ``response_time_s`` columns of ``df`` and
    writes ``response_time_vs_size.png`` to the current working directory.
    """
    fig, ax = plt.subplots()
    ax.scatter(df["file_size_kb"], df["response_time_s"])

    ax.set_title("Response Time vs File Size")
    ax.set_xlabel("File size (KB)")
    ax.set_ylabel("Response time (s)")
    ax.grid(True)

    fig.tight_layout()
    fig.savefig("response_time_vs_size.png")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# -------------- Plot 2: Hallucination Rate vs Identifiers -----------------

def plot_hallucination_vs_identifiers(df: pd.DataFrame):
    """Scatter hallucination rate against identifier count and save it as a PNG.

    Reads the ``num_identifiers`` and ``hallucination_rate`` columns of ``df``
    and writes ``hallucination_vs_identifiers.png`` to the current working
    directory.
    """
    fig, ax = plt.subplots()
    ax.scatter(df["num_identifiers"], df["hallucination_rate"])

    ax.set_title("Hallucination Rate vs Number of Identifiers")
    ax.set_xlabel("Number of identifiers in code")
    ax.set_ylabel("Hallucination rate")
    ax.grid(True)

    fig.tight_layout()
    fig.savefig("hallucination_vs_identifiers.png")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# -------------- Plot 3: Readability Score Histogram -----------------

def plot_readability_histogram(df: pd.DataFrame):
    """Plot a 5-bin histogram of readability scores and save it as a PNG.

    Reads the ``readability_score`` column of ``df`` and writes
    ``readability_histogram.png`` to the current working directory.
    """
    fig, ax = plt.subplots()
    ax.hist(df["readability_score"], bins=5)

    ax.set_title("Distribution of Readability Scores")
    ax.set_xlabel("Readability score (Flesch Reading Ease)")
    ax.set_ylabel("Number of files")
    ax.grid(True)

    fig.tight_layout()
    fig.savefig("readability_histogram.png")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# -------------- Plot 4: Radar Chart for LLM-as-Judge -----------------

def plot_llm_radar(df: pd.DataFrame, file_index: int = 0, max_score: float = 10.0):
    """
    Radar chart for one file (by row index) using LLM-as-judge metrics.

    Args:
        df: Evaluation results. The selected row must provide ``file_name``
            and the five ``llm_*`` columns listed in ``metrics`` below.
        file_index: Positional (``iloc``) index of the row to plot.
        max_score: Upper end of the scoring scale. It is used to invert the
            hallucination score (high = good) and as the fixed outer radius
            of the chart. Defaults to 10, the value the inversion used to
            hard-code. NOTE(review): assumes every metric shares this scale;
            values outside ``0..max_score`` fall outside the drawn range.

    Raises:
        IndexError: If ``file_index`` is out of range for ``df``.
        KeyError: If a required column is missing.

    Side effects:
        Writes ``llm_radar_<file_name>.png`` to the current working
        directory; ``/`` and ``\\`` in the file name are replaced by ``_``.
    """
    metrics = ["llm_accuracy", "llm_completeness", "llm_clarity", "llm_hallucination", "llm_style"]
    labels = ["Accuracy", "Completeness", "Clarity", "Low Hallucination", "Style"]

    row = df.iloc[file_index]

    # Hallucination is "lower is better", so flip it to match the other axes.
    scores = [
        (max_score - row[metric]) if metric == "llm_hallucination" else row[metric]
        for metric in metrics
    ]

    # One spoke per label, evenly spaced around the circle.
    angles = np.linspace(0, 2 * math.pi, len(labels), endpoint=False).tolist()
    # Repeat the first point so the plotted polygon closes on itself.
    scores.append(scores[0])
    angles.append(angles[0])

    fig, ax = plt.subplots(subplot_kw=dict(polar=True))
    ax.plot(angles, scores)
    ax.fill(angles, scores, alpha=0.1)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    # Pin the radial range: the tick labels are hidden, so an autoscaled axis
    # would make charts for different files impossible to compare.
    ax.set_ylim(0, max_score)
    ax.set_yticklabels([])

    file_name = row["file_name"]
    ax.set_title(f"LLM-as-Judge Scores – {file_name}")

    # str() guards against non-string names (e.g. numeric or missing values);
    # both separator styles are flattened so the PNG lands in the current folder.
    safe_name = str(file_name).replace("/", "_").replace("\\", "_")

    fig.tight_layout()
    fig.savefig(f"llm_radar_{safe_name}.png")
    # Close this specific figure rather than whichever one happens to be current.
    plt.close(fig)

# -------------- Main Runner -----------------

def main():
    """Load ``evaluation_results.csv`` and write every plot as a PNG file."""
    results = load_results("evaluation_results.csv")

    # Dataset-wide charts.
    plot_response_time_vs_size(results)
    plot_hallucination_vs_identifiers(results)
    plot_readability_histogram(results)

    # One radar chart per row of the results.
    row_count = len(results)
    for row_idx in range(row_count):
        plot_llm_radar(results, file_index=row_idx)

    print("Plots generated: PNG files in current folder.")

# Generate the plots only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()