-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplot_evaluation.py
More file actions
105 lines (86 loc) · 3.23 KB
/
Copy pathplot_evaluation.py
File metadata and controls
105 lines (86 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
# -------------- Load Data -----------------
def load_results(path: str = "evaluation_results.csv") -> pd.DataFrame:
    """Read the evaluation results CSV at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to
            ``evaluation_results.csv``, resolved against the current
            working directory.

    Returns:
        The parsed file, exactly as produced by ``pandas.read_csv``.
    """
    return pd.read_csv(path)
# -------------- Plot 1: Response Time vs File Size -----------------
def plot_response_time_vs_size(
    df: pd.DataFrame,
    output_path: str = "response_time_vs_size.png",
) -> None:
    """Save a scatter plot of response time against file size.

    Args:
        df: Evaluation results with ``file_size_kb`` and ``response_time_s``
            columns.
        output_path: Where to write the image. Defaults to
            ``response_time_vs_size.png`` in the current working directory.

    Raises:
        KeyError: If either required column is missing from ``df``.
    """
    fig = plt.figure()
    try:
        plt.scatter(df["file_size_kb"], df["response_time_s"])
        plt.xlabel("File size (KB)")
        plt.ylabel("Response time (s)")
        plt.title("Response Time vs File Size")
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Close this specific figure even when a column lookup or the save
        # fails, so an error does not leave an open figure behind in
        # pyplot's global state.
        plt.close(fig)
# -------------- Plot 2: Hallucination Rate vs Identifiers -----------------
def plot_hallucination_vs_identifiers(
    df: pd.DataFrame,
    output_path: str = "hallucination_vs_identifiers.png",
) -> None:
    """Save a scatter plot of hallucination rate against identifier count.

    Args:
        df: Evaluation results with ``num_identifiers`` and
            ``hallucination_rate`` columns.
        output_path: Where to write the image. Defaults to
            ``hallucination_vs_identifiers.png`` in the current working
            directory.

    Raises:
        KeyError: If either required column is missing from ``df``.
    """
    fig = plt.figure()
    try:
        plt.scatter(df["num_identifiers"], df["hallucination_rate"])
        plt.xlabel("Number of identifiers in code")
        plt.ylabel("Hallucination rate")
        plt.title("Hallucination Rate vs Number of Identifiers")
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Close this specific figure even when a column lookup or the save
        # fails, so an error does not leave an open figure behind in
        # pyplot's global state.
        plt.close(fig)
# -------------- Plot 3: Readability Score Histogram -----------------
def plot_readability_histogram(
    df: pd.DataFrame,
    bins: int = 5,
    output_path: str = "readability_histogram.png",
) -> None:
    """Save a histogram of the per-file readability scores.

    Args:
        df: Evaluation results with a ``readability_score`` column (labelled
            on the x-axis as Flesch Reading Ease).
        bins: Number of histogram bins, passed through to ``plt.hist``.
            Defaults to 5.
        output_path: Where to write the image. Defaults to
            ``readability_histogram.png`` in the current working directory.

    Raises:
        KeyError: If ``readability_score`` is missing from ``df``.
    """
    fig = plt.figure()
    try:
        plt.hist(df["readability_score"], bins=bins)
        plt.xlabel("Readability score (Flesch Reading Ease)")
        plt.ylabel("Number of files")
        plt.title("Distribution of Readability Scores")
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Close this specific figure even when the column lookup or the save
        # fails, so an error does not leave an open figure behind in
        # pyplot's global state.
        plt.close(fig)
# -------------- Plot 4: Radar Chart for LLM-as-Judge -----------------
def plot_llm_radar(df: pd.DataFrame, file_index: int = 0) -> None:
    """Save a radar chart of the LLM-as-judge scores for one row of ``df``.

    The row at position ``file_index`` supplies five scores
    (``llm_accuracy``, ``llm_completeness``, ``llm_clarity``,
    ``llm_hallucination``, ``llm_style``) and a ``file_name``. The
    hallucination score is plotted as ``10 - value`` so that, like the other
    axes, a larger radius means a better result. The chart is written to
    ``llm_radar_<file_name>.png`` in the current working directory, with
    path separators in ``file_name`` replaced by underscores.

    Args:
        df: Evaluation results containing the columns named above.
        file_index: Positional (``iloc``) index of the row to plot.

    Raises:
        IndexError: If ``file_index`` is out of bounds for ``df``.
        KeyError: If a required column is missing.
    """
    # NOTE(review): a 0-10 judge scale is implied by the "10 - x" inversion
    # of the hallucination score; confirm against the evaluation pipeline.
    max_score = 10

    # One entry per spoke: (column, axis label, invert so that high = good).
    spokes = [
        ("llm_accuracy", "Accuracy", False),
        ("llm_completeness", "Completeness", False),
        ("llm_clarity", "Clarity", False),
        ("llm_hallucination", "Low Hallucination", True),
        ("llm_style", "Style", False),
    ]

    row = df.iloc[file_index]
    labels = [label for _, label, _ in spokes]
    scores = [
        max_score - row[column] if invert else row[column]
        for column, _, invert in spokes
    ]

    angles = np.linspace(0, 2 * math.pi, len(spokes), endpoint=False).tolist()
    # Repeat the first point at the end so the outline closes into a polygon.
    closed_angles = angles + angles[:1]
    closed_scores = scores + scores[:1]

    # str() guards against non-string names (e.g. numeric or missing values);
    # both separator styles are replaced so the name cannot point into
    # another directory.
    file_name = str(row["file_name"])
    safe_name = file_name.replace("/", "_").replace("\\", "_")

    fig, ax = plt.subplots(subplot_kw=dict(polar=True))
    try:
        ax.plot(closed_angles, closed_scores)
        ax.fill(closed_angles, closed_scores, alpha=0.1)
        ax.set_xticks(angles)
        ax.set_xticklabels(labels)
        # Radial tick labels are hidden, so pin the radial range: otherwise
        # each file's chart is autoscaled and shapes cannot be compared.
        ax.set_ylim(0, max_score)
        ax.set_yticklabels([])
        ax.set_title(f"LLM-as-Judge Scores – {file_name}")
        fig.tight_layout()
        fig.savefig(f"llm_radar_{safe_name}.png")
    finally:
        # Close this specific figure even on failure; this function is called
        # once per row, so leaked figures would otherwise accumulate.
        plt.close(fig)
# -------------- Main Runner -----------------
def main():
    """Load the evaluation CSV and generate every plot as a PNG file."""
    results = load_results("evaluation_results.csv")

    # Dataset-wide plots, produced in a fixed order.
    dataset_plots = (
        plot_response_time_vs_size,
        plot_hallucination_vs_identifiers,
        plot_readability_histogram,
    )
    for make_plot in dataset_plots:
        make_plot(results)

    # One radar chart per row of the results table.
    for position in range(results.shape[0]):
        plot_llm_radar(results, file_index=position)

    print("Plots generated: PNG files in current folder.")


# Run the full pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()