Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
logs/
cache/
output/
output*/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
2 changes: 2 additions & 0 deletions env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Install the two pinned Python dependencies through uv's pip interface.
# Versions are fixed explicitly so the environment is reproducible.
uv pip install datasets==3.2.0
uv pip install anthropic==0.43.0
74 changes: 72 additions & 2 deletions lcb_runner/benchmarks/code_generation.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import json
import zlib
import pickle
Expand All @@ -8,6 +9,8 @@

from datasets import load_dataset

CODE_GENERATION_DATASET_PATH = os.getenv("CODE_GENERATION_DATASET_PATH", "livecodebench/code_generation_lite")
VERSION_FILES = None

class Platform(Enum):
LEETCODE = "leetcode"
Expand Down Expand Up @@ -121,8 +124,75 @@ def get_evaluation_sample(self):
}


def _version_file_name(idx):
    """Return the JSONL file name holding the problems of release number ``idx``."""
    # The first release file carries no numeric suffix.
    return "test.jsonl" if idx == 1 else f"test{idx}.jsonl"


def _build_version_files():
    """Build the mapping from every supported version tag to its list of files."""
    # Highest cumulative ``release_vN`` tag; ``release_latest`` aliases it.
    latest_release = 6
    # Highest individual ``vN`` tag.
    # NOTE(review): v7 is addressable on its own and in ranges, but there is no
    # ``release_v7`` and ``release_latest`` stops at v6 -- confirm this is intended.
    max_version = 7

    table = {}

    # Cumulative tags: ``release_vN`` covers files 1..N.
    for last in range(1, latest_release + 1):
        table[f"release_v{last}"] = [
            _version_file_name(idx) for idx in range(1, last + 1)
        ]
    table["release_latest"] = list(table[f"release_v{latest_release}"])

    # Single-version tags (``vN``) and inclusive ranges (``vA_vB`` with A < B).
    for first in range(1, max_version + 1):
        table[f"v{first}"] = [_version_file_name(first)]
        for last in range(first + 1, max_version + 1):
            table[f"v{first}_v{last}"] = [
                _version_file_name(idx) for idx in range(first, last + 1)
            ]

    return table


def get_code_generation_version_files(release_version):
    """Return the list of JSONL file names that make up ``release_version``.

    Supported tags are ``release_v1`` .. ``release_v6``, ``release_latest``,
    single versions ``v1`` .. ``v7`` and inclusive ranges such as ``v2_v4``.
    The lookup table is built on first use and cached in the module-level
    ``VERSION_FILES``.

    Args:
        release_version: Version tag to resolve.

    Returns:
        A new list of file names. For an unrecognised tag the one-file
        fallback ``["test.jsonl"]`` is returned.
    """
    global VERSION_FILES

    if VERSION_FILES is None:
        VERSION_FILES = _build_version_files()

    files = VERSION_FILES.get(release_version)
    if files is None:
        # Previously this path returned the bare string "test.jsonl" once the
        # cache was warm (so callers iterated over its characters) and raised
        # KeyError on the very first call. Always return the fallback as a list.
        return ["test.jsonl"]

    # Hand out a copy so callers cannot mutate the cached table.
    return list(files)


def load_code_generation_dataset(release_version="release_v1", start_date=None, end_date=None) -> list[CodeGenerationProblem]:
dataset = load_dataset("livecodebench/code_generation_lite", split="test", version_tag=release_version, trust_remote_code=True)
data_dir = CODE_GENERATION_DATASET_PATH
files = get_code_generation_version_files(release_version)

dataset = []
for fname in files:
fpath = os.path.join(data_dir, fname)
if not os.path.exists(fpath):
continue

with open(fpath, "r") as f:
for line in f:
dataset.append(json.loads(line))

dataset = [CodeGenerationProblem(**p) for p in dataset] # type: ignore
if start_date is not None:
p_start_date = datetime.strptime(start_date, "%Y-%m-%d")
Expand All @@ -137,7 +207,7 @@ def load_code_generation_dataset(release_version="release_v1", start_date=None,


def load_code_generation_dataset_not_fast(release_version="release_v1") -> list[CodeGenerationProblem]:
    """Load the ``test`` split from ``CODE_GENERATION_DATASET_PATH`` as problems.

    Each record returned by ``load_dataset`` is expanded into a
    ``CodeGenerationProblem`` and the number of loaded problems is printed.

    Args:
        release_version: Accepted for signature compatibility; this function
            does not use it.

    Returns:
        The list of constructed ``CodeGenerationProblem`` objects.
    """
    # The dataset is loaded exactly once. The earlier hard-coded
    # "livecodebench/code_generation" load was immediately overwritten by this
    # one, so it only cost an extra fetch.
    # NOTE(review): the configurable path defaults to the "lite" dataset --
    # confirm that is the intended source for the non-fast loader.
    records = load_dataset(CODE_GENERATION_DATASET_PATH, split="test")
    dataset = [CodeGenerationProblem(**p) for p in records]  # type: ignore
    print(f"Loaded {len(dataset)} problems")
    return dataset
Expand Down
7 changes: 7 additions & 0 deletions lcb_runner/lm_styles.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,13 @@ def to_dict(self) -> dict:
datetime(2024, 4, 1),
"https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
),
LanguageModel(
"deepseek-ai/DeepSeek-V4-Pro/preview-test",
"DeepSeek-V4-Pro",
LMStyle.DeepSeekAPI,
datetime(2024, 5, 27),
"https://cloud.siliconflow.cn/me/playground/chat/17885303197",
),
]

LanguageModelStore: dict[str, LanguageModel] = {
Expand Down
7 changes: 4 additions & 3 deletions lcb_runner/runner/base_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def run_single(combined_args) -> list[str]:
prompt: str | list[dict[str, str]]
cache: dict[str, str]
call_method: callable
prompt, cache, args, call_method = combined_args
prompt, idx, cache, args, call_method = combined_args

if isinstance(prompt, list):
prompt_cache = json.dumps(prompt)
Expand All @@ -59,7 +59,7 @@ def run_single(combined_args) -> list[str]:
if len(cache[prompt_cache]) == args.n:
return cache[prompt_cache]

result = call_method(prompt)
result = call_method(prompt, idx)
assert len(result) == args.n

return result
Expand All @@ -69,11 +69,12 @@ def run_batch(self, prompts: list[str | list[dict[str, str]]]) -> list[list[str]
arguments = [
(
prompt,
idx,
self.cache, ## pass the cache as argument for cache check
self.args, ## pass the args as argument for cache check
self._run_single, ## pass the _run_single method as argument because of multiprocessing
)
for prompt in prompts
for idx, prompt in enumerate(prompts)
]
if self.args.multiprocess > 1:
parallel_outputs = run_tasks_in_parallel(
Expand Down
Loading