siliconflow · leaves-zwx · Jun 1, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 logs/
 cache/
 output/
+output*/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/env.sh b/env.sh
@@ -0,0 +1,2 @@
+# Install the pinned versions of the `datasets` and `anthropic` packages with uv.
+uv pip install datasets==3.2.0
+uv pip install anthropic==0.43.0
diff --git a/lcb_runner/benchmarks/code_generation.py b/lcb_runner/benchmarks/code_generation.py
@@ -1,3 +1,4 @@
+import os
 import json
 import zlib
 import pickle
@@ -8,6 +9,8 @@
 
 from datasets import load_dataset
 
+CODE_GENERATION_DATASET_PATH = os.getenv("CODE_GENERATION_DATASET_PATH", "livecodebench/code_generation_lite")
+VERSION_FILES = None
 
 class Platform(Enum):
     LEETCODE = "leetcode"
@@ -121,8 +124,75 @@ def get_evaluation_sample(self):
         }
 
 
+def get_code_generation_version_files(release_version):
+    """Return the JSONL file names that make up ``release_version``.
+
+    The lookup table is built on first use and cached in the module-level
+    ``VERSION_FILES``. Recognised keys are ``release_v1`` .. ``release_v6``,
+    ``release_latest``, single splits ``v1`` .. ``v7`` and inclusive ranges
+    such as ``v2_v5``.
+
+    Args:
+        release_version: Version key to look up.
+
+    Returns:
+        A new list of file names, in split order. Unknown keys fall back to
+        ``["test.jsonl"]``; the result is always a list, never a bare string,
+        and the fallback applies on the first call as well as later ones.
+    """
+    global VERSION_FILES
+
+    if VERSION_FILES is None:
+        VERSION_FILES = _build_code_generation_version_files()
+
+    files = VERSION_FILES.get(release_version)
+    if files is None:
+        # Callers iterate over the result, so the fallback must be a list:
+        # iterating a plain string would yield single characters.
+        return ["test.jsonl"]
+    # Hand out a copy so callers cannot mutate the cached table.
+    return list(files)
+
+
+def _build_code_generation_version_files():
+    """Build the mapping from version key to its ordered list of JSONL file names."""
+
+    def split_file(idx):
+        # The first split has no numeric suffix; later ones are test2.jsonl, ...
+        return "test.jsonl" if idx == 1 else f"test{idx}.jsonl"
+
+    # Cumulative releases: release_vN covers splits 1..N.
+    table = {
+        f"release_v{n}": [split_file(i) for i in range(1, n + 1)]
+        for n in range(1, 7)
+    }
+    table["release_latest"] = list(table["release_v6"])
+
+    v_list = ["v1", "v2", "v3", "v4", "v5", "v6", "v7"]
+
+    # Single-split keys: "vN" maps to just the N-th file.
+    for idx, v in enumerate(v_list, start=1):
+        table[v] = [split_file(idx)]
+
+    # Inclusive range keys: "vA_vB" (A < B) maps to splits A..B.
+    for idx1 in range(1, len(v_list) + 1):
+        for idx2 in range(idx1 + 1, len(v_list) + 1):
+            key = v_list[idx1 - 1] + "_" + v_list[idx2 - 1]
+            table[key] = [split_file(idx) for idx in range(idx1, idx2 + 1)]
+
+    return table
+
+
 def load_code_generation_dataset(release_version="release_v1", start_date=None, end_date=None) -> list[CodeGenerationProblem]:
-    dataset = load_dataset("livecodebench/code_generation_lite", split="test", version_tag=release_version, trust_remote_code=True)
+    data_dir = CODE_GENERATION_DATASET_PATH
+    files = get_code_generation_version_files(release_version)
+
+    dataset = []
+    for fname in files:
+        fpath = os.path.join(data_dir, fname)
+        if not os.path.exists(fpath):
+            continue
+
+        with open(fpath, "r") as f:
+            for line in f:
+                dataset.append(json.loads(line))
+
     dataset = [CodeGenerationProblem(**p) for p in dataset]  # type: ignore
     if start_date is not None:
         p_start_date = datetime.strptime(start_date, "%Y-%m-%d")
@@ -137,7 +207,7 @@ def load_code_generation_dataset(release_version="release_v1", start_date=None,
 
 
 def load_code_generation_dataset_not_fast(release_version="release_v1") -> list[CodeGenerationProblem]:
+    """Load the "test" split via ``load_dataset`` and wrap each row in a CodeGenerationProblem.
+
+    ``release_version`` is accepted but not used by this function.
+    """
-    dataset = load_dataset("livecodebench/code_generation", split="test")
+    # NOTE(review): CODE_GENERATION_DATASET_PATH defaults to
+    # "livecodebench/code_generation_lite", so with the env var unset this now
+    # loads the lite dataset instead of "livecodebench/code_generation" --
+    # confirm that this is intended.
+    dataset = load_dataset(CODE_GENERATION_DATASET_PATH, split="test")
     dataset = [CodeGenerationProblem(**p) for p in dataset]  # type: ignore
     print(f"Loaded {len(dataset)} problems")
     return dataset

diff --git a/lcb_runner/lm_styles.py b/lcb_runner/lm_styles.py
@@ -861,6 +861,13 @@ def to_dict(self) -> dict:
         datetime(2024, 4, 1),
         "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
     ),
+    LanguageModel(
+        "deepseek-ai/DeepSeek-V4-Pro/preview-test",
+        "DeepSeek-V4-Pro",
+        LMStyle.DeepSeekAPI,
+        datetime(2024, 5, 27),
+        "https://cloud.siliconflow.cn/me/playground/chat/17885303197",
+    ),
 ]
 
 LanguageModelStore: dict[str, LanguageModel] = {

diff --git a/lcb_runner/runner/base_runner.py b/lcb_runner/runner/base_runner.py
@@ -46,7 +46,7 @@ def run_single(combined_args) -> list[str]:
         prompt: str | list[dict[str, str]]
         cache: dict[str, str]
         call_method: callable
-        prompt, cache, args, call_method = combined_args
+        prompt, idx, cache, args, call_method = combined_args
 
         if isinstance(prompt, list):
             prompt_cache = json.dumps(prompt)
@@ -59,7 +59,7 @@ def run_single(combined_args) -> list[str]:
             if len(cache[prompt_cache]) == args.n:
                 return cache[prompt_cache]
 
-        result = call_method(prompt)
+        result = call_method(prompt, idx)
         assert len(result) == args.n
 
         return result
@@ -69,11 +69,12 @@ def run_batch(self, prompts: list[str | list[dict[str, str]]]) -> list[list[str]
         arguments = [
             (
                 prompt,
+                idx,
                 self.cache,  ## pass the cache as argument for cache check
                 self.args,  ## pass the args as argument for cache check
                 self._run_single,  ## pass the _run_single method as argument because of multiprocessing
             )
-            for prompt in prompts
+            for idx, prompt in enumerate(prompts)
         ]
         if self.args.multiprocess > 1:
             parallel_outputs = run_tasks_in_parallel(
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		uv pip install datasets==3.2.0
		uv pip install anthropic==0.43.0