From 667619258ee663a34b8994477a0302b51ca9006e Mon Sep 17 00:00:00 2001 From: XinzeLi Date: Mon, 1 Jun 2026 15:39:41 +0800 Subject: [PATCH] optimize concurrent eval --- .gitignore | 1 + env.sh | 2 + lcb_runner/benchmarks/code_generation.py | 74 +++++++- lcb_runner/lm_styles.py | 7 + lcb_runner/runner/base_runner.py | 7 +- lcb_runner/runner/deepseek_runner.py | 221 ++++++++++++++++++++--- lcb_runner/runner/parser.py | 5 + lcb_runner/utils/multiprocess.py | 112 ++++-------- live_code_bench.sh | 27 +++ 9 files changed, 344 insertions(+), 112 deletions(-) create mode 100644 env.sh create mode 100644 live_code_bench.sh diff --git a/.gitignore b/.gitignore index dd68cc6c..b1d8f2de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ logs/ cache/ output/ +output*/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/env.sh b/env.sh new file mode 100644 index 00000000..eb1835ee --- /dev/null +++ b/env.sh @@ -0,0 +1,2 @@ +uv pip install datasets==3.2.0 +uv pip install anthropic==0.43.0 \ No newline at end of file diff --git a/lcb_runner/benchmarks/code_generation.py b/lcb_runner/benchmarks/code_generation.py index e2e48a73..6fb227aa 100644 --- a/lcb_runner/benchmarks/code_generation.py +++ b/lcb_runner/benchmarks/code_generation.py @@ -1,3 +1,4 @@ +import os import json import zlib import pickle @@ -8,6 +9,8 @@ from datasets import load_dataset +CODE_GENERATION_DATASET_PATH = os.getenv("CODE_GENERATION_DATASET_PATH", "livecodebench/code_generation_lite") +VERSION_FILES = None class Platform(Enum): LEETCODE = "leetcode" @@ -121,8 +124,75 @@ def get_evaluation_sample(self): } +def get_code_generation_version_files(release_version): + global VERSION_FILES + + if VERSION_FILES is not None: + if release_version in VERSION_FILES: + return VERSION_FILES[release_version] + else: + return "test.jsonl" + + VERSION_FILES = { + "release_v1": ["test.jsonl"], + "release_v2": ["test.jsonl", "test2.jsonl"], + "release_v3": ["test.jsonl", "test2.jsonl", "test3.jsonl"], +
"release_v4": ["test.jsonl", "test2.jsonl", "test3.jsonl", "test4.jsonl"], + "release_v5": [ + "test.jsonl", + "test2.jsonl", + "test3.jsonl", + "test4.jsonl", + "test5.jsonl", + ], + "release_v6": [ + "test.jsonl", + "test2.jsonl", + "test3.jsonl", + "test4.jsonl", + "test5.jsonl", + "test6.jsonl", + ], + "release_latest": [ + "test.jsonl", + "test2.jsonl", + "test3.jsonl", + "test4.jsonl", + "test5.jsonl", + "test6.jsonl", + ], + } + + v_list = ["v1", "v2", "v3", "v4", "v5", "v6", "v7"] + for v in v_list: + idx = int(v[1:]) + VERSION_FILES[v] = [f"test{idx}.jsonl" if idx != 1 else "test.jsonl"] + + for idx1 in range(1, len(v_list) + 1): + for idx2 in range(idx1 + 1, len(v_list) + 1): + key = v_list[idx1 - 1] + "_" + v_list[idx2 - 1] + VERSION_FILES[key] = [ + f"test{idx}.jsonl" if idx != 1 else "test.jsonl" + for idx in range(idx1, idx2 + 1) + ] + + return VERSION_FILES[release_version] + + def load_code_generation_dataset(release_version="release_v1", start_date=None, end_date=None) -> list[CodeGenerationProblem]: - dataset = load_dataset("livecodebench/code_generation_lite", split="test", version_tag=release_version, trust_remote_code=True) + data_dir = CODE_GENERATION_DATASET_PATH + files = get_code_generation_version_files(release_version) + + dataset = [] + for fname in files: + fpath = os.path.join(data_dir, fname) + if not os.path.exists(fpath): + continue + + with open(fpath, "r") as f: + for line in f: + dataset.append(json.loads(line)) + dataset = [CodeGenerationProblem(**p) for p in dataset] # type: ignore if start_date is not None: p_start_date = datetime.strptime(start_date, "%Y-%m-%d") @@ -137,7 +207,7 @@ def load_code_generation_dataset(release_version="release_v1", start_date=None, def load_code_generation_dataset_not_fast(release_version="release_v1") -> list[CodeGenerationProblem]: - dataset = load_dataset("livecodebench/code_generation", split="test") + dataset = load_dataset(CODE_GENERATION_DATASET_PATH, split="test") dataset = 
[CodeGenerationProblem(**p) for p in dataset] # type: ignore print(f"Loaded {len(dataset)} problems") return dataset diff --git a/lcb_runner/lm_styles.py b/lcb_runner/lm_styles.py index d10b2117..c3a18dfa 100644 --- a/lcb_runner/lm_styles.py +++ b/lcb_runner/lm_styles.py @@ -861,6 +861,13 @@ def to_dict(self) -> dict: datetime(2024, 4, 1), "https://huggingface.co/agentica-org/DeepCoder-14B-Preview", ), + LanguageModel( + "deepseek-ai/DeepSeek-V4-Pro/preview-test", + "DeepSeek-V4-Pro", + LMStyle.DeepSeekAPI, + datetime(2024, 5, 27), + "https://cloud.siliconflow.cn/me/playground/chat/17885303197", + ), ] LanguageModelStore: dict[str, LanguageModel] = { diff --git a/lcb_runner/runner/base_runner.py b/lcb_runner/runner/base_runner.py index d99c8243..9d235e13 100644 --- a/lcb_runner/runner/base_runner.py +++ b/lcb_runner/runner/base_runner.py @@ -46,7 +46,7 @@ def run_single(combined_args) -> list[str]: prompt: str | list[dict[str, str]] cache: dict[str, str] call_method: callable - prompt, cache, args, call_method = combined_args + prompt, idx, cache, args, call_method = combined_args if isinstance(prompt, list): prompt_cache = json.dumps(prompt) @@ -59,7 +59,7 @@ def run_single(combined_args) -> list[str]: if len(cache[prompt_cache]) == args.n: return cache[prompt_cache] - result = call_method(prompt) + result = call_method(prompt, idx) assert len(result) == args.n return result @@ -69,11 +69,12 @@ def run_batch(self, prompts: list[str | list[dict[str, str]]]) -> list[list[str] arguments = [ ( prompt, + idx, self.cache, ## pass the cache as argument for cache check self.args, ## pass the args as argument for cache check self._run_single, ## pass the _run_single method as argument because of multiprocessing ) - for prompt in prompts + for idx, prompt in enumerate(prompts) ] if self.args.multiprocess > 1: parallel_outputs = run_tasks_in_parallel( diff --git a/lcb_runner/runner/deepseek_runner.py b/lcb_runner/runner/deepseek_runner.py index 2f59ba64..bf45b2d1 100644 --- 
a/lcb_runner/runner/deepseek_runner.py +++ b/lcb_runner/runner/deepseek_runner.py @@ -1,5 +1,7 @@ import os -from time import sleep +import time +import numpy as np +import httpx try: import openai @@ -9,62 +11,221 @@ from lcb_runner.runner.base_runner import BaseRunner +DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API") or os.getenv("DEEPSEEK_API_KEY") +DEEPSEEK_API_URL = os.getenv("DEEPSEEK_API_URL") or "https://api.deepseek.com" +MAX_RETRIES = int(os.getenv("MAX_RETRIES", 3)) + class DeepSeekRunner(BaseRunner): - client = OpenAI( - api_key=os.getenv("DEEPSEEK_API"), - base_url="https://api.deepseek.com", - ) + client = None + + def _get_client(self): + if self.client is None: + if self.args.internal_auth: + self.client = OpenAI( + api_key="dummy-key", + base_url=DEEPSEEK_API_URL, + default_headers={"X-MaaS-Auth-Token": DEEPSEEK_API_KEY}, + ) + if "extra_body" in self.client_kwargs: + extra_body = self.client_kwargs["extra_body"] + thinking = extra_body.pop("thinking", {}) + thinking_type = thinking.get("type", "") + if thinking_type == "enabled": + extra_body["enable_thinking"] = True + elif thinking_type == "disabled": + extra_body["enable_thinking"] = False + + else: + self.client = OpenAI( + api_key=DEEPSEEK_API_KEY, + base_url=DEEPSEEK_API_URL, + ) + return self.client def __init__(self, args, model): super().__init__(args, model) + self.client_kwargs: dict[str | str] = { "model": args.model, "temperature": args.temperature, - "max_tokens": args.max_tokens, + "max_completion_tokens": args.max_tokens, "top_p": args.top_p, "frequency_penalty": 0, "presence_penalty": 0, "n": 1, - "timeout": args.openai_timeout, - # "stop": args.stop, --> stop is only used for base models currently + "timeout": httpx.Timeout( + connect=60.0, + read=float(args.openai_timeout), + write=60.0, + pool=60.0, + ), + "extra_body": { + "thinking": {"type": "enabled"}, # {"type": "enabled"} or {"type": "disabled"} + "reasoning_effort": "max", + }, } - def _run_single(self, prompt: 
list[dict[str, str]]) -> list[str]: + self.retryable_exceptions = ( + openai.APIError, + openai.RateLimitError, + openai.InternalServerError, + openai.OpenAIError, + openai.APIStatusError, + openai.APITimeoutError, + openai.APIConnectionError, + httpx.ReadTimeout, + httpx.ConnectTimeout, + httpx.PoolTimeout, + ) + + def _run_single(self, prompt: list[dict[str, str]], idx: int) -> list[str]: assert isinstance(prompt, list) + pid = os.getpid() + retry_sleep_seconds = 10 - def __run_single(counter): + def __run_single(retries_left): try: - response = self.client.chat.completions.create( + print( + f"[PID {pid}][Task {idx}] sending streaming request (retries_left={retries_left}/{MAX_RETRIES})...", + flush=True, + ) + + start_time = time.monotonic() + stream = self._get_client().chat.completions.create( messages=prompt, + stream=True, + stream_options={"include_usage": True}, **self.client_kwargs, ) - content = response.choices[0].message.content + + chunk_count = 0 + reasoning_content = "" + content = "" + first_token_time = None + complete_time = None + decode_times = None + prev_decode_time = None + prompt_tokens = None + completion_tokens = 0 + reasoning_tokens = 0 + thinking_begin = False + thinking_end = False + last_print_time = start_time + print_streaming_every_seconds = 60 + + for chunk in stream: + now = time.monotonic() + + if chunk.usage: + if prompt_tokens is None: + prompt_tokens = chunk.usage.prompt_tokens + + assert chunk.usage.completion_tokens >= completion_tokens + completion_tokens = chunk.usage.completion_tokens + + if ( + hasattr(chunk.usage, "completion_tokens_details") + and chunk.usage.completion_tokens_details + ): + details = chunk.usage.completion_tokens_details + assert details.reasoning_tokens >= reasoning_tokens + reasoning_tokens = details.reasoning_tokens + + if len(chunk.choices) == 0: + continue + + choice = chunk.choices[0] + + if choice.finish_reason is not None: + complete_time = now + continue + + delta = chunk.choices[0].delta + 
if hasattr(delta, "reasoning_content") and delta.reasoning_content: + reasoning_content += delta.reasoning_content + thinking_begin = True + elif delta.content: + content += delta.content + thinking_end = True + else: + continue + + if first_token_time is None: + first_token_time = now + prev_decode_time = now + decode_time = [] + decode_times = decode_time + + dt = now - prev_decode_time + prev_decode_time = now + decode_times.append(dt) + + elapsed = now - last_print_time + if ( + print_streaming_every_seconds > 0 + and elapsed >= print_streaming_every_seconds + ): + total_elapsed = now - start_time + avg = np.mean(decode_times) * 1000 + median = np.median(decode_times) * 1000 + print( + f"[PID {pid}][Task {idx}] streaming... " + f"| received chunks: {chunk_count} " + f"| reasoning_length: {len(reasoning_content)}, content_length: {len(content)} " + f"| elapsed: {total_elapsed:.3f} (s) " + f"| avg_time_per_chunk: {avg:.3f}, median_time_per_chunk: {median:.3f} (ms) ", + flush=True, + ) + last_print_time = now + decode_times = [] + + chunk_count += 1 + + end_time = time.monotonic() + total_duration = end_time - start_time + + ttft = 0.0 + tps = 0.0 + + if first_token_time is not None: + ttft = first_token_time - start_time + if complete_time is not None: + tps = completion_tokens / (complete_time - first_token_time) + + print( + f"[PID {pid}][Task {idx}] COMPLETED " + f"| Total duration: {total_duration:.2f} (s) " + f"| TTFT: {ttft:.3f} (s) " + f"| TPS: {tps:.2f} (tokens/s) " + f"| prompt_tokens: {prompt_tokens}, completion_tokens: {completion_tokens}, reasoning_tokens: {reasoning_tokens} " + f"| reasoning_length: {len(reasoning_content)}, content_length: {len(content)} " + f"| total_chunks: {chunk_count} " + f"| content_preview: {content[:200]}", + flush=True, + ) + return content - except ( - openai.APIError, - openai.RateLimitError, - openai.InternalServerError, - openai.OpenAIError, - openai.APIStatusError, - openai.APITimeoutError, - 
openai.InternalServerError, - openai.APIConnectionError, - ) as e: - print("Exception: ", repr(e)) - print("Sleeping for 30 seconds...") - print("Consider reducing the number of parallel processes.") - sleep(30) - return DeepSeekRunner._run_single(prompt) + + except self.retryable_exceptions as e: + if retries_left <= 0: + print(f"[PID {pid}][Task {idx}] Max retries exhausted, giving up.", flush=True) + raise + + print(f"[PID {pid}][Task {idx}] Exception: ", repr(e), flush=True) + print(f"[PID {pid}][Task {idx}] Sleeping for {retry_sleep_seconds} seconds...", flush=True) + print(f"[PID {pid}][Task {idx}] Consider reducing the number of parallel processes.", flush=True) + time.sleep(retry_sleep_seconds) + return __run_single(retries_left - 1) except Exception as e: - print(f"Failed to run the model for {prompt}!") - print("Exception: ", repr(e)) + print(f"[PID {pid}][Task {idx}] Failed to run the model for {prompt}!", flush=True) + print(f"[PID {pid}][Task {idx}] Exception: ", repr(e), flush=True) raise e outputs = [] try: for _ in range(self.args.n): - outputs.append(__run_single(10)) + outputs.append(__run_single(MAX_RETRIES)) except Exception as e: raise e return outputs diff --git a/lcb_runner/runner/parser.py b/lcb_runner/runner/parser.py index a047fc0d..5a37bd30 100644 --- a/lcb_runner/runner/parser.py +++ b/lcb_runner/runner/parser.py @@ -131,6 +131,11 @@ def get_args(): default=None, help="End date for the contest to filter the evaluation file (format - YYYY-MM-DD)", ) + parser.add_argument( + "--internal_auth", + action="store_true", + help="Whether to use internal auth for OpenAI requests", + ) args = parser.parse_args() diff --git a/lcb_runner/utils/multiprocess.py b/lcb_runner/utils/multiprocess.py index 835c9c84..1656e3b0 100644 --- a/lcb_runner/utils/multiprocess.py +++ b/lcb_runner/utils/multiprocess.py @@ -1,4 +1,5 @@ """ Utilities for running functions in parallel processes. 
""" + import sys import resource import multiprocessing as mp @@ -6,11 +7,10 @@ import traceback from enum import Enum from typing import Callable, Optional, Dict, Any, List, Iterator -from concurrent.futures import TimeoutError +from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError import attrs import tqdm -from pebble import concurrent, ProcessPool, ProcessExpired class FuncTimeoutError(TimeoutError): @@ -37,29 +37,17 @@ def run_func_in_process( _use_spawn: bool = True, **kwargs, ): - """ - Runs the provided function in a separate process with the supplied args - and kwargs. The args, kwargs, and - return values must all be pickle-able. - Args: - func: The function to run. - *args: Positional args, if any. - _timeout: A timeout to use for the function. - _use_spawn: The 'spawn' multiprocess context is used.'fork' otherwise. - **kwargs: Keyword args, if any. - Returns: - The result of executing the function. - """ mode = "spawn" if _use_spawn else "fork" - c_func = concurrent.process(timeout=_timeout, context=mp.get_context(mode))(func) - future = c_func(*args, **kwargs) - - try: - result = future.result() - return result - - except TimeoutError: - raise FuncTimeoutError + with ProcessPoolExecutor( + max_workers=1, + mp_context=mp.get_context(mode), + ) as pool: + future = pool.submit(func, *args, **kwargs) + try: + result = future.result(timeout=_timeout) + return result + except TimeoutError: + raise FuncTimeoutError class TaskRunStatus(Enum): @@ -75,6 +63,7 @@ class TaskResult: result: Optional[Any] = None exception_tb: Optional[str] = None + index: int = -1 def is_success(self) -> bool: return self.status == TaskRunStatus.SUCCESS @@ -106,32 +95,17 @@ def run_tasks_in_parallel_iter( use_spawn: bool = True, max_mem: int = 1024 * 1024 * 1024 * 4, ) -> Iterator[TaskResult]: - """ - Args: - func: The function to run. The function must accept a single argument. - tasks: A list of tasks i.e. arguments to func. 
- num_workers: Maximum number of parallel workers. - timeout_per_task: The timeout, in seconds, to use per task. - use_progress_bar: Whether to use a progress bar. Default False. - progress_bar_desc: String to display in the progress bar. Default None. - max_tasks_per_worker: Maximum number of tasks assigned - to a single process / worker. None means infinite. - Use 1 to force a restart. - use_spawn: The 'spawn' multiprocess context is used. 'fork' otherwise. - Returns: - A list of TaskResult objects, one per task. - """ - mode = "spawn" if use_spawn else "fork" - with ProcessPool( + with ProcessPoolExecutor( max_workers=num_workers, - max_tasks=0 if max_tasks_per_worker is None else max_tasks_per_worker, - context=mp.get_context(mode), + mp_context=mp.get_context(mode), ) as pool: - future = pool.map(func, tasks, timeout=timeout_per_task) + future_to_idx = {} + for idx, task in enumerate(tasks): + future = pool.submit(func, task) + future_to_idx[future] = idx - iterator = future.result() if use_progress_bar: pbar = tqdm.tqdm( desc=progress_bar_desc, @@ -142,50 +116,33 @@ def run_tasks_in_parallel_iter( else: pbar = None - succ = timeouts = exceptions = expirations = 0 + succ = timeouts = exceptions = 0 - while True: + for future in as_completed(future_to_idx, timeout=timeout_per_task): + idx = future_to_idx[future] try: - result = next(iterator) - - except StopIteration: - break - - except TimeoutError as error: + result = future.result() yield TaskResult( - status=TaskRunStatus.TIMEOUT, + status=TaskRunStatus.SUCCESS, + result=result, + index=idx, ) - + succ += 1 + except TimeoutError: + yield TaskResult(status=TaskRunStatus.TIMEOUT, index=idx) timeouts += 1 - - except ProcessExpired as error: - yield TaskResult( - status=TaskRunStatus.PROCESS_EXPIRED, - ) - expirations += 1 - - except Exception as error: + except Exception: exception_tb = traceback.format_exc() - yield TaskResult( status=TaskRunStatus.EXCEPTION, exception_tb=exception_tb, + index=idx, ) 
exceptions += 1 - else: - yield TaskResult( - status=TaskRunStatus.SUCCESS, - result=result, - ) - - succ += 1 - if pbar is not None: pbar.update(1) - pbar.set_postfix( - succ=succ, timeouts=timeouts, exc=exceptions, p_exp=expirations - ) + pbar.set_postfix(succ=succ, timeouts=timeouts, exc=exceptions, p_exp=0) sys.stdout.flush() sys.stderr.flush() @@ -216,7 +173,7 @@ def run_tasks_in_parallel( A list of TaskResult objects, one per task. """ - task_results: List[TaskResult] = list( + task_results: List[TaskResult] = sorted( run_tasks_in_parallel_iter( func=func, tasks=tasks, @@ -226,7 +183,8 @@ def run_tasks_in_parallel( progress_bar_desc=progress_bar_desc, max_tasks_per_worker=max_tasks_per_worker, use_spawn=use_spawn, - ) + ), + key=lambda r: r.index, ) return task_results diff --git a/live_code_bench.sh b/live_code_bench.sh new file mode 100644 index 00000000..f241d1bb --- /dev/null +++ b/live_code_bench.sh @@ -0,0 +1,27 @@ +source env.sh + +# export HF_DATASETS_CACHE="/data1/datasets" +# export HF_DATASETS_OFFLINE=1 + +export CODE_GENERATION_DATASET_PATH="/data1/datasets/livecodebench/code_generation_lite" + +export DEEPSEEK_API_URL="https://api-cs.siliconflow.cn" +export DEEPSEEK_API_KEY="sk-xxxx" + +MODEL="deepseek-ai/DeepSeek-V4-Pro/preview-test" + +python -m lcb_runner.runner.main \ + --model ${MODEL} \ + --scenario codegeneration \ + --release_version v7 \ + --evaluate \ + --n 1 \ + --temperature 1.0 \ + --top_p 1.0 \ + --max_tokens 393216 \ + --multiprocess 4 \ + --num_process_evaluate 2 \ + --openai_timeout 3600 \ + --internal_auth \ + + # --debug \ \ No newline at end of file