13 changes: 13 additions & 0 deletions benchmarks/swebenchmultilingual/Dockerfile.swebench-deps
@@ -0,0 +1,13 @@
ARG SDK_IMAGE
FROM ${SDK_IMAGE}

USER root

# Ensure the conda test environment used by SWE-Bench has docutils<0.21 and roman
SHELL ["/bin/bash", "-c"]
RUN conda run -n testbed pip install --no-deps --force-reinstall 'docutils<0.21' 'roman' \
    || (source /opt/miniconda3/bin/activate testbed && pip install --no-deps --force-reinstall 'docutils<0.21' 'roman')
# Also install into the base environment for safety
RUN pip install --no-deps --force-reinstall 'docutils<0.21' 'roman'

USER openhands
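
As a quick sanity check (not part of this diff), the pinned packages can be inspected inside a wrapped image. The image tag below is a placeholder; the `/opt/miniconda3` path matches the Dockerfile above.

```bash
# Placeholder tag; substitute the wrapped agent-server image you built.
IMAGE=ghcr.io/openhands/eval-agent-server:example-tag

# Clear the entrypoint and confirm the testbed environment has the pins.
docker run --rm --entrypoint "" "$IMAGE" \
  /opt/miniconda3/bin/conda run -n testbed pip show docutils roman
```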
118 changes: 118 additions & 0 deletions benchmarks/swebenchmultilingual/README.md
@@ -0,0 +1,118 @@
# SWE-Bench Multilingual Benchmark Evaluation

This directory contains the implementation for running SWE-Bench Multilingual evaluation using OpenHands agents.

## Overview

SWE-Bench Multilingual is a benchmark for evaluating AI agents on real-world software engineering tasks from non-English GitHub repositories. The benchmark tests an agent's ability to understand problem statements in multiple languages, navigate codebases with non-English comments and documentation, and generate patches that resolve issues.

## Dataset

- **Source**: SWE-bench organization
- **Dataset**: `SWE-bench/SWE-bench_Multilingual`
- **Splits**: `test`

## Usage

### Docker Workspace (Local Evaluation)

#### Step 1: Build Docker Images

Before running inference, you need to build Docker images for the SWE-Bench Multilingual instances. Each instance requires a specific environment setup based on the repository and issue.

```bash
uv run python -m benchmarks.swebenchmultilingual.build_images \
--dataset SWE-bench/SWE-bench_Multilingual \
--split test \
--image ghcr.io/openhands/eval-agent-server \
--target source-minimal
```

#### Step 2: Run Inference

Run evaluation using the built Docker images:

```bash
uv run swebenchmultilingual-infer path/to/llm_config.json \
--dataset SWE-bench/SWE-bench_Multilingual \
--split test \
--max-iterations 100 \
--workspace docker
```

**Selecting specific instances:**

You can run evaluation on a specific subset by creating a text file with instance IDs:

```bash
# Create instances.txt with one instance ID per line
echo "django__django-11333" > instances.txt

# Run with selection
uv run swebenchmultilingual-infer path/to/llm_config.json \
--select instances.txt \
--workspace docker
```

### Remote Workspace (Scalable Cloud Evaluation)

Remote workspace enables running evaluations at scale by using a cloud-based runtime API to provision containers.

#### Step 1: Run Inference with Remote Workspace

```bash
uv run swebenchmultilingual-infer path/to/llm_config.json \
--dataset SWE-bench/SWE-bench_Multilingual \
--split test \
--max-iterations 100 \
--workspace remote
```

### Evaluation

After running inference, evaluate the results:

```bash
uv run swebenchmultilingual-eval <path_to_output.jsonl>
```

This will:
1. Convert the OpenHands output format to SWE-Bench prediction format
2. Run the SWE-Bench evaluation with the appropriate settings
3. Generate a cost report

Example:
```bash
uv run swebenchmultilingual-eval ./output/output.jsonl --workers 8
```

For more evaluation options:
```bash
uv run swebenchmultilingual-eval --help
```

## Configuration

The benchmark uses configuration options similar to those of regular SWE-Bench (a combined example follows the list):

- `--dataset`: Dataset name (should be `SWE-bench/SWE-bench_Multilingual`)
- `--split`: Dataset split (e.g., `test`)
- `--llm-config`: Path to LLM configuration file
- `--max-iterations`: Maximum number of agent iterations
- `--workspace`: Either `docker` or `remote`
- `--num-workers`: Number of parallel workers
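
For reference, a single invocation combining these options might look like the following; flag spellings are taken from the list above, so run `uv run swebenchmultilingual-infer --help` to confirm the exact interface:

```bash
uv run swebenchmultilingual-infer path/to/llm_config.json \
  --dataset SWE-bench/SWE-bench_Multilingual \
  --split test \
  --max-iterations 100 \
  --num-workers 8 \
  --workspace docker
```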

## Environment Variables

- `SKIP_BUILD=1`: Skip building Docker images (use pre-built images)
- `RUNTIME_API_KEY`: Required for remote workspace
- `RUNTIME_API_URL`: Runtime API URL (defaults to https://runtime.eval.all-hands.dev)
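
For example, a remote-workspace run typically exports the runtime credentials first; the values below are placeholders:

```bash
# Placeholder credentials; use your actual runtime API key.
export RUNTIME_API_KEY="your-api-key"
export RUNTIME_API_URL="https://runtime.eval.all-hands.dev"

uv run swebenchmultilingual-infer path/to/llm_config.json \
  --dataset SWE-bench/SWE-bench_Multilingual \
  --split test \
  --workspace remote
```

`SKIP_BUILD=1` can be set the same way when reusing images built earlier.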

## Multilingual Considerations

When working with multilingual instances:

- Problem statements may be in various languages
- Code comments and documentation may be multilingual
- Test output and error messages may be in non-English languages
- The agent should be able to handle multilingual contexts effectively

**Reviewer comment (Collaborator):**

🟠 Important: This section correctly identifies multilingual considerations, but the actual prompt template (prompts/default.j2) doesn't incorporate any of this guidance.

The README says agents "should be able to handle multilingual contexts effectively" but gives them no instructions on how to do so.

Either:

1. Update the prompt template to include this guidance (preferred), or
2. Explain why the default prompt is sufficient despite multilingual complexity
1 change: 1 addition & 0 deletions benchmarks/swebenchmultilingual/__init__.py
@@ -0,0 +1 @@
"""SWE-Bench Multilingual benchmark implementation."""
188 changes: 188 additions & 0 deletions benchmarks/swebenchmultilingual/build_images.py
@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Build agent-server images for all unique SWE-Bench Multilingual base images in a dataset split,
optionally wrapping them with a lightweight layer that pins docutils<0.21 and installs roman.

Example:
    uv run benchmarks/swebenchmultilingual/build_images.py \
        --dataset SWE-bench/SWE-bench_Multilingual --split test \
        --image ghcr.io/openhands/eval-agent-server --target source-minimal
"""

import sys
from pathlib import Path

from benchmarks.swebenchmultilingual import constants
from benchmarks.swebenchmultilingual.config import BUILD_DEFAULTS
from benchmarks.utils.build_utils import (
    BuildOutput,
    build_all_images,
    default_build_output_dir,
    get_build_parser,
    run_docker_build_layer,
)
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.image_utils import remote_image_exists
from openhands.sdk import get_logger


logger = get_logger(__name__)
WRAPPER_DOCKERFILE = Path(__file__).with_name("Dockerfile.swebench-deps")


def get_official_docker_image(
    instance_id: str,
    docker_image_prefix: str = constants.DOCKER_IMAGE_PREFIX,
) -> str:
    # Official SWE-Bench image
    # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
    repo, name = instance_id.split("__")
    official_image_name = docker_image_prefix.rstrip("/")
    official_image_name += (
        f"/sweb.eval.x86_64.{repo}_1776_{name}:{constants.DOCKER_IMAGE_TAG}".lower()
    )
    logger.debug(f"Official SWE-Bench image: {official_image_name}")
    return official_image_name


def extract_custom_tag(base_image: str) -> str:
    """
    Extract SWE-Bench instance ID from official SWE-Bench image name.

    Example:
        docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest
        -> sweb.eval.x86_64.django_1776_django-12155
    """
    name_tag = base_image.split("/")[-1]
    name = name_tag.split(":")[0]
    return name


def should_wrap_custom_tag(custom_tag: str) -> bool:
    prefix = "sweb.eval.x86_64."
    if custom_tag.startswith(prefix):
        custom_tag = custom_tag[len(prefix) :]
    return custom_tag.split("_", 1)[0] in constants.WRAPPED_REPOS


def should_wrap_instance_id(instance_id: str) -> bool:
    repo = instance_id.split("__")[0]
    return repo in constants.WRAPPED_REPOS


def collect_unique_base_images(
    dataset,
    split,
    n_limit,
    selected_instances_file: str | None = None,
):
    df = get_dataset(
        dataset_name=dataset,
        split=split,
        eval_limit=n_limit if n_limit else None,
        selected_instances_file=selected_instances_file,
    )
    return sorted(
        {get_official_docker_image(str(row["instance_id"])) for _, row in df.iterrows()}
    )


def wrap_image(agent_image: str, push: bool = False) -> BuildOutput:
    """
    Wrap an agent-server image with pinned docutils/roman.

    For pushes, verify the base tag exists in the registry. For local builds,
    assume the tag is available locally or resolvable by Docker during buildx.
    """
    if push and not remote_image_exists(agent_image):
        return BuildOutput(
            base_image=agent_image,
            tags=[],
            error=(
                f"Agent-server image {agent_image} not found in registry. "
                "Build and push it before wrapping."
            ),
        )

    if not WRAPPER_DOCKERFILE.exists():
        return BuildOutput(
            base_image=agent_image,
            tags=[],
            error=f"Wrapper Dockerfile not found at {WRAPPER_DOCKERFILE}",
        )

    logger.info("Wrapping %s in-place", agent_image)

    return run_docker_build_layer(
        dockerfile=WRAPPER_DOCKERFILE,
        context=WRAPPER_DOCKERFILE.parent,
        tags=[agent_image],
        build_args={"SDK_IMAGE": agent_image},
        push=push,
        platform="linux/amd64",
        load=not push,
    )


def _wrap_if_needed(result: BuildOutput, push: bool) -> BuildOutput:
    """
    Post-build callback that wraps images for repos that need docutils/roman.

    This is passed to build_all_images as post_build_fn, integrating wrapping
    into the main build pass with automatic retry support.
    """
    if not result.tags:
        return result

    agent_image = result.tags[0]
    # Extract custom tag from the built image tag to check if wrapping is needed
    # Format: ghcr.io/openhands/eval-agent-server:SHA-sweb.eval.x86_64.REPO_...-target
    tag_part = agent_image.split(":")[-1] if ":" in agent_image else ""
    # Remove SDK SHA prefix and target suffix to get the custom tag
    parts = tag_part.split("-", 1)
    custom_tag = parts[1].rsplit("-", 1)[0] if len(parts) > 1 else tag_part

    if not should_wrap_custom_tag(custom_tag):
        return result

    logger.info("Image %s needs wrapping, applying docutils/roman layer", agent_image)
    wrap_result = wrap_image(agent_image, push)
    if wrap_result.error:
        return BuildOutput(
            base_image=result.base_image,
            tags=result.tags,
            error=f"Wrapping failed: {wrap_result.error}",
        )

    return result


def main(argv: list[str]) -> int:
    parser = get_build_parser()
    parser.set_defaults(**BUILD_DEFAULTS)
    args = parser.parse_args(argv)

    base_images: list[str] = collect_unique_base_images(
        args.dataset,
        args.split,
        args.n_limit,
        args.select,
    )
    build_dir = default_build_output_dir(args.dataset, args.split)

    return build_all_images(
        base_images=base_images,
        target=args.target,
        build_dir=build_dir,
        image=args.image,
        push=args.push,
        max_workers=args.max_workers,
        dry_run=args.dry_run,
        max_retries=args.max_retries,
        base_image_to_custom_tag_fn=extract_custom_tag,
        post_build_fn=_wrap_if_needed,
    )


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
26 changes: 26 additions & 0 deletions benchmarks/swebenchmultilingual/config.py
@@ -0,0 +1,26 @@
"""
SWE-bench Multilingual benchmark configuration.

Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Inference defaults (used by run_infer.py)
INFER_DEFAULTS = {
    "dataset": "SWE-bench/SWE-bench_Multilingual",
    "split": "test",
    "num_workers": 30,
}

# Evaluation defaults (used by eval_infer.py)
EVAL_DEFAULTS = {
    "dataset": "SWE-bench/SWE-bench_Multilingual",
    "split": "test",
    "workers": 12,
    "modal": True,
    "timeout": 3600,
}

# Build defaults (used by build_images.py)
BUILD_DEFAULTS = {
    "max_workers": 32,
}
39 changes: 39 additions & 0 deletions benchmarks/swebenchmultilingual/constants.py
@@ -0,0 +1,39 @@
"""
SWE-Bench Multilingual hyperparameters and constant values.

This module provides constant values used in the SWE-Bench Multilingual evaluation workflow.
For dataset, model, and worker defaults, see config.py (INFER_DEFAULTS, EVAL_DEFAULTS).
"""

from typing import Final, Literal


# Docker
DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/"
DOCKER_IMAGE_TAG: Final[str] = "latest"
WRAPPED_REPOS: Final[frozenset[str]] = frozenset(
    {"sphinx-doc"}
)  # Repos requiring docutils/roman wrapper

**Reviewer comment (Collaborator):**

🟢 Acceptable: Only sphinx-doc requires the docutils/roman wrapper.

Out of 300 test instances, this seems surprisingly specific. Is this correct for the multilingual dataset, or are there other repos that might need wrapping?

If this is intentionally minimal (i.e., the multilingual dataset doesn't include sphinx-doc-heavy repos), that's fine. Just flagging for verification.

# Build target type (matches openhands.agent_server.docker.build.TargetType)
TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"]
BUILD_TARGET_SOURCE_MINIMAL: Final[TargetType] = "source-minimal"
BUILD_TARGET_BINARY: Final[TargetType] = "binary"
DEFAULT_BUILD_TARGET: Final[TargetType] = BUILD_TARGET_SOURCE_MINIMAL

# Runtime
DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"
DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600


# Git
GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev"
GIT_USER_NAME: Final[str] = "OpenHands Evaluation"
GIT_COMMIT_MESSAGE: Final[str] = "patch"

# Patch Processing
SETUP_FILES_TO_REMOVE: Final[tuple[str, ...]] = (
    "pyproject.toml",
    "tox.ini",
    "setup.py",
)
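
For illustration only, the patch-processing constant above could be consumed by a helper along these lines; the function name and its behavior are assumptions for this sketch, not part of this diff:

```python
def strip_setup_file_changes(patch: str, files_to_remove=SETUP_FILES_TO_REMOVE) -> str:
    """Drop per-file sections of a unified diff that touch the listed setup files."""
    kept: list[str] = []
    keep_section = True
    for line in patch.splitlines(keepends=True):
        if line.startswith("diff --git "):
            # Each "diff --git a/<path> b/<path>" line starts a new file section.
            target = line.split(" b/")[-1].strip()
            keep_section = target not in files_to_remove
        if keep_section:
            kept.append(line)
    return "".join(kept)
```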