google-deepmind · huangyz0918 · Jun 12, 2026 · Jun 12, 2026
diff --git a/.gitignore b/.gitignore
@@ -224,4 +224,7 @@ data/checkpoints
 .python-version
 
 # Non-commercial Data
-data/toxicchat_*
+data/toxicchat_*
+
+# Local configuration file
+CLAUDE.md
diff --git a/experiment/README.md b/experiment/README.md
@@ -173,6 +173,12 @@ python -m experiment.aggregate_results --results-dir results/
 
 End-to-end pipeline: GMM model selection → encoder training → unified experiment → aggregation.
 
+> [!NOTE]
+> The `script/` directory and its `*.sh` wrappers are **not included** in this
+> repository. Run the `python -m experiment.*` commands above directly (section
+> 6 covers the train + sample pipeline). The variables below document the
+> intended wrapper configuration if you recreate it.
+
 ```bash
 # Run all datasets with defaults
 bash script/run_performance_estimation_all.sh
@@ -227,7 +233,9 @@ python -m experiment.exp_run_all_groups --skip-training --n-runs 10
 | `--skip-training` | off | Reuse existing encoder checkpoints |
 
 > [!NOTE]
-> The shell script `run_all_case_2.sh` wraps this command with pre-configured settings.
+> A `run_all_case_2.sh` wrapper for this command is referenced in places but is
+> not included in the repository — invoke `python -m experiment.exp_run_all_groups`
+> directly.
 
 ---
 
@@ -240,8 +248,8 @@ python -m experiment.exp_performance_estimation \
     --encoder-path data/checkpoints/encoder_svamp_new_pair.pth \
     --n-runs 5
 
-# 2. Or use the shell script (GMM → encoder training → experiment → aggregate)
-bash script/run_performance_estimation_all.sh
+# 2. Or run the full train + sample pipeline (GMM → encoder training → sampling)
+python -m experiment.exp_run_all_groups --setting new_pair --n-runs 5
 
 # 3. Failure discovery
 python -m experiment.exp_failure_discovery \

diff --git a/proeval/README.md b/proeval/README.md
@@ -309,20 +309,26 @@ for i in range(5):
 
 ## 3. Dataset — Bring Your Own Data
 
-`Dataset` bundles **questions + ground truths + a `DatasetConfig`** in a single
-object that the predictor (and, later, the sampler) operates on. Use it whenever
-you want to evaluate models on data that isn't already wired into
+`Dataset` is the single object that flows through the whole pipeline —
+**prediction → sampling → generation**. It bundles **questions + ground truths +
+a `DatasetConfig`**, and (when built from a predictions CSV) also exposes the
+prediction matrix and embeddings the sampler/generator need. Use it whenever you
+want to evaluate or sample on data that isn't already wired into
 `DATASET_CONFIGS`.
 
 ### Constructors
 
 ```python
 from proeval import Dataset, DATASET_CONFIGS, LLMPredictor
 
-# (a) Built-in: load one of the 9 datasets shipped with ProEval
+# (a) Built-in: load one of the 10 built-in datasets from HuggingFace
 ds = Dataset.from_builtin("svamp")
 
-# (b) From in-memory lists (simplest custom case)
+# (b) From a pre-computed predictions CSV — offline, and the bridge to sampling.
+#     Carries questions/ground_truths AND the label_<model> matrix + embeddings.
+ds = Dataset.from_predictions("svamp")
+
+# (c) From in-memory lists (simplest custom case)
 ds = Dataset.from_lists(
     name="my_yesno",
     questions=["Is the sky blue?", "Is fire cold?"],
@@ -333,7 +339,7 @@ ds = Dataset.from_lists(
     compare_predictions=lambda p, g: 0.0 if str(p).lower() == g else 1.0,
 )
 
-# (c) From a CSV file
+# (d) From a CSV file
 ds = Dataset.from_csv(
     "my_data.csv",
     question_col="question",
@@ -343,7 +349,35 @@ ds = Dataset.from_csv(
 ```
 
 If you already have a built-in `DatasetConfig` that fits your scoring needs,
-pass it via `config=...` and skip the four eval-function arguments.
+pass it via `config=...` and skip the four eval-function arguments. A `config`
+is only required for `predict()`; a `Dataset` built purely for sampling can omit
+it.
+
+### Use a Dataset everywhere
+
+A `Dataset` can be passed directly to the sampler and generator in place of a
+dataset-name string — one object carries the data through every stage:
+
+```python
+from proeval import BQPriorSampler, TopicAwareGenerator
+
+ds = Dataset.from_predictions("svamp")
+
+# Sampling: equivalent to sample(predictions="svamp", ...); the Dataset
+# supplies its own name (for GMM selection) and its cached predictions.
+result = BQPriorSampler(noise_variance=0.3).sample(
+    predictions=ds, target_model="gemini25_flash", budget=50,
+)
+
+# The accessors the sampler relies on are also available directly:
+matrix, model_names = ds.prediction_matrix()   # (n_samples, n_models), 1=failure
+embeddings = ds.embeddings()                    # (n_samples, d)
+
+# Generation: pass the Dataset as `df`; its name drives the prompt format.
+gen = TopicAwareGenerator(df=ds, prior_u=prior_u, prior_S=prior_S)
+```
+
+Passing a name string or a raw DataFrame still works exactly as before.
 
 ### Predict
 
@@ -411,6 +445,7 @@ results = predictor.predict_batch_parallel(
 | GSM8K      | `"gsm8k"`      | Math problem solving    |
 | SVAMP      | `"svamp"`      | Math word problems      |
 | MMLU       | `"mmlu"`       | Multiple choice         |
+| MMLU (Law) | `"mmlu_professionallaw"` | Multiple choice |
 | Jigsaw     | `"jigsaw"`     | Toxicity classification |
 | ToxicChat  | `"toxicchat"`  | Toxicity classification |
 | GQA        | `"gqa"`        | Visual QA               |
@@ -626,5 +661,9 @@ python -m experiment.exp_performance_estimation \
 
 ## Available Data Files
 
-The `data/` directory contains pre-computed prediction CSVs and embeddings for:
-`gsm8k`, `svamp`, `strategyqa`, `mmlu`, `mmlu_professionallaw`, `jigsaw`, `toxicchat`, `gqa`, `dices`.
+The `data/` directory contains pre-computed prediction CSVs and embeddings for
+8 datasets:
+`gsm8k`, `svamp`, `strategyqa`, `mmlu`, `jigsaw`, `gqa`, `dices`, `dices_t2i`.
+
+(`mmlu_professionallaw` and `toxicchat` have `DATASET_CONFIGS` entries for
+prediction via `LLMPredictor`, but no pre-computed files ship in `data/`.)
diff --git a/proeval/__init__.py b/proeval/__init__.py
@@ -31,9 +31,9 @@
         "Is the sky blue?", True, DATASET_CONFIGS["strategyqa"]
     )
 
-    # Test case generation
-    gen = TopicAwareGenerator(topics=["arithmetic"], hard_examples=[...])
-    case = gen.generate(strategy="hss_gen")
+    # Test case generation — pass a Dataset (or DataFrame) plus a GP prior
+    gen = TopicAwareGenerator(df=df, dataset="gsm8k", prior_u=u, prior_S=S)
+    case = gen.generate(strategy="tss")
 """
 
 __version__ = "0.1.0"

diff --git a/proeval/generator/core.py b/proeval/generator/core.py
@@ -431,7 +431,9 @@ class TopicAwareGenerator:
        neutral prior (0.5).  No encoder or model predictions needed.
 
     Args:
-        df: Source DataFrame with ``question`` and ``ground_truth`` columns.
+        df: Source DataFrame with ``question`` and ``ground_truth`` columns,
+            or a :class:`~proeval.utils.Dataset` (its frame and ``name`` are
+            used; a non-default *dataset* still overrides the prompt format).
         dataset: ``"gsm8k"`` or ``"strategyqa"``.
         api_key: OpenRouter API key (or set ``OPENROUTER_API_KEY`` env var).
         model: Model to use for generation.
@@ -477,6 +479,16 @@ def __init__(
         ss_threshold: float = 0.0,
         ss_beta: float = 1.96,
     ):
+        # Accept a Dataset in place of a DataFrame: derive the question frame
+        # and the dataset name from it. Any `dataset` other than "gsm8k" still
+        # wins; "gsm8k" is treated as unset and replaced by the Dataset's name.
+        from proeval.utils.dataset import Dataset
+
+        if isinstance(df, Dataset):
+            if dataset == "gsm8k":
+                dataset = df.name
+            df = df.to_frame()
+
         self.df = df
         self.dataset = dataset
         self.client = OpenRouterClient(api_key=api_key)

diff --git a/proeval/sampler/bq.py b/proeval/sampler/bq.py
@@ -68,6 +68,26 @@
 )
 
 
+def _resolve_predictions(
+    predictions: Union[str, "pd.DataFrame", "Dataset"],  # noqa: F821
+    data_dir: Optional[str] = None,
+) -> Tuple["pd.DataFrame", Optional[str]]:
+    """Normalise the ``predictions`` argument to ``(DataFrame, dataset_name)``.
+
+    A ``str`` is loaded via ``load_predictions`` and doubles as the name; a
+    :class:`~proeval.utils.Dataset` supplies its predictions frame and ``name``.
+    Anything else is passed through unchecked with ``dataset_name=None``, which
+    disables name-dependent behaviour (GMM selection, DICES label binarisation).
+    """
+    # Local import keeps the sampler import-light and avoids any cycle.
+    from proeval.utils.dataset import Dataset
+
+    if isinstance(predictions, Dataset):
+        return predictions.predictions(data_dir=data_dir), predictions.name
+    if isinstance(predictions, str):
+        return load_predictions(predictions, data_dir=data_dir), predictions
+    return predictions, None
+
 
 # Result container
 @dataclass
@@ -681,7 +701,7 @@ def __init__(
 
     def sample(
         self,
-        predictions: Union[str, pd.DataFrame],
+        predictions: Union[str, pd.DataFrame, "Dataset"],  # noqa: F821
         target_model: Union[int, str] = "gemini25_flash",
         budget: int = 50,
         data_dir: str = None,
@@ -693,8 +713,10 @@ def sample(
         """Run BQ active sampling.
 
         Args:
-            predictions: Either a dataset name (e.g., ``"svamp"``) which will be
-                loaded from ``data_dir``, or a pre-loaded DataFrame.
+            predictions: A dataset name (e.g. ``"svamp"``) loaded from
+                ``data_dir``, a pre-loaded DataFrame, or a
+                :class:`~proeval.utils.Dataset` (its ``name`` is used for GMM
+                selection and label binarisation).
             target_model: Index or name of the model to target for testing.
             budget: Number of samples to acquire.
             data_dir: Directory containing prediction CSVs (default: ``data/``).
@@ -715,13 +737,8 @@ def sample(
         if seed is not None:
             np.random.seed(seed)
 
-        # Load data
-        if isinstance(predictions, str):
-            df = load_predictions(predictions, data_dir=data_dir)
-            dataset_name = predictions
-        else:
-            df = predictions
-            dataset_name = None
+        # Load data (accepts a dataset name, a DataFrame, or a Dataset)
+        df, dataset_name = _resolve_predictions(predictions, data_dir)
 
         pred_matrix, model_names = extract_model_predictions(df, dataset_name)
 
@@ -1010,7 +1027,7 @@ def __init__(
 
     def sample(
         self,
-        predictions: Union[str, pd.DataFrame],
+        predictions: Union[str, pd.DataFrame, "Dataset"],  # noqa: F821
         target_model: Union[int, str] = "gemini25_flash",
         budget: int = 50,
         data_dir: str = None,
@@ -1019,7 +1036,8 @@ def sample(
         """Run BQ active sampling with encoder prior.
 
         Args:
-            predictions: Dataset name or pre-loaded DataFrame.
+            predictions: Dataset name, pre-loaded DataFrame, or a
+                :class:`~proeval.utils.Dataset`.
             target_model: Index or name of the target model.
             budget: Number of samples to acquire.
             data_dir: Data directory path.
@@ -1031,15 +1049,10 @@ def sample(
         if seed is not None:
             np.random.seed(seed)
 
-        # Load data to get test_y (the target model's labels)
-        if isinstance(predictions, str):
-            df = load_predictions(predictions, data_dir=data_dir)
-        else:
-            df = predictions
+        # Load data to get test_y (accepts a name, DataFrame, or Dataset)
+        df, dataset_name = _resolve_predictions(predictions, data_dir)
 
-        pred_matrix, model_names = extract_model_predictions(
-            df, predictions if isinstance(predictions, str) else None
-        )
+        pred_matrix, model_names = extract_model_predictions(df, dataset_name)
 
         # Resolve target model
         if isinstance(target_model, str):