diff --git a/.gitignore b/.gitignore
index f515cd1..9c94a9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -224,4 +224,7 @@ data/checkpoints
 .python-version
 
 # Non-commercial Data
-data/toxicchat_*
\ No newline at end of file
+data/toxicchat_*
+
+# Local configuration file
+CLAUDE.md
\ No newline at end of file
diff --git a/experiment/README.md b/experiment/README.md
index 8609eee..c2b2504 100644
--- a/experiment/README.md
+++ b/experiment/README.md
@@ -173,6 +173,12 @@ python -m experiment.aggregate_results --results-dir results/
 
 End-to-end pipeline: GMM model selection → encoder training → unified experiment → aggregation.
 
+> [!NOTE]
+> The `script/` directory and its `*.sh` wrappers are **not included** in this
+> repository. Run the `python -m experiment.*` commands above directly (section
+> 6 covers the train + sample pipeline). The variables below document the
+> intended wrapper configuration if you recreate it.
+
 ```bash
 # Run all datasets with defaults
 bash script/run_performance_estimation_all.sh
@@ -227,7 +233,9 @@ python -m experiment.exp_run_all_groups --skip-training --n-runs 10
 | `--skip-training` | off | Reuse existing encoder checkpoints |
 
 > [!NOTE]
-> The shell script `run_all_case_2.sh` wraps this command with pre-configured settings.
+> A `run_all_case_2.sh` wrapper for this command is referenced in places but is
+> not included in the repository — invoke `python -m experiment.exp_run_all_groups`
+> directly.
 
 ---
 
@@ -240,8 +248,8 @@ python -m experiment.exp_performance_estimation \
     --encoder-path data/checkpoints/encoder_svamp_new_pair.pth \
     --n-runs 5
 
-# 2. Or use the shell script (GMM → encoder training → experiment → aggregate)
-bash script/run_performance_estimation_all.sh
+# 2. Or run the full train + sample pipeline (GMM → encoder training → sampling)
+python -m experiment.exp_run_all_groups --setting new_pair --n-runs 5
 
 # 3. Failure discovery
 python -m experiment.exp_failure_discovery \
diff --git a/proeval/README.md b/proeval/README.md
index 70f4bb4..5d023f7 100644
--- a/proeval/README.md
+++ b/proeval/README.md
@@ -309,9 +309,11 @@ for i in range(5):
 
 ## 3. Dataset — Bring Your Own Data
 
-`Dataset` bundles **questions + ground truths + a `DatasetConfig`** in a single
-object that the predictor (and, later, the sampler) operates on. Use it whenever
-you want to evaluate models on data that isn't already wired into
+`Dataset` is the single object that flows through the whole pipeline —
+**prediction → sampling → generation**. It bundles **questions + ground truths +
+a `DatasetConfig`**, and (when built from a predictions CSV) also exposes the
+prediction matrix and embeddings the sampler/generator need. Use it whenever you
+want to evaluate or sample on data that isn't already wired into
 `DATASET_CONFIGS`.
 
 ### Constructors
@@ -319,10 +321,14 @@ you want to evaluate models on data that isn't already wired into
 ```python
 from proeval import Dataset, DATASET_CONFIGS, LLMPredictor
 
-# (a) Built-in: load one of the 9 datasets shipped with ProEval
+# (a) Built-in: load one of the 10 built-in datasets from HuggingFace
 ds = Dataset.from_builtin("svamp")
 
-# (b) From in-memory lists (simplest custom case)
+# (b) From a pre-computed predictions CSV — offline, and the bridge to sampling.
+#     Carries questions/ground_truths AND the label_<model> matrix + embeddings.
+ds = Dataset.from_predictions("svamp")
+
+# (c) From in-memory lists (simplest custom case)
 ds = Dataset.from_lists(
     name="my_yesno",
     questions=["Is the sky blue?", "Is fire cold?"],
@@ -333,7 +339,7 @@ ds = Dataset.from_lists(
     compare_predictions=lambda p, g: 0.0 if str(p).lower() == g else 1.0,
 )
 
-# (c) From a CSV file
+# (d) From a CSV file
 ds = Dataset.from_csv(
     "my_data.csv",
     question_col="question",
@@ -343,7 +349,35 @@ ds = Dataset.from_csv(
 ```
 
 If you already have a built-in `DatasetConfig` that fits your scoring needs,
-pass it via `config=...` and skip the four eval-function arguments.
+pass it via `config=...` and skip the four eval-function arguments. A `config`
+is only required for `predict()`; a `Dataset` built purely for sampling can omit
+it.
+
+### Use a Dataset everywhere
+
+A `Dataset` can be passed directly to the sampler and generator in place of a
+dataset-name string — one object carries the data through every stage:
+
+```python
+from proeval import BQPriorSampler, TopicAwareGenerator
+
+ds = Dataset.from_predictions("svamp")
+
+# Sampling: equivalent to sample(predictions="svamp", ...); the Dataset
+# supplies its name (for GMM selection) and reuses its cached predictions.
+result = BQPriorSampler(noise_variance=0.3).sample(
+    predictions=ds, target_model="gemini25_flash", budget=50,
+)
+
+# The accessors the sampler relies on are also available directly:
+matrix, model_names = ds.prediction_matrix()   # (n_samples, n_models), 1=failure
+embeddings = ds.embeddings()                    # (n_samples, d)
+
+# Generation: pass the Dataset as `df`; its name drives the prompt format.
+gen = TopicAwareGenerator(df=ds, prior_u=prior_u, prior_S=prior_S)
+```
+
+Passing a name string or a raw DataFrame still works exactly as before.
 
 ### Predict
 
@@ -411,6 +445,7 @@ results = predictor.predict_batch_parallel(
 | GSM8K      | `"gsm8k"`      | Math problem solving    |
 | SVAMP      | `"svamp"`      | Math word problems      |
 | MMLU       | `"mmlu"`       | Multiple choice         |
+| MMLU (Law) | `"mmlu_professionallaw"` | Multiple choice |
 | Jigsaw     | `"jigsaw"`     | Toxicity classification |
 | ToxicChat  | `"toxicchat"`  | Toxicity classification |
 | GQA        | `"gqa"`        | Visual QA               |
@@ -626,5 +661,9 @@ python -m experiment.exp_performance_estimation \
 
 ## Available Data Files
 
-The `data/` directory contains pre-computed prediction CSVs and embeddings for:
-`gsm8k`, `svamp`, `strategyqa`, `mmlu`, `mmlu_professionallaw`, `jigsaw`, `toxicchat`, `gqa`, `dices`.
+The `data/` directory contains pre-computed prediction CSVs and embeddings for
+8 datasets:
+`gsm8k`, `svamp`, `strategyqa`, `mmlu`, `jigsaw`, `gqa`, `dices`, `dices_t2i`.
+
+(`mmlu_professionallaw` and `toxicchat` have `DATASET_CONFIGS` entries for
+prediction via `LLMPredictor`, but no pre-computed files ship in `data/`.)
diff --git a/proeval/__init__.py b/proeval/__init__.py
index 25b733d..6667730 100644
--- a/proeval/__init__.py
+++ b/proeval/__init__.py
@@ -31,9 +31,9 @@
         "Is the sky blue?", True, DATASET_CONFIGS["strategyqa"]
     )
 
-    # Test case generation
-    gen = TopicAwareGenerator(topics=["arithmetic"], hard_examples=[...])
-    case = gen.generate(strategy="hss_gen")
+    # Test case generation — pass a Dataset (or DataFrame) plus a GP prior
+    gen = TopicAwareGenerator(df=df, dataset="gsm8k", prior_u=u, prior_S=S)
+    case = gen.generate(strategy="tss")
 """
 
 __version__ = "0.1.0"
diff --git a/proeval/generator/core.py b/proeval/generator/core.py
index 31e6452..0d8a6ee 100644
--- a/proeval/generator/core.py
+++ b/proeval/generator/core.py
@@ -431,7 +431,9 @@ class TopicAwareGenerator:
        neutral prior (0.5).  No encoder or model predictions needed.
 
     Args:
-        df: Source DataFrame with ``question`` and ``ground_truth`` columns.
+        df: Source DataFrame with ``question`` and ``ground_truth`` columns,
+            or a :class:`~proeval.utils.Dataset` (its frame and ``name`` are
+            used; a non-default *dataset* still overrides the prompt format).
         dataset: ``"gsm8k"`` or ``"strategyqa"``.
         api_key: OpenRouter API key (or set ``OPENROUTER_API_KEY`` env var).
         model: Model to use for generation.
@@ -477,6 +479,16 @@ def __init__(
         ss_threshold: float = 0.0,
         ss_beta: float = 1.96,
     ):
+        # Accept a Dataset in place of a DataFrame: derive the question frame
+        # and the dataset name from it. A `dataset` other than "gsm8k" still
+        # wins; "gsm8k" itself is treated as unset and replaced by df.name.
+        from proeval.utils.dataset import Dataset
+
+        if isinstance(df, Dataset):
+            if dataset == "gsm8k":
+                dataset = df.name
+            df = df.to_frame()
+
         self.df = df
         self.dataset = dataset
         self.client = OpenRouterClient(api_key=api_key)
diff --git a/proeval/sampler/bq.py b/proeval/sampler/bq.py
index 342c368..40a57b7 100644
--- a/proeval/sampler/bq.py
+++ b/proeval/sampler/bq.py
@@ -68,6 +68,26 @@
 )
 
 
+def _resolve_predictions(
+    predictions: Union[str, "pd.DataFrame", "Dataset"],  # noqa: F821
+    data_dir: Optional[str] = None,
+) -> Tuple["pd.DataFrame", Optional[str]]:
+    """Normalise the ``predictions`` argument to ``(DataFrame, dataset_name)``.
+
+    Accepts a dataset name (loaded by convention), a pre-loaded DataFrame, or a
+    :class:`~proeval.utils.Dataset` (uses its cached/lazily-loaded predictions
+    and its ``name``). ``dataset_name`` is ``None`` for a bare DataFrame, which
+    disables name-dependent behaviour (GMM selection, DICES label binarisation).
+    """
+    # Local import keeps the sampler import-light and avoids any cycle.
+    from proeval.utils.dataset import Dataset
+
+    if isinstance(predictions, Dataset):
+        return predictions.predictions(data_dir=data_dir), predictions.name
+    if isinstance(predictions, str):
+        return load_predictions(predictions, data_dir=data_dir), predictions
+    return predictions, None
+
 
 # Result container
 @dataclass
@@ -681,7 +701,7 @@ def __init__(
 
     def sample(
         self,
-        predictions: Union[str, pd.DataFrame],
+        predictions: Union[str, pd.DataFrame, "Dataset"],  # noqa: F821
         target_model: Union[int, str] = "gemini25_flash",
         budget: int = 50,
         data_dir: str = None,
@@ -693,8 +713,10 @@ def sample(
         """Run BQ active sampling.
 
         Args:
-            predictions: Either a dataset name (e.g., ``"svamp"``) which will be
-                loaded from ``data_dir``, or a pre-loaded DataFrame.
+            predictions: A dataset name (e.g. ``"svamp"``) loaded from
+                ``data_dir``, a pre-loaded DataFrame, or a
+                :class:`~proeval.utils.Dataset` (its ``name`` is used for GMM
+                selection and label binarisation).
             target_model: Index or name of the model to target for testing.
             budget: Number of samples to acquire.
             data_dir: Directory containing prediction CSVs (default: ``data/``).
@@ -715,13 +737,8 @@ def sample(
         if seed is not None:
             np.random.seed(seed)
 
-        # Load data
-        if isinstance(predictions, str):
-            df = load_predictions(predictions, data_dir=data_dir)
-            dataset_name = predictions
-        else:
-            df = predictions
-            dataset_name = None
+        # Load data (accepts a dataset name, a DataFrame, or a Dataset)
+        df, dataset_name = _resolve_predictions(predictions, data_dir)
 
         pred_matrix, model_names = extract_model_predictions(df, dataset_name)
 
@@ -1010,7 +1027,7 @@ def __init__(
 
     def sample(
         self,
-        predictions: Union[str, pd.DataFrame],
+        predictions: Union[str, pd.DataFrame, "Dataset"],  # noqa: F821
         target_model: Union[int, str] = "gemini25_flash",
         budget: int = 50,
         data_dir: str = None,
@@ -1019,7 +1036,8 @@ def sample(
         """Run BQ active sampling with encoder prior.
 
         Args:
-            predictions: Dataset name or pre-loaded DataFrame.
+            predictions: Dataset name, pre-loaded DataFrame, or a
+                :class:`~proeval.utils.Dataset`.
             target_model: Index or name of the target model.
             budget: Number of samples to acquire.
             data_dir: Data directory path.
@@ -1031,15 +1049,10 @@ def sample(
         if seed is not None:
             np.random.seed(seed)
 
-        # Load data to get test_y (the target model's labels)
-        if isinstance(predictions, str):
-            df = load_predictions(predictions, data_dir=data_dir)
-        else:
-            df = predictions
+        # Load data to get test_y (accepts a name, DataFrame, or Dataset)
+        df, dataset_name = _resolve_predictions(predictions, data_dir)
 
-        pred_matrix, model_names = extract_model_predictions(
-            df, predictions if isinstance(predictions, str) else None
-        )
+        pred_matrix, model_names = extract_model_predictions(df, dataset_name)
 
         # Resolve target model
         if isinstance(target_model, str):
diff --git a/proeval/utils/dataset.py b/proeval/utils/dataset.py
index 7b0a040..fb89caa 100644
--- a/proeval/utils/dataset.py
+++ b/proeval/utils/dataset.py
@@ -24,13 +24,22 @@
 Use one of the constructors to build a :class:`Dataset`:
 
-- :meth:`Dataset.from_builtin` — one of the 9 datasets shipped with ProEval
+- :meth:`Dataset.from_builtin` — one of the 10 built-in datasets
-  (``svamp``, ``gsm8k``, ``strategyqa``, ...).
+  (``svamp``, ``gsm8k``, ``strategyqa``, ...). Loads questions/ground_truths
+  from HuggingFace (requires the ``[datasets]`` extra).
+- :meth:`Dataset.from_predictions` — build from a pre-computed
+  ``<name>_predictions.csv``. Offline, and doubles as the bridge to the
+  sampler/generator (carries the prediction matrix + embeddings by name).
 - :meth:`Dataset.from_lists` — pass questions/ground_truths/eval functions
   directly. The simplest way to bring a custom dataset.
 - :meth:`Dataset.from_csv` — load questions and ground truths from a CSV.
 
 Run predictions with :meth:`Dataset.predict` (or
-:meth:`~proeval.evaluator.LLMPredictor.predict_dataset`).
+:meth:`~proeval.evaluator.LLMPredictor.predict_dataset`). A :class:`Dataset`
+can also be passed straight to
+:meth:`~proeval.sampler.BQPriorSampler.sample` and
+:class:`~proeval.generator.TopicAwareGenerator` — see those for the sampling
+side, and :meth:`prediction_matrix` / :meth:`embeddings` for the accessors
+they rely on.
 
 Example — built-in::
 
@@ -93,7 +102,10 @@ def __init__(
         name: str,
         questions: List[Any],
         ground_truths: List[Any],
-        config: DatasetConfig,
+        config: Optional[DatasetConfig] = None,
+        *,
+        data_dir: Optional[str] = None,
+        predictions_df: Optional["pd.DataFrame"] = None,  # noqa: F821
     ):
         if len(questions) != len(ground_truths):
             raise ValueError(
@@ -103,7 +115,15 @@ def __init__(
         self.name = name
         self.questions = list(questions)
         self.ground_truths = list(ground_truths)
+        #: Scoring config. Required by :meth:`predict`; optional for datasets
+        #: built only for sampling/generation (e.g. :meth:`from_predictions`).
         self.config = config
+        #: Directory holding ``<name>_predictions.csv`` / ``<name>_embeddings_*``
+        #: used by the sampling accessors. ``None`` → the package ``data/`` dir.
+        self.data_dir = data_dir
+        #: Cached predictions DataFrame, populated when the dataset was built
+        #: from a predictions CSV. ``None`` → resolved lazily by *name*.
+        self._predictions_df = predictions_df
 
     # Container protocol — supports len(), indexing, iteration. This also
     # gives the future sampler a uniform interface to operate on.
@@ -148,6 +168,54 @@ def from_builtin(cls, name: str) -> "Dataset":
             config=DATASET_CONFIGS[name],
         )
 
+    @classmethod
+    def from_predictions(
+        cls,
+        name: str,
+        data_dir: Optional[str] = None,
+        config: Optional[DatasetConfig] = None,
+    ) -> "Dataset":
+        """Build a Dataset from a pre-computed predictions CSV.
+
+        Loads ``<data_dir>/<name>_predictions.csv`` (the same file the sampler
+        consumes) and uses its ``question`` / ``ground_truth`` columns. The
+        loaded frame is cached, so the sampling accessors
+        (:meth:`predictions`, :meth:`prediction_matrix`, :meth:`embeddings`)
+        resolve without re-reading the file.
+
+        This is the offline bridge between evaluation and sampling: the
+        resulting :class:`Dataset` can be passed directly to
+        ``LLMPredictor`` (if a *config* is available), to
+        :meth:`~proeval.sampler.BQPriorSampler.sample`, and to
+        :class:`~proeval.generator.TopicAwareGenerator`.
+
+        Args:
+            name: Dataset name, e.g. ``"svamp"``. Resolves the CSV by the
+                standard ``<name>_predictions.csv`` convention.
+            data_dir: Directory holding the CSV. ``None`` → package ``data/``.
+            config: Scoring config. ``None`` → ``DATASET_CONFIGS[name]`` when
+                *name* is a built-in dataset, otherwise left unset (sampling
+                still works; :meth:`predict` will require a config).
+        """
+        from proeval.sampler.data import load_predictions
+
+        df = load_predictions(name, data_dir=data_dir)
+        for col in ("question", "ground_truth"):
+            if col not in df.columns:
+                raise ValueError(
+                    f"Predictions CSV for {name!r} is missing a {col!r} column; "
+                    f"found {list(df.columns)[:6]}..."
+                )
+        resolved_config = config if config is not None else DATASET_CONFIGS.get(name)
+        return cls(
+            name=name,
+            questions=df["question"].tolist(),
+            ground_truths=df["ground_truth"].tolist(),
+            config=resolved_config,
+            data_dir=data_dir,
+            predictions_df=df,
+        )
+
     @classmethod
     def from_lists(
         cls,
@@ -265,6 +333,12 @@ def predict(
             score)`` tuples — the same shape as
             :meth:`~proeval.evaluator.LLMPredictor.predict_batch_parallel`.
         """
+        if self.config is None:
+            raise ValueError(
+                f"Dataset {self.name!r} has no scoring config, so it cannot be "
+                "predicted. Build it with a `config=` (or the four eval "
+                "functions), or use a built-in dataset."
+            )
         if parallel:
             return predictor.predict_batch_parallel(
                 self.questions,
@@ -282,6 +356,63 @@ def predict(
             show_progress=show_progress,
         )
 
+    # Sampling data accessors
+    #
+    # These bridge the Dataset to the sampler/generator. They resolve the
+    # pre-computed prediction CSV / embeddings by *name*. Only from_predictions
+    # pre-caches the predictions frame; otherwise it is loaded lazily.
+
+    def predictions(self, data_dir: Optional[str] = None) -> "pd.DataFrame":  # noqa: F821
+        """Return the predictions DataFrame (``label_<model>`` columns).
+
+        Uses the cached frame when available and no *data_dir* is given;
+        otherwise loads ``<name>_predictions.csv`` by convention.
+        """
+        if self._predictions_df is not None and data_dir is None:
+            return self._predictions_df
+        from proeval.sampler.data import load_predictions
+
+        df = load_predictions(self.name, data_dir=data_dir or self.data_dir)
+        if data_dir is None:
+            self._predictions_df = df
+        return df
+
+    def prediction_matrix(self, data_dir: Optional[str] = None):
+        """Return ``(prediction_matrix, model_names)`` for this dataset.
+
+        Thin wrapper over
+        :func:`~proeval.sampler.data.extract_model_predictions` that passes
+        *name* so DICES-style continuous labels are binarised correctly.
+        """
+        from proeval.sampler.data import extract_model_predictions
+
+        return extract_model_predictions(self.predictions(data_dir), self.name)
+
+    def embeddings(self, data_dir: Optional[str] = None):
+        """Return pre-computed question embeddings ``(n_samples, d)``.
+
+        Resolves ``<name>_embeddings_*.npy`` by the standard convention.
+        """
+        from proeval.sampler.data import load_embeddings
+
+        return load_embeddings(self.name, data_dir=data_dir or self.data_dir)
+
+    def to_frame(self) -> "pd.DataFrame":  # noqa: F821
+        """Return a DataFrame with ``question`` / ``ground_truth`` columns.
+
+        Returns the cached predictions frame when present (preserving its
+        ``label_<model>`` columns); otherwise builds a minimal frame from the
+        in-memory questions/ground_truths. This is what
+        :class:`~proeval.generator.TopicAwareGenerator` consumes.
+        """
+        if self._predictions_df is not None:
+            return self._predictions_df
+        import pandas as pd
+
+        return pd.DataFrame(
+            {"question": self.questions, "ground_truth": self.ground_truths}
+        )
+
 
 # Internal helpers