diff --git a/.gitignore b/.gitignore index f515cd1..9c94a9b 100644 --- a/.gitignore +++ b/.gitignore @@ -224,4 +224,7 @@ data/checkpoints .python-version # Non-commercial Data -data/toxicchat_* \ No newline at end of file +data/toxicchat_* + +# Local configuration file +CLAUDE.md \ No newline at end of file diff --git a/experiment/README.md b/experiment/README.md index 8609eee..c2b2504 100644 --- a/experiment/README.md +++ b/experiment/README.md @@ -173,6 +173,12 @@ python -m experiment.aggregate_results --results-dir results/ End-to-end pipeline: GMM model selection → encoder training → unified experiment → aggregation. +> [!NOTE] +> The `script/` directory and its `*.sh` wrappers are **not included** in this +> repository. Run the `python -m experiment.*` commands above directly (section +> 6 covers the train + sample pipeline). The variables below document the +> intended wrapper configuration if you recreate it. + ```bash # Run all datasets with defaults bash script/run_performance_estimation_all.sh @@ -227,7 +233,9 @@ python -m experiment.exp_run_all_groups --skip-training --n-runs 10 | `--skip-training` | off | Reuse existing encoder checkpoints | > [!NOTE] -> The shell script `run_all_case_2.sh` wraps this command with pre-configured settings. +> A `run_all_case_2.sh` wrapper for this command is referenced in places but is +> not included in the repository — invoke `python -m experiment.exp_run_all_groups` +> directly. --- @@ -240,8 +248,8 @@ python -m experiment.exp_performance_estimation \ --encoder-path data/checkpoints/encoder_svamp_new_pair.pth \ --n-runs 5 -# 2. Or use the shell script (GMM → encoder training → experiment → aggregate) -bash script/run_performance_estimation_all.sh +# 2. Or run the full train + sample pipeline (GMM → encoder training → sampling) +python -m experiment.exp_run_all_groups --setting new_pair --n-runs 5 # 3. 
Failure discovery python -m experiment.exp_failure_discovery \ diff --git a/proeval/README.md b/proeval/README.md index 70f4bb4..5d023f7 100644 --- a/proeval/README.md +++ b/proeval/README.md @@ -309,9 +309,11 @@ for i in range(5): ## 3. Dataset — Bring Your Own Data -`Dataset` bundles **questions + ground truths + a `DatasetConfig`** in a single -object that the predictor (and, later, the sampler) operates on. Use it whenever -you want to evaluate models on data that isn't already wired into +`Dataset` is the single object that flows through the whole pipeline — +**prediction → sampling → generation**. It bundles **questions + ground truths + +a `DatasetConfig`**, and (when built from a predictions CSV) also exposes the +prediction matrix and embeddings the sampler/generator need. Use it whenever you +want to evaluate or sample on data that isn't already wired into `DATASET_CONFIGS`. ### Constructors @@ -319,10 +321,14 @@ you want to evaluate models on data that isn't already wired into ```python from proeval import Dataset, DATASET_CONFIGS, LLMPredictor -# (a) Built-in: load one of the 9 datasets shipped with ProEval +# (a) Built-in: load one of the 10 built-in datasets from HuggingFace ds = Dataset.from_builtin("svamp") -# (b) From in-memory lists (simplest custom case) +# (b) From a pre-computed predictions CSV — offline, and the bridge to sampling. +# Carries questions/ground_truths AND the label_<model> matrix + embeddings.
+ds = Dataset.from_predictions("svamp") + +# (c) From in-memory lists (simplest custom case) ds = Dataset.from_lists( name="my_yesno", questions=["Is the sky blue?", "Is fire cold?"], @@ -333,7 +339,7 @@ ds = Dataset.from_lists( compare_predictions=lambda p, g: 0.0 if str(p).lower() == g else 1.0, ) -# (c) From a CSV file +# (d) From a CSV file ds = Dataset.from_csv( "my_data.csv", question_col="question", @@ -343,7 +349,35 @@ ds = Dataset.from_csv( ``` If you already have a built-in `DatasetConfig` that fits your scoring needs, -pass it via `config=...` and skip the four eval-function arguments. +pass it via `config=...` and skip the four eval-function arguments. A `config` +is only required for `predict()`; a `Dataset` built purely for sampling can omit +it. + +### Use a Dataset everywhere + +A `Dataset` can be passed directly to the sampler and generator in place of a +dataset-name string — one object carries the data through every stage: + +```python +from proeval import BQPriorSampler, TopicAwareGenerator + +ds = Dataset.from_predictions("svamp") + +# Sampling: equivalent to sample(predictions="svamp", ...), but the Dataset +# also supplies its name (for GMM selection) and cached predictions. +result = BQPriorSampler(noise_variance=0.3).sample( + predictions=ds, target_model="gemini25_flash", budget=50, +) + +# The accessors the sampler relies on are also available directly: +matrix, model_names = ds.prediction_matrix() # (n_samples, n_models), 1=failure +embeddings = ds.embeddings() # (n_samples, d) + +# Generation: pass the Dataset as `df`; its name drives the prompt format. +gen = TopicAwareGenerator(df=ds, prior_u=prior_u, prior_S=prior_S) +``` + +Passing a name string or a raw DataFrame still works exactly as before. 
### Predict @@ -411,6 +445,7 @@ results = predictor.predict_batch_parallel( | GSM8K | `"gsm8k"` | Math problem solving | | SVAMP | `"svamp"` | Math word problems | | MMLU | `"mmlu"` | Multiple choice | +| MMLU (Law) | `"mmlu_professionallaw"` | Multiple choice | | Jigsaw | `"jigsaw"` | Toxicity classification | | ToxicChat | `"toxicchat"` | Toxicity classification | | GQA | `"gqa"` | Visual QA | @@ -626,5 +661,9 @@ python -m experiment.exp_performance_estimation \ ## Available Data Files -The `data/` directory contains pre-computed prediction CSVs and embeddings for: -`gsm8k`, `svamp`, `strategyqa`, `mmlu`, `mmlu_professionallaw`, `jigsaw`, `toxicchat`, `gqa`, `dices`. +The `data/` directory contains pre-computed prediction CSVs and embeddings for +8 datasets: +`gsm8k`, `svamp`, `strategyqa`, `mmlu`, `jigsaw`, `gqa`, `dices`, `dices_t2i`. + +(`mmlu_professionallaw` and `toxicchat` have `DATASET_CONFIGS` entries for +prediction via `LLMPredictor`, but no pre-computed files ship in `data/`.) diff --git a/proeval/__init__.py b/proeval/__init__.py index 25b733d..6667730 100644 --- a/proeval/__init__.py +++ b/proeval/__init__.py @@ -31,9 +31,9 @@ "Is the sky blue?", True, DATASET_CONFIGS["strategyqa"] ) - # Test case generation - gen = TopicAwareGenerator(topics=["arithmetic"], hard_examples=[...]) - case = gen.generate(strategy="hss_gen") + # Test case generation — pass a Dataset (or DataFrame) plus a GP prior + gen = TopicAwareGenerator(df=df, dataset="gsm8k", prior_u=u, prior_S=S) + case = gen.generate(strategy="tss") """ __version__ = "0.1.0" diff --git a/proeval/generator/core.py b/proeval/generator/core.py index 31e6452..0d8a6ee 100644 --- a/proeval/generator/core.py +++ b/proeval/generator/core.py @@ -431,7 +431,9 @@ class TopicAwareGenerator: neutral prior (0.5). No encoder or model predictions needed. Args: - df: Source DataFrame with ``question`` and ``ground_truth`` columns. 
+ df: Source DataFrame with ``question`` and ``ground_truth`` columns, + or a :class:`~proeval.utils.Dataset` (its frame and ``name`` are + used; a non-default *dataset* still overrides the prompt format). dataset: ``"gsm8k"`` or ``"strategyqa"``. api_key: OpenRouter API key (or set ``OPENROUTER_API_KEY`` env var). model: Model to use for generation. @@ -477,6 +479,16 @@ def __init__( ss_threshold: float = 0.0, ss_beta: float = 1.96, ): + # Accept a Dataset in place of a DataFrame: derive the question frame + # and the dataset name from it. An explicit, non-default `dataset` + # still wins (it controls the prompt format). + from proeval.utils.dataset import Dataset + + if isinstance(df, Dataset): + if dataset == "gsm8k": + dataset = df.name + df = df.to_frame() + self.df = df self.dataset = dataset self.client = OpenRouterClient(api_key=api_key) diff --git a/proeval/sampler/bq.py b/proeval/sampler/bq.py index 342c368..40a57b7 100644 --- a/proeval/sampler/bq.py +++ b/proeval/sampler/bq.py @@ -68,6 +68,26 @@ ) +def _resolve_predictions( + predictions: Union[str, "pd.DataFrame", "Dataset"], # noqa: F821 + data_dir: Optional[str] = None, +) -> Tuple["pd.DataFrame", Optional[str]]: + """Normalise the ``predictions`` argument to ``(DataFrame, dataset_name)``. + + Accepts a dataset name (loaded by convention), a pre-loaded DataFrame, or a + :class:`~proeval.utils.Dataset` (uses its cached/lazily-loaded predictions + and its ``name``). ``dataset_name`` is ``None`` for a bare DataFrame, which + disables name-dependent behaviour (GMM selection, DICES label binarisation). + """ + # Local import keeps the sampler import-light and avoids any cycle. 
+ from proeval.utils.dataset import Dataset + + if isinstance(predictions, Dataset): + return predictions.predictions(data_dir=data_dir), predictions.name + if isinstance(predictions, str): + return load_predictions(predictions, data_dir=data_dir), predictions + return predictions, None + # Result container @dataclass @@ -681,7 +701,7 @@ def __init__( def sample( self, - predictions: Union[str, pd.DataFrame], + predictions: Union[str, pd.DataFrame, "Dataset"], # noqa: F821 target_model: Union[int, str] = "gemini25_flash", budget: int = 50, data_dir: str = None, @@ -693,8 +713,10 @@ def sample( """Run BQ active sampling. Args: - predictions: Either a dataset name (e.g., ``"svamp"``) which will be - loaded from ``data_dir``, or a pre-loaded DataFrame. + predictions: A dataset name (e.g. ``"svamp"``) loaded from + ``data_dir``, a pre-loaded DataFrame, or a + :class:`~proeval.utils.Dataset` (its ``name`` is used for GMM + selection and label binarisation). target_model: Index or name of the model to target for testing. budget: Number of samples to acquire. data_dir: Directory containing prediction CSVs (default: ``data/``). @@ -715,13 +737,8 @@ def sample( if seed is not None: np.random.seed(seed) - # Load data - if isinstance(predictions, str): - df = load_predictions(predictions, data_dir=data_dir) - dataset_name = predictions - else: - df = predictions - dataset_name = None + # Load data (accepts a dataset name, a DataFrame, or a Dataset) + df, dataset_name = _resolve_predictions(predictions, data_dir) pred_matrix, model_names = extract_model_predictions(df, dataset_name) @@ -1010,7 +1027,7 @@ def __init__( def sample( self, - predictions: Union[str, pd.DataFrame], + predictions: Union[str, pd.DataFrame, "Dataset"], # noqa: F821 target_model: Union[int, str] = "gemini25_flash", budget: int = 50, data_dir: str = None, @@ -1019,7 +1036,8 @@ def sample( """Run BQ active sampling with encoder prior. Args: - predictions: Dataset name or pre-loaded DataFrame. 
+ predictions: Dataset name, pre-loaded DataFrame, or a + :class:`~proeval.utils.Dataset`. target_model: Index or name of the target model. budget: Number of samples to acquire. data_dir: Data directory path. @@ -1031,15 +1049,10 @@ def sample( if seed is not None: np.random.seed(seed) - # Load data to get test_y (the target model's labels) - if isinstance(predictions, str): - df = load_predictions(predictions, data_dir=data_dir) - else: - df = predictions + # Load data to get test_y (accepts a name, DataFrame, or Dataset) + df, dataset_name = _resolve_predictions(predictions, data_dir) - pred_matrix, model_names = extract_model_predictions( - df, predictions if isinstance(predictions, str) else None - ) + pred_matrix, model_names = extract_model_predictions(df, dataset_name) # Resolve target model if isinstance(target_model, str): diff --git a/proeval/utils/dataset.py b/proeval/utils/dataset.py index 7b0a040..fb89caa 100644 --- a/proeval/utils/dataset.py +++ b/proeval/utils/dataset.py @@ -24,13 +24,22 @@ Use one of the constructors to build a :class:`Dataset`: - :meth:`Dataset.from_builtin` — one of the 9 datasets shipped with ProEval - (``svamp``, ``gsm8k``, ``strategyqa``, ...). + (``svamp``, ``gsm8k``, ``strategyqa``, ...). Loads questions/ground_truths + from HuggingFace (requires the ``[datasets]`` extra). +- :meth:`Dataset.from_predictions` — build from a pre-computed + ``<name>_predictions.csv``. Offline, and doubles as the bridge to the + sampler/generator (carries the prediction matrix + embeddings by name). - :meth:`Dataset.from_lists` — pass questions/ground_truths/eval functions directly. The simplest way to bring a custom dataset. - :meth:`Dataset.from_csv` — load questions and ground truths from a CSV. Run predictions with :meth:`Dataset.predict` (or -:meth:`~proeval.evaluator.LLMPredictor.predict_dataset`).
A :class:`Dataset` +can also be passed straight to +:meth:`~proeval.sampler.BQPriorSampler.sample` and +:class:`~proeval.generator.TopicAwareGenerator` — see those for the sampling +side, and :meth:`prediction_matrix` / :meth:`embeddings` for the accessors +they rely on. Example — built-in:: @@ -93,7 +102,10 @@ def __init__( name: str, questions: List[Any], ground_truths: List[Any], - config: DatasetConfig, + config: Optional[DatasetConfig] = None, + *, + data_dir: Optional[str] = None, + predictions_df: Optional["pd.DataFrame"] = None, # noqa: F821 ): if len(questions) != len(ground_truths): raise ValueError( @@ -103,7 +115,15 @@ def __init__( self.name = name self.questions = list(questions) self.ground_truths = list(ground_truths) + #: Scoring config. Required by :meth:`predict`; optional for datasets + #: built only for sampling/generation (e.g. :meth:`from_predictions`). self.config = config + #: Directory holding ``<name>_predictions.csv`` / ``<name>_embeddings_*`` + #: used by the sampling accessors. ``None`` → the package ``data/`` dir. + self.data_dir = data_dir + #: Cached predictions DataFrame, populated when the dataset was built + #: from a predictions CSV. ``None`` → resolved lazily by *name*. + self._predictions_df = predictions_df # Container protocol — supports len(), indexing, iteration. This also # gives the future sampler a uniform interface to operate on. @@ -148,6 +168,54 @@ def from_builtin(cls, name: str) -> "Dataset": config=DATASET_CONFIGS[name], ) + @classmethod + def from_predictions( + cls, + name: str, + data_dir: Optional[str] = None, + config: Optional[DatasetConfig] = None, + ) -> "Dataset": + """Build a Dataset from a pre-computed predictions CSV. + + Loads ``<data_dir>/<name>_predictions.csv`` (the same file the sampler + consumes) and uses its ``question`` / ``ground_truth`` columns. The + loaded frame is cached, so the sampling accessors + (:meth:`predictions`, :meth:`prediction_matrix`, :meth:`embeddings`) + resolve without re-reading the file.
+ + This is the offline bridge between evaluation and sampling: the + resulting :class:`Dataset` can be passed directly to + ``LLMPredictor`` (if a *config* is available), to + :meth:`~proeval.sampler.BQPriorSampler.sample`, and to + :class:`~proeval.generator.TopicAwareGenerator`. + + Args: + name: Dataset name, e.g. ``"svamp"``. Resolves the CSV by the + standard ``<name>_predictions.csv`` convention. + data_dir: Directory holding the CSV. ``None`` → package ``data/``. + config: Scoring config. ``None`` → ``DATASET_CONFIGS[name]`` when + *name* is a built-in dataset, otherwise left unset (sampling + still works; :meth:`predict` will require a config). + """ + from proeval.sampler.data import load_predictions + + df = load_predictions(name, data_dir=data_dir) + for col in ("question", "ground_truth"): + if col not in df.columns: + raise ValueError( + f"Predictions CSV for {name!r} is missing a {col!r} column; " + f"found {list(df.columns)[:6]}..." + ) + resolved_config = config if config is not None else DATASET_CONFIGS.get(name) + return cls( + name=name, + questions=df["question"].tolist(), + ground_truths=df["ground_truth"].tolist(), + config=resolved_config, + data_dir=data_dir, + predictions_df=df, + ) + @classmethod def from_lists( cls, @@ -265,6 +333,12 @@ def predict( score)`` tuples — the same shape as :meth:`~proeval.evaluator.LLMPredictor.predict_batch_parallel`. """ + if self.config is None: + raise ValueError( + f"Dataset {self.name!r} has no scoring config, so it cannot be " + "predicted. Build it with a `config=` (or the four eval " + "functions), or use a built-in dataset." + ) if parallel: return predictor.predict_batch_parallel( self.questions, @@ -282,6 +356,63 @@ def predict( show_progress=show_progress, ) + # Sampling data accessors + # + # These bridge the Dataset to the sampler/generator. They resolve the + # pre-computed prediction CSV / embeddings by *name* (the predictions frame + # is pre-cached by from_predictions; otherwise it is loaded on first use).
+ + def predictions(self, data_dir: Optional[str] = None) -> "pd.DataFrame": # noqa: F821 + """Return the predictions DataFrame (``label_<model>`` columns). + + Uses the cached frame when available (and no *data_dir* override is given), otherwise loads + ``<name>_predictions.csv`` by convention. + """ + if self._predictions_df is not None and data_dir is None: + return self._predictions_df + from proeval.sampler.data import load_predictions + + df = load_predictions(self.name, data_dir=data_dir or self.data_dir) + if data_dir is None: + self._predictions_df = df + return df + + def prediction_matrix(self, data_dir: Optional[str] = None): + """Return ``(prediction_matrix, model_names)`` for this dataset. + + Thin wrapper over + :func:`~proeval.sampler.data.extract_model_predictions` that passes + *name* so DICES-style continuous labels are binarised correctly. + """ + from proeval.sampler.data import extract_model_predictions + + return extract_model_predictions(self.predictions(data_dir), self.name) + + def embeddings(self, data_dir: Optional[str] = None): + """Return pre-computed question embeddings ``(n_samples, d)``. + + Resolves ``<name>_embeddings_*.npy`` by the standard convention. + """ + from proeval.sampler.data import load_embeddings + + return load_embeddings(self.name, data_dir=data_dir or self.data_dir) + + def to_frame(self) -> "pd.DataFrame": # noqa: F821 + """Return a DataFrame with ``question`` / ``ground_truth`` columns. + + Returns the cached predictions frame when present (preserving its + ``label_<model>`` columns); otherwise builds a minimal frame from the + in-memory questions/ground_truths. This is what + :class:`~proeval.generator.TopicAwareGenerator` consumes. + """ + if self._predictions_df is not None: + return self._predictions_df + import pandas as pd + + return pd.DataFrame( + {"question": self.questions, "ground_truth": self.ground_truths} + ) + # Internal helpers