From b6be787cc860feed1c1b552228b5eedb0f6645bc Mon Sep 17 00:00:00 2001 From: voorhs Date: Sat, 23 May 2026 19:31:59 +0300 Subject: [PATCH 01/16] add spec --- compute-feasibility-advisor-proposal.md | 201 ++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 compute-feasibility-advisor-proposal.md diff --git a/compute-feasibility-advisor-proposal.md b/compute-feasibility-advisor-proposal.md new file mode 100644 index 000000000..2560d1279 --- /dev/null +++ b/compute-feasibility-advisor-proposal.md @@ -0,0 +1,201 @@ +# Compute Feasibility Advisor for AutoIntent + +- **Date:** 2026-05-23 +- **Status:** Proposal (pre-implementation) +- **Audience:** AutoIntent maintainers / contributor picking up the task + +## Problem + +AutoIntent's main strength is letting a user kick off a full search-space optimization with one call: + +```python +pipeline = Pipeline.from_preset("transformers-heavy") +pipeline.fit(dataset) +``` + +The cost of that convenience is that users — especially those running on a laptop, a single consumer GPU, or a free cloud instance — cannot tell ahead of time whether their hardware can carry the configuration they have just selected. + +Concrete failure cases we see today: + +- `transformers-heavy` fine-tunes `microsoft/deberta-v3-large` for up to 30 epochs across 40 HPO trials. That needs ~12–18 GB VRAM (full fine-tune, fp32) and many hours of wall time on a single GPU. A user with an 8 GB card finds out by OOM, often several minutes into a run. +- Swapping `intfloat/multilingual-e5-large-instruct` (2 GB) for `sentence-transformers/all-MiniLM-L6-v2` (90 MB) changes the resource bill by an order of magnitude — but nothing surfaces this difference up front. +- Disk is a silent failure mode: a search space referencing several large checkpoints can pull >10 GB into the HF cache before any training starts. 
+ +The target audience for this feature is users with limited resources who pick a preset, hit `fit()`, and want to know within a second whether they should change something. + +## Proposed solution: pre-flight resource advisor + +Add a **pre-flight advisor** that, given a parsed search space and a dataset, estimates worst-case disk, RAM, VRAM, and wall-time requirements from public Hugging Face Hub metadata and a small set of formulas, then prints a clear summary with red/yellow/green warnings. By default it is **report-only and never blocks the run**; an opt-in **reduce-to-fit** mode additionally prunes the search space to fit detected hardware. + +### Scope + +The advisor analyses only the **local, model-bearing** modules whose footprint can be derived from HF Hub metadata. Everything else is either trivial or out of band. + + +| Module category | In scope? | Reason | +| -------------------------------------------------------------------------------- | --------- | -------------------------------------------------- | +| `SentenceTransformerEmbeddingConfig` | yes | local transformer, dominant cost on small machines | +| `VllmEmbeddingConfig` | yes | local transformer with extra engine overhead | +| `HFModelConfig`-based scorers (`bert`, `lora`, `ptuning`, `dnnc`, cross-encoder) | yes | the actual heavyweights | +| GCN scorer when configured with a transformer backbone | yes | inherits the backbone cost | +| `OpenaiEmbeddingConfig` | no | no local resources to estimate | +| `HashingVectorizerEmbeddingConfig` | no | trivial cost | +| `knn`, `mlknn`, `linear`, `sklearn`, `catboost`, `description` | no | negligible next to a fine-tune | +| `decision` and `regex` nodes | no | negligible | + + +Rationale: the user's real risk is the heavy transformer-backed modules. A cheap module cannot be the reason a run fails for resource reasons; we don't owe an estimate for it. 
+ +### Inputs + +- The parsed `OptimizationConfig` (search space, HPO config, embedder/transformer configs). +- The training `Dataset` (for `dataset_size` and an approximate token-length distribution). +- Detected local hardware: + - Total / available RAM via `psutil`. + - Free disk on the AutoIntent / HF cache directory via `shutil.disk_usage`. + - Accelerator detection, in priority order: + - **CUDA:** per-GPU VRAM and device name via `torch.cuda`. + - **MPS (Apple Silicon):** detected via `torch.backends.mps.is_available()`. Apple chips use unified memory, so there is no separate VRAM pool — the "VRAM budget" is a fraction of total system RAM. Default budget = 70 % of total RAM (matching the macOS `PYTORCH_MPS_HIGH_WATERMARK_RATIO` default) with the remainder reserved for the OS and other apps. The fraction is exposed as a knob. + - **CPU only:** when neither is available. + +### Output + +A structured estimate plus a human-readable summary printed to the logger. Example: + +``` +Compute feasibility check +───────────────────────── +Available : 8 GB VRAM (NVIDIA RTX 3060), 32 GB RAM, 120 GB free disk +Estimated worst-case requirements for this search space: + Disk : 5.2 GB (3 unique checkpoints) + RAM : ~4 GB + VRAM : ~14 GB ⚠ exceeds available + Time : ~6 h (single-GPU, fp32, rough) + +Drivers of cost: + scoring.bert microsoft/deberta-v3-large full fine-tune × 40 trials × 30 epochs → ~14 GB VRAM, ~5 h + embedder intfloat/multilingual-e5-large-instruct → ~2.2 GB VRAM + +Suggestions: + • Enable mixed precision (fp16/bf16) on the bert scorer + • Reduce batch_size from 64 to 16 or 32 + • Try preset `transformers-light` or `classic-medium` + +These numbers are heuristic upper bounds, not measurements. +``` + +Numbers are reported with honest precision (one significant figure for time, two for memory) and an explicit "estimate, not measurement" disclaimer. + +### Algorithm (proposal, allowed to adjust) + +1. 
**Collect candidates.** Walk the search space; collect every unique `(module_type, model_name, mode)` triple, where `mode ∈ {inference, lora, full-finetune}`. Also collect HPO knobs that drive cost: `n_trials`, `epochs`, `batch_size`, `max_length`, `dtype` (fp16/bf16/fp32). +2. **Resolve checkpoints.** For each unique `model_name`, query HF Hub for safetensors metadata to read parameter count and weight dtype. Fall back to file-size aggregation if safetensors metadata is missing. Fall back to a "unknown — heuristic only" tag with low-confidence labelling if HF Hub is offline or the repo is private. +3. **Apply formulas.** + - **Disk** = sum over unique checkpoints of total file size, plus a small fixed overhead per checkpoint for tokenizers and config. + - **RAM** = max over modules of `params × dtype_bytes + dataset_tokens × 4 bytes`, treated as a loose upper bound for tokenized buffers. + - **VRAM per module:** + - Inference embedder: `params × dtype_bytes × ~1.3` (small constant for activations). + - Full fine-tune (`bert`, GCN backbone, soft-prompt `ptuning`): `params × dtype_bytes × (1 + 1 + 2)` for weights + grads + Adam state, halved when fp16/bf16 mixed precision is configured. + - LoRA: inference VRAM + a small adapter constant. + - Reranker (cross-encoder, `dnnc`): inference VRAM × small factor for the reranking pass. + - **Time per module** = `n_trials × epochs × (dataset_size / batch_size) × per_step_seconds(params, max_length, device_class)`, where `per_step_seconds` is a small static lookup table keyed on coarse device class (`cpu`, `low-gpu`, `mid-gpu`, `high-gpu`, `apple-silicon`) auto-detected from `torch.cuda.get_device_name` or `platform`/`torch.backends.mps`. Total time = sum across modules. MPS time numbers are coarser than CUDA's (one tier for now); we accept that. +4. 
**Compare to detected hardware.** Per-dimension status is green / yellow / red against a configurable headroom (defaults: **red** if estimate > 100 % of available, **yellow** if > 70 %). On MPS, "VRAM" and "RAM" estimates draw from the same physical pool; we compare *the larger of the two* against the unified-memory budget rather than each independently. +5. **Render summary.** Log at INFO. If any dimension is red, emit at WARNING so it shows in non-logging contexts. + +### Failure modes + +- **HF Hub offline or private repo:** fall back to "unknown model — name-pattern heuristic only", explicit low-confidence label, never raise. +- **No accelerator (no CUDA and no MPS):** report VRAM as N/A and mark GPU-only modules as "requires GPU" without estimating a (misleading) CPU wall time. +- **MPS configured but a module is incompatible:** vLLM in particular does not run on MPS. Flag the module as "unsupported on MPS" rather than estimating; do not raise. +- **MPS with CPU fallback ops:** some PyTorch ops fall back to CPU on MPS, inflating system-RAM usage and wall time beyond the heuristic. Note this in the disclaimer; we don't try to model it. +- **vLLM configured but not installed:** still estimate (the VRAM accounting is similar), note that the engine itself has additional overhead not captured. +- **Estimate wildly wrong vs. reality:** always-on disclaimer in the printed summary that these are heuristic upper bounds. + +### Reduce-to-fit mode + +The feasibility check has two modes sharing the same estimation pipeline: + +- **Report mode (default).** Print the summary, return the structured estimate, let the run proceed regardless of severity. +- **Reduce-to-fit mode (opt-in).** Additionally prune the search space to fit detected hardware before the run starts. Same estimates, same comparisons — just one extra step that produces a reduced search space. + +Using the same per-module estimates, the pruner applies three least-destructive steps in order: + +1. 
**Filter discrete-choice hyperparameters.** For lists of cost-driving values (model name, batch size, training epochs), keep only entries whose worst-case estimate fits. +2. **Cap continuous ranges.** For `{low, high}` ranges of cost-driving parameters, lower the upper bound to the largest fitting value. Ranges of non-cost parameters (learning rate, decision thresholds) are not touched. +3. **Drop module variants.** If a module entry has any required hyperparameter with no satisfiable value left, drop that module entry from its node's search space. + +Guard rails: + +- If pruning would leave any node's search space empty, the pruner **raises**. We don't silently produce a non-runnable pipeline, and we don't quietly fall back to report-only — failing loudly is the right contract for a mode whose whole purpose is to make the run feasible. The error message points the user toward a lighter preset. +- Time is not used as a filter — only memory and disk are. Time is still reported. +- Headroom thresholds are intentionally generous to avoid over-pruning and are configurable. + +Alongside the standard estimate, the caller receives a structured description of what was filtered, capped, and dropped, plus the resulting search space and its recomputed (now green) estimate. + +**Drawbacks worth surfacing.** + +- **Silent narrowing of intent.** A search space deliberately written to include heavy/light variants for comparison gets halved. The mode is opt-in for this reason. +- **Over-pruning when our formulas overestimate.** A 30 %-high estimate on a borderline configuration throws away a run that would have succeeded. Generous headroom defaults mitigate; the knob is exposed. +- **Hard failure when nothing fits.** Raising is intentional — silent degradation to report-only would defeat the mode's purpose — but it is a sharper edge than report mode has. +- **Pre-trial only.** The rewrite happens before any HPO trial starts. 
This is fine because the search space is treated as immutable across a study, but worth calling out so nobody tries to make this dynamic later. + +## Alternatives considered and rejected + +### B. Smoke-test calibration + +Run each unique module for one mini-batch / one step before the real fit, measure peak RAM and VRAM with `psutil`, `tracemalloc`, and `torch.cuda.max_memory_allocated`, time the step, and extrapolate to the full search space. + +Rejected because: + +- It **downloads weights just to estimate** — the disk-headroom check we wanted to provide is defeated by the act of performing it. +- It can **OOM while predicting OOM**, exactly on the constrained hardware that is the target audience. +- It adds **seconds to minutes** of wall time before `fit()` does anything, surprising users. +- It needs per-module "tiny run" hooks; not every scorer has a clean "stop after one step" path. +- For OpenAI- or vLLM-served embedders, a smoke test costs real money or starts the engine. +- It is still not accurate: CUDA and CPU cache state, warm-up effects, and similar noise skew a single-step measurement. + +### C. Curated benchmark table + +Ship a JSON in the package with measured VRAM and per-step time for the bundled-preset checkpoints, broken out by hardware class (cpu / mid-gpu / high-gpu) and mode (inference / lora / full-finetune). Fall back to heuristics for unknown checkpoints. + +Rejected because: + +- **Maintenance burden:** every new model added to a preset would need entries across the hardware × precision × mode matrix. +- Numbers **go stale** when `transformers` updates change defaults (attention impl, dtype, gradient checkpointing). +- It still needs the chosen-solution heuristics as a long-tail fallback — so it adds work on top of Option A without replacing it. +- **Confident-but-wrong is worse than honest-but-fuzzy.** A table that says "4 GB on 4090" when the user OOMs at 4.5 GB damages trust more than a clearly-labelled range would. + +### D. 
Layered (A by default, opt-in B, embedded table from C, local actuals cache) + +Combine all three: ship A as the fast path, allow `calibrate=True` to trigger B for heavy modules only, embed a small table from C for the bundled-preset checkpoints, and write actuals from every real run to a local cache that feeds back into future estimates. + +Rejected because: + +- **Implementation surface multiplies:** two estimation code paths to keep consistent, a cache schema with versioning and eviction, two failure modes to document. +- **Discoverability:** users may not learn about `calibrate=True` and the realized value compresses back to roughly Option A anyway. +- The team's bandwidth doesn't justify the marginal accuracy gain over A for the target audience. + +## Comparison + + +| Dimension | A (chosen) | B (smoke-test) | C (benchmark table) | D (layered) | +| -------------------------------- | ------------------------------ | ---------------------- | ---------------------------------- | ------------------------------------- | +| Wall time at pre-flight | < 1 s | seconds–minutes | < 1 s | < 1 s default, s–min when calibrating | +| Accuracy on common checkpoints | medium | high | high | high | +| Accuracy on custom checkpoints | medium | high | medium (fallback) | medium–high | +| Time-estimate quality | low–medium | high | high | high | +| Disk pre-download required | no | yes | no | only when calibrating | +| Risk of OOM during the check | none | real | none | only when calibrating | +| Network usage | 1 cached call per unique model | none beyond normal fit | none | combination | +| Implementation effort | small | large | medium + ongoing benchmark refresh | large + cache infra | +| Ongoing maintenance | low (formulas only) | low | high | high | +| Friendly to offline / air-gapped | with fallback | yes | yes | partial | + + +The chosen solution accepts a real accuracy gap on time and a moderate accuracy gap on VRAM in exchange for the only profile that fits the target 
audience's constraints: zero added wall time, zero added downloads, zero added failure modes, and a small one-time implementation cost. + +## Out of scope (possible follow-ups) + +- Live resource observability during `fit()` (peak RAM / VRAM per trial, abort on overrun). +- A learned calibration cache from real runs to refine estimates over time. + From dceb9854e0dd51171dd42291086266cfb3c269f4 Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 5 Jun 2026 11:05:27 +0300 Subject: [PATCH 02/16] upd tech spec --- compute-feasibility-advisor-proposal.md | 37 +++++++++++++++++++------ 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/compute-feasibility-advisor-proposal.md b/compute-feasibility-advisor-proposal.md index 2560d1279..9e7833e3d 100644 --- a/compute-feasibility-advisor-proposal.md +++ b/compute-feasibility-advisor-proposal.md @@ -3,6 +3,7 @@ - **Date:** 2026-05-23 - **Status:** Proposal (pre-implementation) - **Audience:** AutoIntent maintainers / contributor picking up the task +- **Scope of this document:** technical specification — *what* the advisor estimates and the formulas it uses. Architectural and system-design choices (where the advisor lives in the codebase, how it integrates with the optimizer, the public API surface, file/module layout) are deliberately left to the implementer. 
## Problem @@ -38,13 +39,15 @@ The advisor analyses only the **local, model-bearing** modules whose footprint c | `VllmEmbeddingConfig` | yes | local transformer with extra engine overhead | | `HFModelConfig`-based scorers (`bert`, `lora`, `ptuning`, `dnnc`, cross-encoder) | yes | the actual heavyweights | | GCN scorer when configured with a transformer backbone | yes | inherits the backbone cost | +| `LinearScorer` (sklearn `LogisticRegression` / `LogisticRegressionCV`) | yes | dominant cost on presets with no transformer fine-tune; the CV path multiplies a single fit by ~30 | +| `CatBoostScorer` | yes | dominant cost on presets with no transformer fine-tune; high default `iterations` | | `OpenaiEmbeddingConfig` | no | no local resources to estimate | | `HashingVectorizerEmbeddingConfig` | no | trivial cost | -| `knn`, `mlknn`, `linear`, `sklearn`, `catboost`, `description` | no | negligible next to a fine-tune | +| `knn`, `mlknn`, generic `sklearn` classifiers via `SklearnScorer`, `description` | no | bounded so far below any in-scope module that they cannot plausibly be the bottleneck | | `decision` and `regex` nodes | no | negligible | -Rationale: the user's real risk is the heavy transformer-backed modules. A cheap module cannot be the reason a run fails for resource reasons; we don't owe an estimate for it. +Rationale: the user's real risk is whichever module is the actual bottleneck. On heavy presets that is a transformer fine-tune; on light presets it shifts to `linear` (CV-multiplied) or `catboost` (1000 default iterations × dataset shape). Modules left out of scope are ones whose cost is bounded so far below any in-scope module that they cannot plausibly be the reason a run fails. ### Inputs @@ -88,17 +91,33 @@ Numbers are reported with honest precision (one significant figure for time, two ### Algorithm (proposal, allowed to adjust) -1. 
**Collect candidates.** Walk the search space; collect every unique `(module_type, model_name, mode)` triple, where `mode ∈ {inference, lora, full-finetune}`. Also collect HPO knobs that drive cost: `n_trials`, `epochs`, `batch_size`, `max_length`, `dtype` (fp16/bf16/fp32). -2. **Resolve checkpoints.** For each unique `model_name`, query HF Hub for safetensors metadata to read parameter count and weight dtype. Fall back to file-size aggregation if safetensors metadata is missing. Fall back to a "unknown — heuristic only" tag with low-confidence labelling if HF Hub is offline or the repo is private. -3. **Apply formulas.** - - **Disk** = sum over unique checkpoints of total file size, plus a small fixed overhead per checkpoint for tokenizers and config. - - **RAM** = max over modules of `params × dtype_bytes + dataset_tokens × 4 bytes`, treated as a loose upper bound for tokenized buffers. +1. **Collect candidates.** Walk the search space; collect every unique in-scope module. For transformer-bearing modules the identity is `(module_type, model_name, mode)` with `mode ∈ {inference, lora, full-finetune}`. For `linear` and `catboost` the identity is `(module_type, embedder_name, task_kind)` with `task_kind ∈ {multiclass, multilabel}` — the routing through `LogisticRegressionCV` vs `MultiOutputClassifier`, and CatBoost's per-class trees, both depend on it. Also collect the HPO knobs that drive cost: `n_trials` plus per-module knobs — transformer (`epochs`, `batch_size`, `max_length`, `dtype` ∈ {fp16, bf16, fp32}), `linear` (`cv`, `max_iter`), `catboost` (`iterations`, `depth`, `task_type`, `features_type`). +2. **Resolve checkpoints.** For each unique `model_name`, query HF Hub for safetensors metadata to read parameter count and weight dtype. Fall back to file-size aggregation if safetensors metadata is missing. Fall back to a "unknown — heuristic only" tag with low-confidence labelling if HF Hub is offline or the repo is private. 
`LinearScorer` and `CatBoostScorer` have no checkpoint of their own; they reuse the embedder resolved by this step in their formulas (their cost is parameterised by `embedder_dim`, not parameter count). +3. **Apply formulas.** All values are honest upper bounds; convergence and early stopping often terminate well below them. + - **Disk** = sum over unique downloadable checkpoints of total file size, plus a small fixed overhead per checkpoint for tokenizers and config. `LinearScorer` and `CatBoostScorer` contribute zero (they consume embedder output that is already accounted for upstream). + - **RAM per module:** + - Transformer modules (any mode): `params × dtype_bytes + dataset_tokens × 4 bytes`, treated as a loose upper bound for tokenized buffers. + - `LinearScorer`: `8 × n_samples × embedder_dim` (float64 data matrix — the dominant term) `+ 8 × n_classes × embedder_dim` (coefficients) `+ ~10 × 8 × embedder_dim` (L-BFGS history). + - `CatBoostScorer`: `4 × n_samples × n_features` (data, float32 internally) `+ 4 × n_features × n_bins` (histograms; default `n_bins = 254`) `+ iterations × 2^depth × ~32 bytes` (tree storage). For `features_type ∈ {embedding, both}`, `n_features = embedder_dim`. For `features_type = text`, `n_features` is the BoW vocab discovered at fit; bound with a coarse default (e.g. 50 000) and tag the estimate low-confidence. + - For `linear` and `catboost`, `embedder_dim` is taken from the largest embedder in the same node group — same worst-case stance as the rest of the estimate. - **VRAM per module:** - Inference embedder: `params × dtype_bytes × ~1.3` (small constant for activations). - Full fine-tune (`bert`, GCN backbone, soft-prompt `ptuning`): `params × dtype_bytes × (1 + 1 + 2)` for weights + grads + Adam state, halved when fp16/bf16 mixed precision is configured. - LoRA: inference VRAM + a small adapter constant. - Reranker (cross-encoder, `dnnc`): inference VRAM × small factor for the reranking pass. 
- - **Time per module** = `n_trials × epochs × (dataset_size / batch_size) × per_step_seconds(params, max_length, device_class)`, where `per_step_seconds` is a small static lookup table keyed on coarse device class (`cpu`, `low-gpu`, `mid-gpu`, `high-gpu`, `apple-silicon`) auto-detected from `torch.cuda.get_device_name` or `platform`/`torch.backends.mps`. Total time = sum across modules. MPS time numbers are coarser than CUDA's (one tier for now); we accept that. + - `LinearScorer`: N/A (sklearn is CPU-only). + - `CatBoostScorer`: 0 by default; if `task_type="GPU"` is configured, the RAM formula above lives on device instead. + - **Time per module:** + - Transformer modules: `n_trials × epochs × (dataset_size / batch_size) × per_step_seconds(params, max_length, device_class)`, where `per_step_seconds` is a small static lookup keyed on coarse device class (`cpu`, `low-gpu`, `mid-gpu`, `high-gpu`, `apple-silicon`) auto-detected from `torch.cuda.get_device_name` or `platform`/`torch.backends.mps`. + - `LinearScorer`: `n_trials × C_cpu × n_samples × embedder_dim × max_iter × cv_multiplier × class_multiplier`, where: + - `C_cpu ≈ 1e-8 s` per `(sample × feature × iteration)` on a single modern CPU core. + - `cv_multiplier = Cs × cv + 1 ≈ 31` for the multiclass path (`LogisticRegressionCV` with default `Cs = 10`, repo default `cv = 3`, plus one final refit). `cv_multiplier = 1` for the multilabel path (no inner CV). + - `class_multiplier = n_classes` for the multilabel path (`MultiOutputClassifier` fits one binary LogReg per class); `class_multiplier = 1` otherwise. + - `CatBoostScorer`: `n_trials × iterations × C_device × n_samples × n_features × depth × class_multiplier`, where: + - `C_device ≈ 1e-9 s` on CPU, ~5–20× faster on GPU. Resolve `C_device` via the same `device_class` lookup as the transformer time formula. 
+ - `class_multiplier = n_classes` for both the multiclass `MultiClass` loss (per-class trees per iteration) and the multilabel routing (one CatBoost per class). + - Early stopping is not modelled; `iterations` is treated as the upper bound. + - Total time = sum across modules. MPS time numbers are coarser than CUDA's (one tier for now); we accept that. 4. **Compare to detected hardware.** Per-dimension status is green / yellow / red against a configurable headroom (defaults: **red** if estimate > 100 % of available, **yellow** if > 70 %). On MPS, "VRAM" and "RAM" estimates draw from the same physical pool; we compare *the larger of the two* against the unified-memory budget rather than each independently. 5. **Render summary.** Log at INFO. If any dimension is red, emit at WARNING so it shows in non-logging contexts. @@ -120,7 +139,7 @@ The feasibility check has two modes sharing the same estimation pipeline: Using the same per-module estimates, the pruner applies three least-destructive steps in order: -1. **Filter discrete-choice hyperparameters.** For lists of cost-driving values (model name, batch size, training epochs), keep only entries whose worst-case estimate fits. +1. **Filter discrete-choice hyperparameters.** For lists of cost-driving values (model name, batch size, training epochs, CatBoost `iterations` / `depth`, sklearn `cv`), keep only entries whose worst-case estimate fits. 2. **Cap continuous ranges.** For `{low, high}` ranges of cost-driving parameters, lower the upper bound to the largest fitting value. Ranges of non-cost parameters (learning rate, decision thresholds) are not touched. 3. **Drop module variants.** If a module entry has any required hyperparameter with no satisfiable value left, drop that module entry from its node's search space. 
From 94b4e121a7cad8527ceab66984d2e23086799775 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 10 Jun 2026 02:16:01 +0300 Subject: [PATCH 03/16] add feasibility advisor: CLI script, package, tests; expand proposal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - proposal: introduce 3-phase framing (resource/data/config), add resource-phase refinements (warm cache, n_jobs × VRAM, refit_after, Hub reachability, CatBoost GPU sanity), data-quality phase (token truncation, split readiness, partial descriptions, embedder dim), config sanity phase, updated example output, CLI surface, out-of- scope deferrals - _advisor package: hardware detection (CUDA/MPS/CPU with broken-CUDA fallback), HF Hub metadata + warm-cache probe + offline heuristics, three-phase run_preflight returning structured PreflightReport, text + JSON renderers - autointent-advisor CLI: inspect and recommend subcommands; placeholder dataset stats when no --dataset given - 88 offline tests covering hardware fallbacks, every bundled preset, severity routing, report serialization, name-pattern heuristics, AMP invariant, dump_modules / refit_after, CLI flows Co-Authored-By: Claude Opus 4.7 --- compute-feasibility-advisor-proposal.md | 71 +++- pyproject.toml | 1 + src/autointent/_advisor/__init__.py | 23 ++ src/autointent/_advisor/_cli.py | 243 ++++++++++++++ src/autointent/_advisor/_estimates.py | 382 ++++++++++++++++++++++ src/autointent/_advisor/_hardware.py | 160 +++++++++ src/autointent/_advisor/_hub.py | 183 +++++++++++ src/autointent/_advisor/_render.py | 104 ++++++ src/autointent/_advisor/_report.py | 113 +++++++ tests/advisor/__init__.py | 0 tests/advisor/test_estimates_and_cli.py | 198 +++++++++++ tests/advisor/test_estimates_internals.py | 319 ++++++++++++++++++ tests/advisor/test_hardware_detection.py | 72 ++++ tests/advisor/test_hub_heuristics.py | 81 +++++ tests/advisor/test_render.py | 151 +++++++++ 
tests/advisor/test_report.py | 85 +++++ 16 files changed, 2181 insertions(+), 5 deletions(-) create mode 100644 src/autointent/_advisor/__init__.py create mode 100644 src/autointent/_advisor/_cli.py create mode 100644 src/autointent/_advisor/_estimates.py create mode 100644 src/autointent/_advisor/_hardware.py create mode 100644 src/autointent/_advisor/_hub.py create mode 100644 src/autointent/_advisor/_render.py create mode 100644 src/autointent/_advisor/_report.py create mode 100644 tests/advisor/__init__.py create mode 100644 tests/advisor/test_estimates_and_cli.py create mode 100644 tests/advisor/test_estimates_internals.py create mode 100644 tests/advisor/test_hardware_detection.py create mode 100644 tests/advisor/test_hub_heuristics.py create mode 100644 tests/advisor/test_render.py create mode 100644 tests/advisor/test_report.py diff --git a/compute-feasibility-advisor-proposal.md b/compute-feasibility-advisor-proposal.md index 9e7833e3d..7ebf70bd9 100644 --- a/compute-feasibility-advisor-proposal.md +++ b/compute-feasibility-advisor-proposal.md @@ -49,6 +49,16 @@ The advisor analyses only the **local, model-bearing** modules whose footprint c Rationale: the user's real risk is whichever module is the actual bottleneck. On heavy presets that is a transformer fine-tune; on light presets it shifts to `linear` (CV-multiplied) or `catboost` (1000 default iterations × dataset shape). Modules left out of scope are ones whose cost is bounded so far below any in-scope module that they cannot plausibly be the reason a run fails. +### Phases + +The advisor is one entry point, but internally splits work into three phases that share a single `PreflightReport` object. The split is internal organization — all three run at the same hook point (after `validate_modules`, before `_fit(context)`) and the user sees one summary. 
Separating them keeps each phase's inputs, formulas, and failure modes scoped: + +- **Resource phase.** Disk / RAM / VRAM / wall-time estimates and comparisons against detected hardware. Most of the formulas in this document live here. This is the only phase consumed by the reduce-to-fit pruner. +- **Data quality phase.** Findings derived from the dataset jointly with the active search space — token-length truncation, split readiness (auto-invokes the existing `check_split_readiness` utility rather than re-implementing it), partial intent descriptions paired with the `description` scorer, embedder/scorer dimension consistency. Reports red/yellow lines but never prunes the search space; the user fixes the dataset or the config. +- **Configuration sanity phase.** Joint checks across dataset + search-space + hardware that don't slot cleanly into the other two — e.g., `hpo_config.n_jobs > 1` × per-trial VRAM contention, CatBoost `task_type="GPU"` with no CUDA. Pydantic schema validation already runs upstream on `OptimizationConfig`; this phase only adds checks that need joint inspection. + +The advisor consumes `validate_modules`'s *post-filter* view of `self.nodes` — it does not duplicate that mutating filter. + ### Inputs - The parsed `OptimizationConfig` (search space, HPO config, embedder/transformer configs). @@ -68,12 +78,19 @@ A structured estimate plus a human-readable summary printed to the logger. 
Examp ``` Compute feasibility check ───────────────────────── -Available : 8 GB VRAM (NVIDIA RTX 3060), 32 GB RAM, 120 GB free disk -Estimated worst-case requirements for this search space: - Disk : 5.2 GB (3 unique checkpoints) +Resource: + Available : 8 GB VRAM (NVIDIA RTX 3060), 32 GB RAM, 120 GB free disk + Disk : 5.2 GB to download, 1.1 GB already cached (3 unique checkpoints) RAM : ~4 GB - VRAM : ~14 GB ⚠ exceeds available - Time : ~6 h (single-GPU, fp32, rough) + VRAM : ~14 GB × 2 parallel trials (n_jobs=2) ⚠ exceeds available + Time : ~6 h (+~12 min for refit_after) (single-GPU, fp32, rough) + +Data: + Train tokens p95 : 612 (exceeds bert.max_length=512) ⚠ ~7% truncated + Split readiness : 2 classes have <3 samples — LogisticRegressionCV cv=3 will fail ✗ + +Config: + CatBoost task_type=GPU but no CUDA detected — will fall back to CPU ⚠ Drivers of cost: scoring.bert microsoft/deberta-v3-large full fine-tune × 40 trials × 30 epochs → ~14 GB VRAM, ~5 h @@ -82,6 +99,7 @@ Drivers of cost: Suggestions: • Enable mixed precision (fp16/bf16) on the bert scorer • Reduce batch_size from 64 to 16 or 32 + • Set hpo_config.n_jobs=1 — parallel trials are doubling VRAM demand • Try preset `transformers-light` or `classic-medium` These numbers are heuristic upper bounds, not measurements. @@ -121,6 +139,34 @@ Numbers are reported with honest precision (one significant figure for time, two 4. **Compare to detected hardware.** Per-dimension status is green / yellow / red against a configurable headroom (defaults: **red** if estimate > 100 % of available, **yellow** if > 70 %). On MPS, "VRAM" and "RAM" estimates draw from the same physical pool; we compare *the larger of the two* against the unified-memory budget rather than each independently. 5. **Render summary.** Log at INFO. If any dimension is red, emit at WARNING so it shows in non-logging contexts. 
+#### Resource-phase refinements + +These adjust the formulas above for situations that look fine in single-trial isolation but blow up in practice: + +- **Cold-vs-warm HF cache (Tier 1).** Before reporting disk, probe each unique `model_name` against the local HF cache via `huggingface_hub.try_to_load_from_cache` / `scan_cache_dir`, keyed off `HF_HOME`. Split the disk line into `to_download` vs `already_cached`. Treat a repo as cached only if the weight shard (`model.safetensors` or equivalent) is present — not just config/tokenizer files. Without this, a repeated run on the same machine alarms the user about gigabytes they already have. +- **Concurrent-trial × per-trial VRAM (Tier 1).** Multiply the per-trial VRAM estimate by `hpo_config.n_jobs` when `n_jobs > 1` and the active accelerator is GPU. Same for the `dump_modules=True` path on disk: each trial writes module weights to the dump dir, so multiply per-module dump-disk by `n_trials`. vLLM is process-isolated and its contention model differs; note this in the disclaimer. +- **`refit_after=True` time delta (Tier 2).** When `Pipeline.fit(refit_after=True)`, add one full-data training pass per node to the time estimate. Small term but easy to forget; users running close to their time budget care about it. +- **HF Hub reachability probe (Tier 2).** One up-front `HfApi().whoami()` (or unauthenticated `HEAD` to `huggingface.co`) at the start of the phase. On failure, consistently downgrade *all* model entries to the "unknown — heuristic only" path instead of timing out per-model 10× on a 10-model search space. +- **CatBoost `task_type="GPU"` sanity (Tier 2).** When CatBoost is in the search space with `task_type="GPU"` but `torch.cuda.is_available()` is false, tag yellow — CatBoost silently falls back to CPU and the user otherwise sees CPU speeds with no warning. + +### Data quality phase + +The resource phase predicts whether the run *fits*. 
The data quality phase predicts whether the run *produces a meaningful result*. Both are caught at the same hook point because both have the same failure mode from the user's perspective: hours of compute followed by a cryptic error or a silently degraded model. + +- **Token-length truncation (Tier 1).** Sample ~1000 utterances from the train split, tokenize against each unique transformer's tokenizer, compute `p95_tokens` and `% truncated` against the module's `max_length`. Yellow when >1% truncated; red when >10%. Reuse the tokenizer the resource phase already loaded for parameter-count resolution — don't double-fetch. The existing pipeline silently truncates (sentence-transformers and the HF Trainer both default to `truncation=True`); there is no warning anywhere today. +- **Auto-invoke `check_split_readiness` (Tier 1).** Call the existing utility at `context/data_handler/_readiness_util.py:44–109` with the active `data_config` and surface its `SplitReadinessResult` — it already returns `underpopulated_classes`, `ready`, and a `reason` string, but is not called anywhere from `Pipeline.fit()` today. When `LinearScorer` with CV is in the search space and any class has `n < cv`, name the failing module explicitly in the red line ("`LogisticRegressionCV` cv=3 will fail: classes [X, Y] have <3 samples") rather than emitting a generic split-readiness message. +- **Partial intent descriptions × `description` scorer (Tier 1).** The dataset constructor already warns once at import when *some* but not all intents have descriptions (`_dataset/_dataset.py:199–207`). The advisor escalates this to red when the `description` scorer is also present in the active search space — otherwise the run will produce NaN embeddings for the missing intents. Action message: "fill in N missing descriptions", not "drop the scorer".
+- **Embedder ↔ scorer dimension consistency (Tier 2).** For `LinearScorer` / `CatBoostScorer` with `features_type="both"`, verify the embedder reachable from the same node group exposes a stable, expected dimension. Cross-node walk; surface as yellow when the resolved dimension cannot be confirmed pre-flight. + +### Configuration sanity phase + +Pydantic schema validation on `OptimizationConfig` runs upstream at config-load time; this phase only adds checks that require *joint* inspection of dataset + search-space + hardware. With Tier 1 + Tier 2 in scope today, this phase holds two items: + +- The `n_jobs × VRAM` callout, surfaced jointly with the resource phase (single line in the rendered output). +- The CatBoost `task_type="GPU"` without CUDA check, same. + +Both could live entirely in the resource phase; they get their own phase because future additions — joint scorer↔decision shape checks, OOS-support mismatches detected up front rather than at module instantiation, embedder-dimension mismatches — slot here naturally. Keep the phase scaffold even if it is currently thin. + ### Failure modes - **HF Hub offline or private repo:** fall back to "unknown model — name-pattern heuristic only", explicit low-confidence label, never raise. @@ -137,6 +183,8 @@ The feasibility check has two modes sharing the same estimation pipeline: - **Report mode (default).** Print the summary, return the structured estimate, let the run proceed regardless of severity. - **Reduce-to-fit mode (opt-in).** Additionally prune the search space to fit detected hardware before the run starts. Same estimates, same comparisons — just one extra step that produces a reduced search space. +Reduce-to-fit consumes only the **resource phase** output. Data-quality and config-sanity findings are reported but never trigger pruning — they require user action (fix the dataset, change a config flag), not search-space narrowing. 
+ Using the same per-module estimates, the pruner applies three least-destructive steps in order: 1. **Filter discrete-choice hyperparameters.** For lists of cost-driving values (model name, batch size, training epochs, CatBoost `iterations` / `depth`, sklearn `cv`), keep only entries whose worst-case estimate fits. @@ -158,6 +206,15 @@ Alongside the standard estimate, the caller receives a structured description of - **Hard failure when nothing fits.** Raising is intentional — silent degradation to report-only would defeat the mode's purpose — but it is a sharper edge than report mode has. - **Pre-trial only.** The rewrite happens before any HPO trial starts. This is fine because the search space is treated as immutable across a study, but worth calling out so nobody tries to make this dynamic later. +### CLI surface + +The advisor is also exposed as a console script (`autointent-advisor`) so users can answer "what will this cost?" and "what should I run?" without writing Python. Two subcommands: + +- **`autointent-advisor inspect TARGET`.** `TARGET` is a bundled preset name or a path to a YAML config. Resolves the preset (or a user-supplied `OptimizationConfig`), detects local hardware, runs the same three-phase advisor that `Pipeline.fit()` runs, and prints the same report. Accepts `--dataset` for a real dataset, or `--n-samples / --n-classes / --avg-tokens` placeholders when the dataset is not yet built — so the script is useful before any training data exists. `--json` emits the structured `PreflightReport` for scripting. +- **`autointent-advisor recommend [--n-samples ... | --dataset ...] [--budget-time-h 12] [--budget-vram-gb 8]`.** Detects local hardware (with manual overrides applied), iterates over the bundled presets in `_presets/`, and tags each as `feasible` / `feasible-with-reduce` / `infeasible`. Ranks feasible presets by quality tier (`heavy > medium > light`) then estimated wall-time; picks the top one as the recommendation.
For the heaviest infeasible preset, surfaces the single most-impactful knob change that would make it fit (e.g., "`transformers-heavy` would fit if `batch_size` ≤ 16 and `dtype=fp16`"), reusing the reduce-to-fit pruner's per-knob delta info. + +**Constraints (both subcommands).** No model downloads — only HF Hub metadata endpoints (`HfApi().model_info`); never `from_pretrained`. Offline-safe — on Hub unreachability, fall back to the same "heuristic only" path and mark the report low-confidence; do not raise. Hardware-detection failures (broken CUDA install where `torch.cuda.mem_get_info()` raises) fall back to CPU detection and tag the report rather than crashing. + ## Alternatives considered and rejected ### B. Smoke-test calibration @@ -217,4 +274,8 @@ The chosen solution accepts a real accuracy gap on time and a moderate accuracy - Live resource observability during `fit()` (peak RAM / VRAM per trial, abort on overrun). - A learned calibration cache from real runs to refine estimates over time. +- **Determinism / `cudnn.deterministic` check.** Belongs in seed-setting code (`set_seed` utility, `Pipeline.__init__`), not in a feasibility advisor — reproducibility is not a hardware-budget question. +- **OpenAI / Generator token-cost ($) estimation.** Real value, but pricing tables age badly, the `StructuredOutputCache` hit rate is unknowable upfront, and the API-paying audience overlaps poorly with this advisor's stated audience (resource-constrained local users). Push to a separate `cost_estimator` tool. +- **Predictive CO₂ / emissions.** `_callbacks/emissions_tracker.py` already does this retrospectively, accurately. A predictive version multiplies our (loose) time estimate by a regional kWh/CO₂ factor — two sources of imprecision compounded. The retrospective number is the trustworthy one. 
+- **vLLM startup compile time.** Minutes of overhead before any work, but vLLM is unsupported on MPS, isn't the dominant cost on CUDA once running, and modelling it needs a startup-time lookup table. Note once in the disclaimer; do not model. diff --git a/pyproject.toml b/pyproject.toml index 202a8cb3f..b47993faf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -141,6 +141,7 @@ Documentation = "https://deeppavlov.github.io/AutoIntent/" [project.scripts] "basic-aug" = "autointent.generation.utterances.basic.cli:main" "evolution-aug" = "autointent.generation.utterances.evolution.cli:main" +"autointent-advisor" = "autointent._advisor._cli:main" [build-system] requires = ["uv_build>=0.8.7,<0.9.0"] diff --git a/src/autointent/_advisor/__init__.py b/src/autointent/_advisor/__init__.py new file mode 100644 index 000000000..5f29b028e --- /dev/null +++ b/src/autointent/_advisor/__init__.py @@ -0,0 +1,23 @@ +"""Pre-flight compute feasibility advisor. + +Exposes a small surface used by both ``Pipeline.fit()`` (future integration) and +the ``autointent-advisor`` CLI script. See ``compute-feasibility-advisor-proposal.md`` +at the repo root for the design document. +""" + +from __future__ import annotations + +from ._hardware import HardwareProfile, detect_hardware +from ._report import DatasetStats, Finding, PreflightReport, ResourceEstimate, Severity +from ._estimates import run_preflight + +__all__ = [ + "DatasetStats", + "Finding", + "HardwareProfile", + "PreflightReport", + "ResourceEstimate", + "Severity", + "detect_hardware", + "run_preflight", +] diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py new file mode 100644 index 000000000..4e7eae000 --- /dev/null +++ b/src/autointent/_advisor/_cli.py @@ -0,0 +1,243 @@ +"""Console-script entry point for the pre-flight advisor. + +Two subcommands: + +* ``inspect`` — show what a given preset / config will cost on this machine. 
def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats:
    """Derive ``DatasetStats`` from a real dataset, never raising on failure.

    A ``path`` ending in ``.json`` is loaded with ``Dataset.from_json``; anything
    else is passed to ``Dataset.from_hub``. Placeholder stats are returned when
    ``autointent.Dataset`` cannot be imported, when loading fails, or when the
    dataset exposes no split at all.

    Token figures are whitespace-split word counts over at most the first 1000
    utterances of the chosen split; no tokenizer is involved.

    Args:
        path: JSON file path or hub id of the dataset.
        multilabel: task type used for placeholders and as the fallback when the
            dataset has no ``multilabel`` attribute.

    Returns:
        Stats computed from the ``train`` split, or from the first available
        split when ``train`` is missing or falsy.
    """
    try:
        from autointent import Dataset
    except ImportError:
        logger.warning("autointent.Dataset unavailable; falling back to placeholders.")
        return DatasetStats.placeholder(multilabel=multilabel)

    try:
        if path.endswith(".json"):
            ds = Dataset.from_json(path)
        else:
            ds = Dataset.from_hub(path)
    except Exception as err:  # noqa: BLE001
        logger.warning("Failed to load dataset %s: %s", path, err)
        return DatasetStats.placeholder(multilabel=multilabel)

    train = ds.get("train") or next(iter(ds.values()), None)
    if train is None:
        return DatasetStats.placeholder(multilabel=multilabel)

    cap = 1000
    utt_col = getattr(ds, "utterance_feature", "utterance")
    sample = train[:cap] if len(train) > cap else train[:]
    word_counts = [len(str(utterance).split()) for utterance in sample.get(utt_col, [])]

    if word_counts:
        avg_tokens = int(sum(word_counts) / max(1, len(word_counts)))
        p95 = sorted(word_counts)[int(len(word_counts) * 0.95)]
    else:
        # nothing to measure: same defaults as the placeholder path
        avg_tokens = 32
        p95 = avg_tokens * 2

    return DatasetStats(
        n_samples=len(train),
        n_classes=getattr(ds, "n_classes", 0) or 0,
        avg_tokens=avg_tokens,
        p95_tokens=p95,
        multilabel=getattr(ds, "multilabel", multilabel),
        has_descriptions=getattr(ds, "has_descriptions", None),
        source=f"dataset:{path}",
    )
def cmd_recommend(args: argparse.Namespace) -> int:
    """Evaluate every bundled preset on this machine and recommend the best fit.

    Each name in ``BUNDLED_PRESETS`` is loaded and passed through
    ``run_preflight``; presets that fail to load are skipped (logged at DEBUG).
    Feasible presets are ranked by ``_QUALITY_TIER`` (higher first), then by
    estimated wall-time, then by name, and the first one is recommended.

    Args:
        args: parsed CLI namespace. Reads ``budget_vram_gb``, ``budget_time_h``
            and ``json``, plus the dataset / placeholder options consumed by
            ``_stats_from_args``.

    Returns:
        Process exit code: 0 when a preset was recommended, 1 when none is feasible.
    """
    from autointent.utils import load_preset

    from ._report import Severity

    hardware = detect_hardware(vram_budget_gb=args.budget_vram_gb)
    stats = _stats_from_args(args)

    results: list[tuple[str, PreflightReport]] = []
    for preset in BUNDLED_PRESETS:
        try:
            cfg = load_preset(preset)  # type: ignore[arg-type]
        except Exception as e:  # noqa: BLE001
            logger.debug("Skipping preset %s: %s", preset, e)
            continue
        report = run_preflight(cfg, stats, hardware, preset_name=preset)
        if args.budget_time_h is not None and report.resource.time_hours > args.budget_time_h:
            # An explicit wall-time ceiling is a hard constraint, so exceeding it is RED.
            # The previous code re-used the report's current worst severity, which left
            # an otherwise-green preset feasible despite blowing the budget.
            # NOTE(review): assumes PreflightReport.is_feasible turns False once a RED
            # finding is recorded — confirm against _report.py.
            report.add(
                "resource",
                Severity.RED,
                f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {args.budget_time_h} h.",
            )
        results.append((preset, report))

    feasible = [(name, r) for name, r in results if r.is_feasible]
    feasible.sort(
        key=lambda pair: (-_QUALITY_TIER.get(pair[0], 0), pair[1].resource.time_hours, pair[0])
    )
    chosen = feasible[0][0] if feasible else None

    if args.json:
        import json

        out = {
            "chosen": chosen,
            "results": [{"preset": name, "report": r.to_dict()} for name, r in results],
        }
        sys.stdout.write(json.dumps(out, indent=2, default=str))
        sys.stdout.write("\n")
    else:
        sys.stdout.write(render_recommendation(results, chosen))
        sys.stdout.write("\n")
        if chosen:
            # follow the ranking table with the full report of the winner
            sys.stdout.write("\n")
            sys.stdout.write(render_text(feasible[0][1]))
            sys.stdout.write("\n")
    return 0 if chosen else 1
def main(argv: list[str] | None = None) -> int:
    """Entry point of the ``autointent-advisor`` console script.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code returned by the selected subcommand handler.
    """
    args = build_parser().parse_args(argv)
    # quiet by default; -v/--verbose switches to debug output
    log_level = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level, format="%(levelname)s %(name)s: %(message)s")
    handler = args.func
    return handler(args)
+_PER_STEP_BASELINE_S = { + "cpu": 0.5, + "low-gpu": 0.04, + "mid-gpu": 0.02, + "high-gpu": 0.01, + "apple-silicon": 0.08, +} + +TRANSFORMER_SCORER_MODULES = {"bert", "lora", "ptuning", "dnnc"} + + +def _extract_model_names(module_entry: dict[str, Any]) -> list[str]: + """Pull model name(s) from a search-space module entry.""" + candidates: list[str] = [] + cfg = module_entry.get("classification_model_config") + if isinstance(cfg, list): + for c in cfg: + if isinstance(c, dict) and c.get("model_name"): + candidates.append(c["model_name"]) + elif isinstance(cfg, dict) and cfg.get("model_name"): + candidates.append(cfg["model_name"]) + embedder_cfg = module_entry.get("embedder_config") + if isinstance(embedder_cfg, list): + for c in embedder_cfg: + if isinstance(c, dict) and c.get("model_name"): + candidates.append(c["model_name"]) + elif isinstance(embedder_cfg, dict) and embedder_cfg.get("model_name"): + candidates.append(embedder_cfg["model_name"]) + return candidates + + +def _max_int(value: Any, default: int) -> int: + if value is None: + return default + if isinstance(value, list) and value: + return max(int(x) for x in value) + if isinstance(value, dict): + return int(value.get("high", default)) + try: + return int(value) + except (TypeError, ValueError): + return default + + +def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dict[str, Any]]]: + """Yield (node_type, module_entry) pairs.""" + for node in search_space or []: + node_type = node.get("node_type", "?") + for entry in node.get("search_space", []) or []: + yield node_type, entry + + +def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) -> float: + """VRAM in GB for one trial of a transformer-based module. + + Conservative AMP accounting (the proposal flags the prior naive halving + as too generous; keep optimizer state at fp32 even in AMP). 
def _classify_severity(estimate: float, budget: float) -> Severity:
    """Map an estimate-versus-budget comparison onto a traffic-light severity.

    Args:
        estimate: estimated demand for one resource dimension.
        budget: available capacity for that dimension, in the same unit. A
            non-positive value means the capacity is absent or was not detected.

    Returns:
        ``GREEN`` when nothing is demanded or the ratio is below ``_YELLOW``;
        ``YELLOW`` when the ratio reaches ``_YELLOW``, or when something is
        demanded but the budget is non-positive; ``RED`` when the ratio
        reaches ``_RED``.
    """
    if estimate <= 0:
        # Zero demand always fits, even when the budget is zero or unknown
        # (e.g. a search space with no transformer modules on a CPU-only machine).
        return Severity.GREEN
    if budget <= 0:
        # something is needed but capacity is missing or unmeasurable: warn, don't block
        return Severity.YELLOW
    ratio = estimate / budget
    if ratio >= _RED:
        return Severity.RED
    if ratio >= _YELLOW:
        return Severity.YELLOW
    return Severity.GREEN
isinstance(embedder_cfg, dict) else None + if global_embedder: + seen_models[global_embedder] = resolve_model(global_embedder) + + for node_type, entry in _walk_modules(config.get("search_space") or []): + module = entry.get("module_name", "?") + model_names = _extract_model_names(entry) + if not model_names and global_embedder and module in {"linear", "catboost", "knn", "mlknn"}: + model_names = [global_embedder] + + for name in model_names: + meta = seen_models.setdefault(name, resolve_model(name)) + + mixed_precision = entry.get("dtype") in {"fp16", "bf16"} + if module == "bert": + mode = "full-finetune" + elif module == "lora": + mode = "lora" + elif module == "dnnc": + mode = "reranker" + elif module == "ptuning": + mode = "full-finetune" + else: + mode = "inference" + + batch_size = _max_int(entry.get("batch_size"), 32) + epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10) + + vram = _vram_for_transformer(meta, mode, mixed_precision) + ram = _ram_for_module(meta, stats) + + time_h = 0.0 + if mode != "inference": + time_h = _time_for_transformer( + meta=meta, + n_trials=n_trials, + epochs=epochs, + batch_size=batch_size, + n_samples=stats.n_samples, + device_class=hardware.device_class, + ) + if refit_after and mode != "inference": + time_h *= 1 + 1.0 / max(1, n_trials) + + estimate.vram_gb = max(estimate.vram_gb, vram) + estimate.ram_gb = max(estimate.ram_gb, ram) + estimate.time_hours += time_h + estimate.drivers.append( + { + "node_type": node_type, + "module": module, + "model": name, + "mode": mode, + "vram_gb": round(vram, 2), + "ram_gb": round(ram, 2), + "time_hours": round(time_h, 2), + "confidence": meta.confidence, + } + ) + + for meta in seen_models.values(): + if meta.cached_locally: + estimate.disk_cached_gb += meta.disk_gb + else: + estimate.disk_download_gb += meta.disk_gb + + if dump_modules: + weights_total = sum(m.weights_gb for m in seen_models.values()) + estimate.disk_dump_gb = weights_total * n_trials + + if 
def _config_phase(
    config: dict[str, Any],
    hardware: HardwareProfile,
    report: PreflightReport,
) -> None:
    """Add configuration-sanity findings that need config and hardware together.

    Two checks, both reported as YELLOW under the ``"config"`` phase:

    * ``hpo_config.n_jobs > 1`` while the accelerator is ``cuda`` or ``mps``;
    * a ``catboost`` module with ``task_type == "GPU"`` while the accelerator
      is not ``cuda``.

    Args:
        config: parsed preset / optimization config dict.
        hardware: detected hardware profile.
        report: report that receives the findings (mutated in place).
    """
    hpo_cfg = config.get("hpo_config") or {}
    n_jobs = int(hpo_cfg.get("n_jobs", 1))

    if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}:
        report.add(
            "config",
            Severity.YELLOW,
            f"hpo_config.n_jobs={n_jobs} on a single GPU multiplies VRAM demand by {n_jobs}×.",
        )

    # one GPU-configured CatBoost entry is enough to trigger the warning
    wants_catboost_gpu = any(
        entry.get("module_name") == "catboost" and entry.get("task_type") == "GPU"
        for _, entry in _walk_modules(config.get("search_space") or [])
    )
    if wants_catboost_gpu and hardware.accelerator != "cuda":
        report.add(
            "config",
            Severity.YELLOW,
            "CatBoost task_type=GPU configured but no CUDA detected — will fall back to CPU.",
        )
def run_preflight(
    config: dict[str, Any],
    stats: DatasetStats,
    hardware: HardwareProfile,
    *,
    preset_name: str | None = None,
) -> PreflightReport:
    """Build a report and run the resource, data and config phases over it.

    Args:
        config: parsed preset / OptimizationConfig dict (top-level keys:
            ``search_space``, ``hpo_config``, optional ``embedder_config``).
        stats: dataset statistics (real or placeholder).
        hardware: detected hardware profile.
        preset_name: optional friendly name for the report header.

    Returns:
        PreflightReport carrying a hardware snapshot, a dataset snapshot, the
        hardware notes, and the findings added by the three phases.
    """
    # hardware snapshot: capacities are rounded to two decimals for display
    hardware_summary: dict[str, Any] = {
        "accelerator": hardware.accelerator,
        "device_name": hardware.device_name,
    }
    for capacity in ("vram_gb", "ram_gb", "free_disk_gb"):
        hardware_summary[capacity] = round(getattr(hardware, capacity), 2)
    hardware_summary["device_class"] = hardware.device_class

    dataset_fields = ("n_samples", "n_classes", "avg_tokens", "p95_tokens", "multilabel", "source")
    dataset_summary = {name: getattr(stats, name) for name in dataset_fields}

    report = PreflightReport(
        preset_name=preset_name,
        hardware=hardware_summary,
        dataset=dataset_summary,
    )
    report.notes.extend(hardware.notes)

    # phase order is fixed: resource first, then data, then config
    _resource_phase(config, stats, hardware, report)
    _data_phase(config, stats, report)
    _config_phase(config, hardware, report)

    return report
+ +Probes CPU / RAM / disk and the highest-priority accelerator available +(CUDA → MPS → CPU). All probes are wrapped to fall back safely on a +broken install (e.g. CUDA driver mismatch) rather than crash the advisor. +""" + +from __future__ import annotations + +import logging +import os +import platform +import shutil +from dataclasses import dataclass, field +from typing import Literal + +logger = logging.getLogger(__name__) + +Accelerator = Literal["cuda", "mps", "cpu"] + +# matches macOS PYTORCH_MPS_HIGH_WATERMARK_RATIO default +MPS_DEFAULT_BUDGET_RATIO = 0.7 + + +@dataclass +class HardwareProfile: + accelerator: Accelerator + device_name: str + vram_gb: float + ram_gb: float + free_disk_gb: float + cpu_count: int + notes: list[str] = field(default_factory=list) + + @property + def device_class(self) -> str: + if self.accelerator == "cpu": + return "cpu" + if self.accelerator == "mps": + return "apple-silicon" + if self.vram_gb >= 24: + return "high-gpu" + if self.vram_gb >= 12: + return "mid-gpu" + return "low-gpu" + + +def _detect_ram_gb() -> float: + try: + import psutil + + return psutil.virtual_memory().total / (1024**3) + except ImportError: + logger.debug("psutil unavailable; RAM unknown") + return 0.0 + + +def _detect_free_disk_gb(path: str | None = None) -> float: + cache = path or os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface") + probe_path = cache if os.path.exists(cache) else os.path.expanduser("~") + try: + usage = shutil.disk_usage(probe_path) + return usage.free / (1024**3) + except OSError as e: + logger.debug("disk usage probe failed at %s: %s", probe_path, e) + return 0.0 + + +def _detect_cuda() -> tuple[float, str] | None: + try: + import torch + + if not torch.cuda.is_available(): + return None + idx = 0 + try: + free, total = torch.cuda.mem_get_info(idx) + vram_gb = total / (1024**3) + except (RuntimeError, AttributeError) as e: + logger.debug("torch.cuda.mem_get_info failed: %s", e) + return None + name = 
torch.cuda.get_device_name(idx) + return vram_gb, name + except ImportError: + return None + except Exception as e: # noqa: BLE001 - protect the advisor from torch quirks + logger.debug("CUDA detection raised: %s", e) + return None + + +def _detect_mps(ram_gb: float, budget_ratio: float = MPS_DEFAULT_BUDGET_RATIO) -> tuple[float, str] | None: + try: + import torch + + if not (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()): + return None + # apple silicon: unified memory; budget is fraction of total RAM + return ram_gb * budget_ratio, f"Apple Silicon ({platform.machine()})" + except ImportError: + return None + except Exception as e: # noqa: BLE001 + logger.debug("MPS detection raised: %s", e) + return None + + +def detect_hardware( + *, + vram_budget_gb: float | None = None, + mps_budget_ratio: float = MPS_DEFAULT_BUDGET_RATIO, +) -> HardwareProfile: + """Detect the local hardware, with optional manual overrides. + + Args: + vram_budget_gb: when set, overrides the detected VRAM (use for + shared-GPU machines where part of the device is taken). + mps_budget_ratio: fraction of total RAM treated as the MPS + "VRAM" budget on Apple Silicon. + + Returns: + HardwareProfile reflecting current machine state. + """ + notes: list[str] = [] + ram_gb = _detect_ram_gb() + free_disk_gb = _detect_free_disk_gb() + cpu_count = os.cpu_count() or 1 + + cuda = _detect_cuda() + if cuda is not None: + vram_gb, device_name = cuda + accel: Accelerator = "cuda" + else: + mps = _detect_mps(ram_gb, mps_budget_ratio) + if mps is not None: + vram_gb, device_name = mps + accel = "mps" + notes.append( + f"MPS unified memory: VRAM budget = {mps_budget_ratio:.0%} of RAM." + ) + else: + vram_gb = 0.0 + device_name = platform.processor() or "cpu" + accel = "cpu" + + if vram_budget_gb is not None: + if vram_gb and vram_budget_gb > vram_gb: + notes.append( + f"Manual --budget-vram-gb={vram_budget_gb} exceeds detected {vram_gb:.1f} GB; using override." 
+ ) + notes.append(f"Using manual VRAM budget: {vram_budget_gb} GB.") + vram_gb = vram_budget_gb + + return HardwareProfile( + accelerator=accel, + device_name=device_name, + vram_gb=vram_gb, + ram_gb=ram_gb, + free_disk_gb=free_disk_gb, + cpu_count=cpu_count, + notes=notes, + ) diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py new file mode 100644 index 000000000..80ccb7133 --- /dev/null +++ b/src/autointent/_advisor/_hub.py @@ -0,0 +1,183 @@ +"""HF Hub metadata lookups + warm-cache probe. + +Memoized per-process. Offline-safe: every probe falls back to a +heuristic value rather than raising. The advisor flips the report's +``low_confidence`` flag when a fallback is taken. +""" + +from __future__ import annotations + +import logging +import os +import re +from dataclasses import dataclass +from functools import lru_cache +from typing import Any + +logger = logging.getLogger(__name__) + +# Coarse heuristic estimates keyed on name fragments. Used only when HF Hub +# is unreachable and we can't get safetensors metadata. Values in millions. 
+_NAME_HEURISTICS = [ + (re.compile(r"(?i)(deberta|roberta|bert).*(xxlarge|huge)"), 1_500), + (re.compile(r"(?i)(deberta|roberta|bert).*xlarge"), 750), + (re.compile(r"(?i)(deberta|roberta|bert).*large"), 350), + (re.compile(r"(?i)e5.*large"), 560), + (re.compile(r"(?i)e5.*small"), 33), + (re.compile(r"(?i)mpnet"), 110), + (re.compile(r"(?i)minilm"), 33), + (re.compile(r"(?i)distil"), 66), + (re.compile(r"(?i)small"), 60), + (re.compile(r"(?i)base"), 110), + (re.compile(r"(?i)large"), 350), +] + + +@dataclass +class ModelMeta: + name: str + params_millions: float + weight_bytes_per_param: int + total_file_bytes: int + cached_locally: bool + confidence: str # "hub" | "heuristic" + + @property + def disk_gb(self) -> float: + return self.total_file_bytes / (1024**3) + + @property + def weights_gb(self) -> float: + return (self.params_millions * 1_000_000 * self.weight_bytes_per_param) / (1024**3) + + +@lru_cache(maxsize=1) +def hub_reachable(timeout_s: float = 2.0) -> bool: + """Single up-front probe. 
Memoized per process.""" + try: + from huggingface_hub import HfApi + + HfApi().list_models(limit=1) + except ImportError: + logger.debug("huggingface_hub not installed; assuming offline") + return False + except Exception as e: # noqa: BLE001 + logger.debug("HF Hub probe failed: %s", e) + return False + else: + return True + + +def _heuristic_params_millions(model_name: str) -> float: + for pattern, m in _NAME_HEURISTICS: + if pattern.search(model_name): + return float(m) + return 110.0 # generic BERT-base default + + +def _is_warm_cached(model_name: str) -> bool: + """True when the weight shard is present in the local HF cache.""" + try: + from huggingface_hub import scan_cache_dir, try_to_load_from_cache + except ImportError: + return False + + weight_files = ["model.safetensors", "pytorch_model.bin", "model.safetensors.index.json"] + for fname in weight_files: + path = try_to_load_from_cache(model_name, fname) + if path is not None and path is not False: + return True + + # sharded models won't match the single-file probe; fall back to a scan + try: + cache = scan_cache_dir() + except Exception as e: # noqa: BLE001 + logger.debug("scan_cache_dir failed: %s", e) + return False + return any(repo.repo_id == model_name for repo in cache.repos) + + +def _hub_metadata(model_name: str) -> ModelMeta | None: + try: + from huggingface_hub import HfApi + except ImportError: + return None + + try: + info = HfApi().model_info(model_name, files_metadata=True) + except Exception as e: # noqa: BLE001 + logger.debug("model_info(%s) failed: %s", model_name, e) + return None + + params_millions = 0.0 + weight_bytes_per_param = 4 + safetensors = getattr(info, "safetensors", None) + if safetensors is not None: + params_total = getattr(safetensors, "total", None) or sum( + getattr(safetensors, "parameters", {}).values() or [0] + ) + if params_total: + params_millions = params_total / 1_000_000 + params_map: dict[str, Any] = getattr(safetensors, "parameters", {}) or {} + if any("F16" 
in k or "BF16" in k for k in params_map): + weight_bytes_per_param = 2 + + total_file_bytes = 0 + for sibling in getattr(info, "siblings", []) or []: + size = getattr(sibling, "size", None) + if size: + total_file_bytes += int(size) + + if params_millions == 0: + params_millions = _heuristic_params_millions(model_name) + + if total_file_bytes == 0: + total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param) + + return ModelMeta( + name=model_name, + params_millions=params_millions, + weight_bytes_per_param=weight_bytes_per_param, + total_file_bytes=total_file_bytes, + cached_locally=_is_warm_cached(model_name), + confidence="hub", + ) + + +def _heuristic_metadata(model_name: str) -> ModelMeta: + params_millions = _heuristic_params_millions(model_name) + weight_bytes_per_param = 4 + total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param) + return ModelMeta( + name=model_name, + params_millions=params_millions, + weight_bytes_per_param=weight_bytes_per_param, + total_file_bytes=total_file_bytes, + cached_locally=_is_warm_cached(model_name), + confidence="heuristic", + ) + + +@lru_cache(maxsize=64) +def resolve_model(model_name: str) -> ModelMeta: + """Resolve metadata for a single model name. Memoized per process. + + Always returns a value — never raises — so the advisor can keep going + on offline machines or for unknown checkpoints. 
+ """ + if model_name.startswith("local:") or os.path.isabs(model_name): + return ModelMeta( + name=model_name, + params_millions=_heuristic_params_millions(model_name), + weight_bytes_per_param=4, + total_file_bytes=0, + cached_locally=True, + confidence="heuristic", + ) + + if hub_reachable(): + meta = _hub_metadata(model_name) + if meta is not None: + return meta + + return _heuristic_metadata(model_name) diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py new file mode 100644 index 000000000..52168aa75 --- /dev/null +++ b/src/autointent/_advisor/_render.py @@ -0,0 +1,104 @@ +"""Rendering for the pre-flight report. + +Text output is grouped by phase (Resource / Data / Config) plus a Drivers +section and the always-on disclaimer. JSON output dumps the structured +report straight through. +""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ._report import PreflightReport + +_SEVERITY_TAG = {"green": "✓", "yellow": "⚠", "red": "✗"} +_PHASE_ORDER = ("resource", "data", "config") +_PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"} + + +def render_text(report: "PreflightReport") -> str: + lines: list[str] = [] + title = "Compute feasibility check" + if report.preset_name: + title += f" — {report.preset_name}" + lines.append(title) + lines.append("─" * len(title)) + + hw = report.hardware + lines.append( + f"Hardware: {hw.get('accelerator', '?')} ({hw.get('device_name', '?')})," + f" {hw.get('vram_gb', 0):.1f} GB VRAM, {hw.get('ram_gb', 0):.0f} GB RAM," + f" {hw.get('free_disk_gb', 0):.0f} GB free disk" + ) + ds = report.dataset + lines.append( + f"Dataset: n_samples={ds.get('n_samples')}, n_classes={ds.get('n_classes')}," + f" avg_tokens={ds.get('avg_tokens')} ({ds.get('source')})" + ) + lines.append("") + + for phase in _PHASE_ORDER: + bucket = [f for f in report.findings if f.phase == phase] + if not bucket: + continue + 
lines.append(f"{_PHASE_LABEL[phase]}:") + for f in bucket: + tag = _SEVERITY_TAG.get(f.severity.value, "·") + lines.append(f" {tag} {f.message}") + lines.append("") + + if report.resource.drivers: + lines.append("Drivers of cost:") + for d in report.resource.drivers[:8]: + lines.append( + f" {d['node_type']}.{d['module']:<10} {d['model']:<48}" + f" {d['mode']:<14} VRAM ~{d['vram_gb']} GB, time ~{d['time_hours']} h" + f" [{d['confidence']}]" + ) + if len(report.resource.drivers) > 8: + lines.append(f" … and {len(report.resource.drivers) - 8} more") + lines.append("") + + if report.notes: + lines.append("Notes:") + for note in report.notes: + lines.append(f" • {note}") + lines.append("") + + summary = f"Verdict: {'feasible' if report.is_feasible else 'INFEASIBLE'} " + summary += f"(worst severity: {report.worst_severity.value})" + if report.low_confidence: + summary += " — low-confidence (heuristic fallback in use)" + lines.append(summary) + lines.append("Note: estimates are heuristic upper bounds, not measurements.") + return "\n".join(lines) + + +def render_json(report: "PreflightReport") -> str: + return json.dumps(report.to_dict(), indent=2, default=str) + + +def render_recommendation( + results: list[tuple[str, "PreflightReport"]], + chosen: str | None, +) -> str: + """Compact table for the ``recommend`` subcommand.""" + lines = ["", "Recommendation:"] + if chosen: + lines.append(f" → {chosen}") + else: + lines.append(" → none of the bundled presets fit your hardware as-is.") + lines.append("") + lines.append(f"{'Preset':<24} {'Status':<14} {'VRAM':<10} {'Time':<10} {'Worst':<8}") + lines.append("-" * 68) + for name, report in results: + verdict = "feasible" if report.is_feasible else "infeasible" + lines.append( + f"{name:<24} {verdict:<14} " + f"{report.resource.vram_gb:>4.1f} GB " + f"{report.resource.time_hours:>4.1f} h " + f"{report.worst_severity.value:<8}" + ) + return "\n".join(lines) diff --git a/src/autointent/_advisor/_report.py 
b/src/autointent/_advisor/_report.py new file mode 100644 index 000000000..0250482a5 --- /dev/null +++ b/src/autointent/_advisor/_report.py @@ -0,0 +1,113 @@ +"""Dataclasses for the pre-flight advisor's structured report.""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from enum import Enum +from typing import Any, Literal + + +class Severity(str, Enum): + GREEN = "green" + YELLOW = "yellow" + RED = "red" + + +Phase = Literal["resource", "data", "config"] + + +@dataclass(frozen=True) +class Finding: + """A single advisor finding rendered as one line in the summary.""" + + phase: Phase + severity: Severity + message: str + metric: str | None = None + + +@dataclass +class ResourceEstimate: + """Aggregated resource numbers across the search space.""" + + disk_download_gb: float = 0.0 + disk_cached_gb: float = 0.0 + disk_dump_gb: float = 0.0 + ram_gb: float = 0.0 + vram_gb: float = 0.0 + time_hours: float = 0.0 + parallel_factor: int = 1 + drivers: list[dict[str, Any]] = field(default_factory=list) + + @property + def total_disk_gb(self) -> float: + return self.disk_download_gb + self.disk_dump_gb + + +@dataclass +class DatasetStats: + """Minimal stats the advisor needs about the user's dataset. + + Built either from a real ``Dataset`` or from CLI placeholder flags. 
+ """ + + n_samples: int + n_classes: int + avg_tokens: int + p95_tokens: int | None = None + multilabel: bool = False + has_descriptions: bool | None = None + rare_classes: list[str] = field(default_factory=list) + source: str = "placeholder" + + @classmethod + def placeholder( + cls, + n_samples: int = 1_000, + n_classes: int = 10, + avg_tokens: int = 32, + multilabel: bool = False, + ) -> "DatasetStats": + return cls( + n_samples=n_samples, + n_classes=n_classes, + avg_tokens=avg_tokens, + p95_tokens=int(avg_tokens * 2.5), + multilabel=multilabel, + ) + + +@dataclass +class PreflightReport: + """One report covering all three phases.""" + + findings: list[Finding] = field(default_factory=list) + resource: ResourceEstimate = field(default_factory=ResourceEstimate) + hardware: dict[str, Any] = field(default_factory=dict) + dataset: dict[str, Any] = field(default_factory=dict) + preset_name: str | None = None + low_confidence: bool = False + notes: list[str] = field(default_factory=list) + + def add(self, phase: Phase, severity: Severity, message: str, metric: str | None = None) -> None: + self.findings.append(Finding(phase=phase, severity=severity, message=message, metric=metric)) + + @property + def worst_severity(self) -> Severity: + order = {Severity.GREEN: 0, Severity.YELLOW: 1, Severity.RED: 2} + if not self.findings: + return Severity.GREEN + return max((f.severity for f in self.findings), key=lambda s: order[s]) + + @property + def is_feasible(self) -> bool: + return self.worst_severity != Severity.RED + + def to_dict(self) -> dict[str, Any]: + d = asdict(self) + d["findings"] = [ + {**asdict(f), "severity": f.severity.value} for f in self.findings + ] + d["worst_severity"] = self.worst_severity.value + d["is_feasible"] = self.is_feasible + return d diff --git a/tests/advisor/__init__.py b/tests/advisor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py 
new file mode 100644 index 000000000..18c2615a6 --- /dev/null +++ b/tests/advisor/test_estimates_and_cli.py @@ -0,0 +1,198 @@ +"""End-to-end smoke tests for the advisor. + +These run offline — HF Hub probes are monkeypatched to fail so the +advisor falls back to its name-pattern heuristics. Verifies that: + +* every bundled preset can be inspected without raising; +* the recommend subcommand picks something on a generous budget and + nothing on a hostile one; +* ``--json`` emits parseable JSON. +""" + +from __future__ import annotations + +import json +import sys + +import pytest + +from autointent._advisor import DatasetStats, HardwareProfile, run_preflight +from autointent._advisor._cli import BUNDLED_PRESETS, main +from autointent.utils import load_preset + + +@pytest.fixture(autouse=True) +def _force_offline(monkeypatch: pytest.MonkeyPatch) -> None: + """Pin the HF Hub probe to "offline" so tests don't hit the network.""" + from autointent._advisor import _estimates, _hub + + _hub.hub_reachable.cache_clear() + _hub.resolve_model.cache_clear() + offline = lambda *_a, **_kw: False # noqa: E731 + monkeypatch.setattr(_hub, "hub_reachable", offline) + monkeypatch.setattr(_estimates, "hub_reachable", offline) + + +def _profile(vram_gb: float = 16.0) -> HardwareProfile: + return HardwareProfile( + accelerator="cuda" if vram_gb > 0 else "cpu", + device_name="test-gpu" if vram_gb > 0 else "test-cpu", + vram_gb=vram_gb, + ram_gb=32.0, + free_disk_gb=200.0, + cpu_count=8, + ) + + +@pytest.mark.parametrize("preset", BUNDLED_PRESETS) +def test_every_preset_inspects_without_raising(preset: str) -> None: + cfg = load_preset(preset) # type: ignore[arg-type] + stats = DatasetStats.placeholder(n_samples=500, n_classes=10, avg_tokens=24) + report = run_preflight(cfg, stats, _profile(vram_gb=16.0), preset_name=preset) + assert report.preset_name == preset + assert report.low_confidence is True # we forced offline + # always at least one resource-phase finding + assert any(f.phase 
== "resource" for f in report.findings) + + +def test_heavy_preset_is_infeasible_on_2gb_budget() -> None: + cfg = load_preset("transformers-heavy") # type: ignore[arg-type] + stats = DatasetStats.placeholder(n_samples=5000, n_classes=20, avg_tokens=40) + report = run_preflight(cfg, stats, _profile(vram_gb=2.0), preset_name="transformers-heavy") + assert not report.is_feasible, "deberta-v3-large should not fit in 2 GB" + + +def test_light_preset_is_feasible_on_8gb_budget() -> None: + cfg = load_preset("transformers-light") # type: ignore[arg-type] + stats = DatasetStats.placeholder(n_samples=1000, n_classes=10, avg_tokens=24) + report = run_preflight(cfg, stats, _profile(vram_gb=8.0), preset_name="transformers-light") + assert report.is_feasible + + +def test_n_jobs_doubles_vram_findings() -> None: + cfg = load_preset("transformers-light") # type: ignore[arg-type] + cfg = {**cfg, "hpo_config": {**(cfg.get("hpo_config") or {}), "n_jobs": 4}} + stats = DatasetStats.placeholder() + report = run_preflight(cfg, stats, _profile(vram_gb=4.0)) + assert any("parallel trials" in f.message for f in report.findings) + assert any(f.phase == "config" and "n_jobs" in f.message for f in report.findings) + + +def test_cli_inspect_json_is_parseable(capsys: pytest.CaptureFixture[str]) -> None: + rc = main( + [ + "inspect", + "transformers-light", + "--n-samples", + "500", + "--n-classes", + "5", + "--avg-tokens", + "20", + "--json", + "--budget-vram-gb", + "16", + ] + ) + captured = capsys.readouterr() + payload = json.loads(captured.out) + assert payload["preset_name"] == "transformers-light" + assert "findings" in payload + assert payload["worst_severity"] in {"green", "yellow", "red"} + # rc is 0 on feasible, 1 otherwise + assert rc in (0, 1) + + +def test_cli_inspect_text_runs(capsys: pytest.CaptureFixture[str]) -> None: + main( + [ + "inspect", + "transformers-light", + "--n-samples", + "200", + "--n-classes", + "5", + "--avg-tokens", + "15", + "--budget-vram-gb", + "16", + ] + ) 
+ out = capsys.readouterr().out + assert "Compute feasibility check" in out + assert "Verdict:" in out + + +def test_cli_recommend_picks_a_preset_on_generous_hardware( + capsys: pytest.CaptureFixture[str], +) -> None: + rc = main( + [ + "recommend", + "--n-samples", + "1000", + "--n-classes", + "10", + "--avg-tokens", + "20", + "--budget-vram-gb", + "24", + ] + ) + out = capsys.readouterr().out + assert "Recommendation:" in out + assert rc == 0 + + +def test_partial_descriptions_with_description_scorer_flags_red() -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + {"module_name": "description"}, + ], + } + ], + } + stats = DatasetStats( + n_samples=500, + n_classes=10, + avg_tokens=24, + has_descriptions=False, + ) + report = run_preflight(cfg, stats, _profile(vram_gb=16.0)) + assert any( + f.phase == "data" and "description" in f.message.lower() for f in report.findings + ) + + +def test_long_dataset_triggers_truncation_warning() -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "classification_model_config": [ + {"model_name": "microsoft/deberta-v3-small"} + ], + "max_length": [128], + } + ], + } + ], + } + stats = DatasetStats( + n_samples=500, + n_classes=10, + avg_tokens=80, + p95_tokens=512, # well over 128 + ) + report = run_preflight(cfg, stats, _profile(vram_gb=16.0)) + assert any("truncation" in f.message.lower() for f in report.findings) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v"])) diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py new file mode 100644 index 000000000..0317ff27b --- /dev/null +++ b/tests/advisor/test_estimates_internals.py @@ -0,0 +1,319 @@ +"""Targeted tests for `_estimates` helpers + edge cases of `run_preflight`.""" + +from __future__ import annotations + +import pytest + +from autointent._advisor import _estimates, _hub +from 
autointent._advisor._estimates import ( + _classify_severity, + _extract_model_names, + _max_int, + _ram_for_module, + _vram_for_transformer, + run_preflight, +) +from autointent._advisor._hardware import HardwareProfile +from autointent._advisor._hub import ModelMeta +from autointent._advisor._report import DatasetStats, Severity + + +@pytest.fixture(autouse=True) +def _offline(monkeypatch: pytest.MonkeyPatch) -> None: + _hub.hub_reachable.cache_clear() + _hub.resolve_model.cache_clear() + offline = lambda *_a, **_kw: False # noqa: E731 + monkeypatch.setattr(_hub, "hub_reachable", offline) + monkeypatch.setattr(_estimates, "hub_reachable", offline) + monkeypatch.setattr(_hub, "_is_warm_cached", lambda _name: False) + + +def _profile(vram_gb: float = 16.0, accelerator: str = "cuda") -> HardwareProfile: + return HardwareProfile( + accelerator=accelerator, # type: ignore[arg-type] + device_name=f"test-{accelerator}", + vram_gb=vram_gb, + ram_gb=32.0, + free_disk_gb=200.0, + cpu_count=8, + ) + + +class TestMaxInt: + def test_none_returns_default(self) -> None: + assert _max_int(None, 7) == 7 + + def test_list_picks_max(self) -> None: + assert _max_int([1, 5, 3], 0) == 5 + + def test_range_dict_uses_high(self) -> None: + assert _max_int({"low": 1, "high": 9}, 0) == 9 + + def test_scalar_int_passes_through(self) -> None: + assert _max_int(42, 0) == 42 + + def test_garbage_returns_default(self) -> None: + assert _max_int("not-a-number", 11) == 11 + + +class TestExtractModelNames: + def test_classification_model_config_as_list(self) -> None: + entry = {"classification_model_config": [{"model_name": "foo/bar"}]} + assert _extract_model_names(entry) == ["foo/bar"] + + def test_classification_model_config_as_dict(self) -> None: + entry = {"classification_model_config": {"model_name": "foo/bar"}} + assert _extract_model_names(entry) == ["foo/bar"] + + def test_embedder_config_picked_up(self) -> None: + entry = {"embedder_config": [{"model_name": "e/b"}]} + assert 
_extract_model_names(entry) == ["e/b"] + + def test_multiple_choices_all_returned(self) -> None: + entry = { + "classification_model_config": [ + {"model_name": "a/x"}, + {"model_name": "b/y"}, + ] + } + assert _extract_model_names(entry) == ["a/x", "b/y"] + + def test_empty_entry(self) -> None: + assert _extract_model_names({}) == [] + + +class TestClassifySeverity: + def test_below_yellow_is_green(self) -> None: + assert _classify_severity(estimate=1.0, budget=10.0) == Severity.GREEN + + def test_above_yellow_threshold(self) -> None: + assert _classify_severity(estimate=8.0, budget=10.0) == Severity.YELLOW + + def test_at_or_above_red_threshold(self) -> None: + assert _classify_severity(estimate=10.0, budget=10.0) == Severity.RED + assert _classify_severity(estimate=12.0, budget=10.0) == Severity.RED + + def test_zero_budget_returns_yellow(self) -> None: + assert _classify_severity(estimate=1.0, budget=0.0) == Severity.YELLOW + + +class TestVramForTransformer: + @pytest.fixture + def meta(self) -> ModelMeta: + return ModelMeta( + name="x", + params_millions=100.0, + weight_bytes_per_param=4, + total_file_bytes=0, + cached_locally=False, + confidence="hub", + ) + + def test_full_finetune_is_larger_than_lora_is_larger_than_inference( + self, meta: ModelMeta + ) -> None: + inference = _vram_for_transformer(meta, "inference", mixed_precision=False) + lora = _vram_for_transformer(meta, "lora", mixed_precision=False) + full = _vram_for_transformer(meta, "full-finetune", mixed_precision=False) + assert inference < lora < full + + def test_amp_does_not_naively_halve(self, meta: ModelMeta) -> None: + """The proposal calls out that AMP doesn't halve total VRAM — fp32 master + weights and Adam moments don't shrink. 
Weight-side accounting comes out + equal to fp32; the only savings (activations) aren't modeled by us.""" + full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False) + full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True) + assert full_amp / full_fp32 == pytest.approx(1.0) + assert full_amp / full_fp32 > 0.5 # explicit check vs the naive-halving formula + + def test_reranker_uses_inference_class(self, meta: ModelMeta) -> None: + inference = _vram_for_transformer(meta, "inference", mixed_precision=False) + reranker = _vram_for_transformer(meta, "reranker", mixed_precision=False) + assert reranker > inference + + +def test_ram_scales_with_dataset_size() -> None: + meta = ModelMeta( + name="x", + params_millions=100.0, + weight_bytes_per_param=4, + total_file_bytes=0, + cached_locally=False, + confidence="hub", + ) + small = _ram_for_module(meta, DatasetStats.placeholder(n_samples=100)) + big = _ram_for_module(meta, DatasetStats.placeholder(n_samples=10_000_000, avg_tokens=128)) + assert big > small + + +class TestRunPreflightFeatures: + def test_dump_modules_adds_disk_during_training(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "classification_model_config": [ + {"model_name": "microsoft/deberta-v3-small"} + ], + "num_train_epochs": [3], + "batch_size": [16], + } + ], + } + ], + "hpo_config": {"n_trials": 5}, + "dump_modules": True, + } + report = run_preflight(cfg, DatasetStats.placeholder(), _profile()) + assert report.resource.disk_dump_gb > 0 + assert any("during training" in f.message for f in report.findings) + + def test_refit_after_increases_time(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "classification_model_config": [ + {"model_name": "microsoft/deberta-v3-small"} + ], + "num_train_epochs": [3], + "batch_size": [16], + } + ], + } + ], + "hpo_config": 
{"n_trials": 10}, + } + baseline = run_preflight(cfg, DatasetStats.placeholder(), _profile()) + cfg_refit = {**cfg, "refit_after": True} + bumped = run_preflight(cfg_refit, DatasetStats.placeholder(), _profile()) + assert bumped.resource.time_hours > baseline.resource.time_hours + + def test_catboost_gpu_without_cuda_flags_config(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + {"module_name": "catboost", "task_type": "GPU"}, + ], + } + ], + } + report = run_preflight(cfg, DatasetStats.placeholder(), _profile(accelerator="cpu")) + assert any( + f.phase == "config" and "CatBoost" in f.message for f in report.findings + ) + + def test_catboost_gpu_with_cuda_is_silent(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + {"module_name": "catboost", "task_type": "GPU"}, + ], + } + ], + } + report = run_preflight(cfg, DatasetStats.placeholder(), _profile(accelerator="cuda")) + assert not any( + f.phase == "config" and "CatBoost" in f.message for f in report.findings + ) + + def test_offline_flips_low_confidence(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "classification_model_config": [{"model_name": "any/model"}], + } + ], + } + ] + } + report = run_preflight(cfg, DatasetStats.placeholder(), _profile()) + assert report.low_confidence is True + assert any("HF Hub unreachable" in n for n in report.notes) + + def test_rare_classes_with_linear_scorer_flag_red(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + {"module_name": "linear"}, + ], + } + ] + } + stats = DatasetStats( + n_samples=20, + n_classes=5, + avg_tokens=10, + rare_classes=["intent_a", "intent_b"], + ) + report = run_preflight(cfg, stats, _profile()) + assert any( + f.phase == "data" and "LogisticRegressionCV" in f.message and f.severity == Severity.RED + for f in report.findings + ) + + def 
test_truncation_red_when_p95_dominates_max_length(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "max_length": [128], + "classification_model_config": [ + {"model_name": "some/model"} + ], + } + ], + } + ] + } + stats = DatasetStats(n_samples=500, n_classes=5, avg_tokens=50, p95_tokens=400) + report = run_preflight(cfg, stats, _profile()) + red = [f for f in report.findings if f.phase == "data" and f.severity == Severity.RED] + assert red, "p95=400 > 1.5 * max_length=128 should be red" + + def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "max_length": [128], + "classification_model_config": [ + {"model_name": "some/model"} + ], + } + ], + } + ] + } + stats = DatasetStats(n_samples=500, n_classes=5, avg_tokens=50, p95_tokens=140) + report = run_preflight(cfg, stats, _profile()) + yellows = [ + f + for f in report.findings + if f.phase == "data" + and f.severity == Severity.YELLOW + and "truncation" in f.message.lower() + ] + assert yellows diff --git a/tests/advisor/test_hardware_detection.py b/tests/advisor/test_hardware_detection.py new file mode 100644 index 000000000..d8131fb19 --- /dev/null +++ b/tests/advisor/test_hardware_detection.py @@ -0,0 +1,72 @@ +"""Hardware detection has to be safe on every machine — broken CUDA, no GPU, +no psutil. Verify the fallbacks work without raising. 
+""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +from autointent._advisor._hardware import detect_hardware + + +def test_cpu_fallback_when_no_accelerator() -> None: + with ( + patch("autointent._advisor._hardware._detect_cuda", return_value=None), + patch("autointent._advisor._hardware._detect_mps", return_value=None), + ): + hw = detect_hardware() + assert hw.accelerator == "cpu" + assert hw.vram_gb == 0.0 + assert hw.device_class == "cpu" + + +def test_cuda_branch_classifies_low_gpu() -> None: + with ( + patch( + "autointent._advisor._hardware._detect_cuda", + return_value=(8.0, "NVIDIA RTX 3060"), + ), + ): + hw = detect_hardware() + assert hw.accelerator == "cuda" + assert hw.vram_gb == pytest.approx(8.0) + assert hw.device_class == "low-gpu" + + +def test_mps_budget_uses_ram_fraction() -> None: + with ( + patch("autointent._advisor._hardware._detect_cuda", return_value=None), + patch("autointent._advisor._hardware._detect_ram_gb", return_value=32.0), + patch( + "autointent._advisor._hardware._detect_mps", + side_effect=lambda ram, ratio: (ram * ratio, "Apple Silicon (arm64)"), + ), + ): + hw = detect_hardware() + assert hw.accelerator == "mps" + assert hw.vram_gb == pytest.approx(32.0 * 0.7) + assert any("MPS unified memory" in n for n in hw.notes) + + +def test_vram_budget_override_applies() -> None: + with ( + patch( + "autointent._advisor._hardware._detect_cuda", + return_value=(24.0, "NVIDIA RTX 4090"), + ), + ): + hw = detect_hardware(vram_budget_gb=8.0) + assert hw.vram_gb == pytest.approx(8.0) + assert any("manual VRAM budget" in n for n in hw.notes) + + +def test_broken_cuda_returns_none_does_not_crash() -> None: + # _detect_cuda swallows torch quirks already; verify the wrapper holds. 
+ with ( + patch("autointent._advisor._hardware._detect_cuda", return_value=None), + patch("autointent._advisor._hardware._detect_mps", return_value=None), + ): + hw = detect_hardware() + assert hw.accelerator == "cpu" diff --git a/tests/advisor/test_hub_heuristics.py b/tests/advisor/test_hub_heuristics.py new file mode 100644 index 000000000..54a03431d --- /dev/null +++ b/tests/advisor/test_hub_heuristics.py @@ -0,0 +1,81 @@ +"""Tests for the offline name-pattern heuristics in `_hub`. + +The advisor must produce a sensible estimate even when HF Hub is +unreachable, so these tests pin the public `hub_reachable` to False and +exercise the heuristic path directly. +""" + +from __future__ import annotations + +import pytest + +from autointent._advisor import _hub + + +@pytest.fixture(autouse=True) +def _offline(monkeypatch: pytest.MonkeyPatch) -> None: + _hub.hub_reachable.cache_clear() + _hub.resolve_model.cache_clear() + monkeypatch.setattr(_hub, "hub_reachable", lambda *_a, **_kw: False) + monkeypatch.setattr(_hub, "_is_warm_cached", lambda _name: False) + + +@pytest.mark.parametrize( + ("name", "expected_min_m", "expected_max_m"), + [ + ("microsoft/deberta-v3-large", 200, 500), + ("microsoft/deberta-v3-small", 30, 200), + ("sentence-transformers/all-MiniLM-L6-v2", 20, 80), + ("intfloat/multilingual-e5-large-instruct", 300, 700), + ("intfloat/e5-small", 20, 80), + ("distilbert-base-uncased", 40, 150), + ("bert-base-uncased", 70, 200), + ], +) +def test_name_heuristic_picks_reasonable_bucket( + name: str, expected_min_m: int, expected_max_m: int +) -> None: + meta = _hub.resolve_model(name) + assert meta.confidence == "heuristic" + assert expected_min_m <= meta.params_millions <= expected_max_m, ( + f"{name} got {meta.params_millions}M; expected [{expected_min_m}, {expected_max_m}]" + ) + + +def test_unknown_name_falls_back_to_bert_base() -> None: + meta = _hub.resolve_model("totally-made-up/no-such-model") + assert meta.confidence == "heuristic" + assert 
meta.params_millions == pytest.approx(110.0) + + +def test_weights_gb_matches_params_times_bytes() -> None: + meta = _hub.resolve_model("microsoft/deberta-v3-large") + expected_gb = meta.params_millions * 1_000_000 * meta.weight_bytes_per_param / (1024**3) + assert meta.weights_gb == pytest.approx(expected_gb) + + +def test_local_path_returns_zero_disk() -> None: + meta = _hub.resolve_model("/tmp/local/path/to/model") + assert meta.total_file_bytes == 0 + assert meta.cached_locally is True + + +def test_disk_gb_falls_back_to_param_size_when_siblings_unknown() -> None: + meta = _hub.resolve_model("intfloat/multilingual-e5-large-instruct") + assert meta.disk_gb > 0 + assert meta.disk_gb == pytest.approx(meta.weights_gb, rel=0.01) + + +def test_resolve_is_memoized() -> None: + a = _hub.resolve_model("microsoft/deberta-v3-large") + b = _hub.resolve_model("microsoft/deberta-v3-large") + assert a is b + + +def test_metadata_fallback_uses_heuristic_when_hub_unreachable() -> None: + """End-to-end: resolve_model must return a usable ModelMeta even when + the live Hub is unreachable (autouse fixture forces offline).""" + meta = _hub.resolve_model("microsoft/deberta-v3-large") + assert meta.confidence == "heuristic" + assert meta.params_millions > 0 + assert meta.disk_gb > 0 diff --git a/tests/advisor/test_render.py b/tests/advisor/test_render.py new file mode 100644 index 000000000..e82d7573b --- /dev/null +++ b/tests/advisor/test_render.py @@ -0,0 +1,151 @@ +"""Output rendering: text formatting and JSON serialization.""" + +from __future__ import annotations + +import json + +import pytest + +from autointent._advisor._render import render_json, render_recommendation, render_text +from autointent._advisor._report import ( + DatasetStats, + PreflightReport, + ResourceEstimate, + Severity, +) + + +def _populated_report() -> PreflightReport: + r = PreflightReport( + preset_name="example", + hardware={ + "accelerator": "cuda", + "device_name": "RTX 3060", + "vram_gb": 8.0, + 
"ram_gb": 32.0, + "free_disk_gb": 100.0, + "device_class": "low-gpu", + }, + dataset={"n_samples": 500, "n_classes": 10, "avg_tokens": 30, "source": "placeholder"}, + resource=ResourceEstimate( + disk_download_gb=2.5, + disk_cached_gb=0.5, + ram_gb=1.0, + vram_gb=4.0, + time_hours=1.2, + drivers=[ + { + "node_type": "scoring", + "module": "bert", + "model": "x/y", + "mode": "full-finetune", + "vram_gb": 4.0, + "ram_gb": 1.0, + "time_hours": 1.2, + "confidence": "hub", + } + ], + ), + notes=["MPS unified memory note"], + ) + r.add("resource", Severity.YELLOW, "VRAM ~6 GB vs available 8 GB") + r.add("data", Severity.RED, "rare classes blocked") + return r + + +class TestRenderText: + def test_contains_phase_blocks(self) -> None: + out = render_text(_populated_report()) + assert "Resource:" in out + assert "Data:" in out + # Config phase has no findings → block omitted + assert "Config:" not in out + + def test_includes_drivers_block(self) -> None: + out = render_text(_populated_report()) + assert "Drivers of cost:" in out + assert "x/y" in out + + def test_verdict_reflects_worst_severity(self) -> None: + out = render_text(_populated_report()) + assert "Verdict: INFEASIBLE" in out + assert "worst severity: red" in out + + def test_disclaimer_always_present(self) -> None: + out = render_text(_populated_report()) + assert "heuristic upper bounds" in out + + def test_low_confidence_tag_when_offline(self) -> None: + r = _populated_report() + r.low_confidence = True + out = render_text(r) + assert "low-confidence" in out + + def test_preset_name_in_title(self) -> None: + out = render_text(_populated_report()) + assert "Compute feasibility check — example" in out + + def test_empty_report_still_renders(self) -> None: + out = render_text(PreflightReport()) + assert "Compute feasibility check" in out + assert "Verdict: feasible" in out + + +class TestRenderJson: + def test_is_valid_json(self) -> None: + json.loads(render_json(_populated_report())) + + def 
test_findings_have_string_severity(self) -> None: + d = json.loads(render_json(_populated_report())) + for f in d["findings"]: + assert f["severity"] in {"green", "yellow", "red"} + + def test_worst_severity_and_feasibility_serialized(self) -> None: + d = json.loads(render_json(_populated_report())) + assert d["worst_severity"] == "red" + assert d["is_feasible"] is False + + def test_empty_report_serializes(self) -> None: + d = json.loads(render_json(PreflightReport())) + assert d["worst_severity"] == "green" + assert d["is_feasible"] is True + + +class TestRenderRecommendation: + def _two_reports(self) -> list[tuple[str, PreflightReport]]: + a = PreflightReport(preset_name="a", resource=ResourceEstimate(vram_gb=2.0, time_hours=0.5)) + a.add("resource", Severity.GREEN, "ok") + b = PreflightReport(preset_name="b", resource=ResourceEstimate(vram_gb=8.0, time_hours=4.0)) + b.add("resource", Severity.RED, "too big") + return [("a", a), ("b", b)] + + def test_lists_chosen_preset_when_present(self) -> None: + out = render_recommendation(self._two_reports(), chosen="a") + assert "→ a" in out + + def test_handles_no_chosen(self) -> None: + out = render_recommendation(self._two_reports(), chosen=None) + assert "none of the bundled presets" in out + + def test_includes_all_presets_in_table(self) -> None: + out = render_recommendation(self._two_reports(), chosen="a") + assert "a " in out # preset name + assert "b " in out + + def test_shows_status_per_preset(self) -> None: + out = render_recommendation(self._two_reports(), chosen="a") + assert "feasible" in out + assert "infeasible" in out + + +def test_dataset_stats_in_text_block() -> None: + stats = DatasetStats.placeholder(n_samples=777, n_classes=4) + r = PreflightReport(dataset={ + "n_samples": stats.n_samples, + "n_classes": stats.n_classes, + "avg_tokens": stats.avg_tokens, + "source": stats.source, + }) + out = render_text(r) + assert "777" in out + assert "n_classes=4" in out diff --git a/tests/advisor/test_report.py 
b/tests/advisor/test_report.py new file mode 100644 index 000000000..52f2e675e --- /dev/null +++ b/tests/advisor/test_report.py @@ -0,0 +1,85 @@ +"""Unit tests for the report dataclasses.""" + +from __future__ import annotations + +import pytest + +from autointent._advisor._report import ( + DatasetStats, + Finding, + PreflightReport, + ResourceEstimate, + Severity, +) + + +class TestSeverityOrdering: + def test_worst_severity_on_empty_report_is_green(self) -> None: + assert PreflightReport().worst_severity == Severity.GREEN + + def test_red_beats_yellow_beats_green(self) -> None: + r = PreflightReport() + r.add("resource", Severity.GREEN, "ok") + r.add("data", Severity.YELLOW, "warn") + assert r.worst_severity == Severity.YELLOW + r.add("config", Severity.RED, "fail") + assert r.worst_severity == Severity.RED + + def test_is_feasible_flips_on_any_red(self) -> None: + r = PreflightReport() + r.add("resource", Severity.YELLOW, "warn") + assert r.is_feasible is True + r.add("data", Severity.RED, "fail") + assert r.is_feasible is False + + +class TestDatasetStatsPlaceholder: + def test_defaults_populate_p95_above_avg(self) -> None: + stats = DatasetStats.placeholder() + assert stats.n_samples == 1_000 + assert stats.p95_tokens is not None + assert stats.p95_tokens > stats.avg_tokens + assert stats.source == "placeholder" + + def test_overrides_propagate(self) -> None: + stats = DatasetStats.placeholder(n_samples=42, n_classes=3, avg_tokens=80, multilabel=True) + assert stats.n_samples == 42 + assert stats.n_classes == 3 + assert stats.avg_tokens == 80 + assert stats.multilabel is True + + +class TestResourceEstimate: + def test_total_disk_sums_download_and_dump(self) -> None: + e = ResourceEstimate(disk_download_gb=2.5, disk_dump_gb=4.0) + assert e.total_disk_gb == pytest.approx(6.5) + + def test_total_disk_ignores_cached(self) -> None: + e = ResourceEstimate(disk_download_gb=1.0, disk_cached_gb=100.0, disk_dump_gb=0.5) + assert e.total_disk_gb == pytest.approx(1.5) + 
+ +class TestToDictSerialization: + def test_findings_round_trip_severity_as_string(self) -> None: + r = PreflightReport() + r.add("resource", Severity.RED, "boom") + d = r.to_dict() + assert d["worst_severity"] == "red" + assert d["is_feasible"] is False + assert d["findings"] == [ + {"phase": "resource", "severity": "red", "message": "boom", "metric": None}, + ] + + def test_hardware_and_dataset_pass_through(self) -> None: + r = PreflightReport( + hardware={"accelerator": "cuda", "vram_gb": 8.0}, + dataset={"n_samples": 100, "n_classes": 5}, + ) + d = r.to_dict() + assert d["hardware"]["accelerator"] == "cuda" + assert d["dataset"]["n_samples"] == 100 + + def test_finding_is_frozen(self) -> None: + f = Finding(phase="resource", severity=Severity.GREEN, message="ok") + with pytest.raises(Exception): # noqa: PT011 - dataclass.FrozenInstanceError varies + f.message = "changed" # type: ignore[misc] From c8675b9a69cf5c1492380a1bdac2c8a3885281b2 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 15 Jun 2026 23:58:59 +0300 Subject: [PATCH 04/16] fix --- src/autointent/_advisor/__init__.py | 2 +- src/autointent/_advisor/_cli.py | 40 +-- src/autointent/_advisor/_estimates.py | 217 ++++++++++++++-- src/autointent/_advisor/_hardware.py | 59 ++--- src/autointent/_advisor/_hub.py | 20 +- src/autointent/_advisor/_render.py | 6 +- src/autointent/_advisor/_report.py | 6 +- tests/advisor/test_estimates_and_cli.py | 43 +++- tests/advisor/test_estimates_internals.py | 242 +++++++++++++++--- tests/advisor/test_hub_heuristics.py | 4 +- tests/advisor/test_render.py | 16 +- .../advanced/02_embedder_configuration.py | 4 +- 12 files changed, 495 insertions(+), 164 deletions(-) diff --git a/src/autointent/_advisor/__init__.py b/src/autointent/_advisor/__init__.py index 5f29b028e..3ff898816 100644 --- a/src/autointent/_advisor/__init__.py +++ b/src/autointent/_advisor/__init__.py @@ -7,9 +7,9 @@ from __future__ import annotations +from 
._estimates import run_preflight from ._hardware import HardwareProfile, detect_hardware from ._report import DatasetStats, Finding, PreflightReport, ResourceEstimate, Severity -from ._estimates import run_preflight __all__ = [ "DatasetStats", diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py index 4e7eae000..b3f43aab5 100644 --- a/src/autointent/_advisor/_cli.py +++ b/src/autointent/_advisor/_cli.py @@ -20,10 +20,13 @@ import yaml +from autointent import Dataset +from autointent.utils import load_preset + from ._estimates import run_preflight from ._hardware import detect_hardware from ._render import render_json, render_recommendation, render_text -from ._report import DatasetStats, PreflightReport +from ._report import DatasetStats, PreflightReport, Severity logger = logging.getLogger("autointent.advisor") @@ -62,8 +65,6 @@ def _load_config(target: str) -> tuple[dict[str, Any], str]: with path.open(encoding="utf-8") as f: return yaml.safe_load(f), path.stem # treat as a bundled preset name - from autointent.utils import load_preset - return load_preset(target), target # type: ignore[arg-type] @@ -80,15 +81,9 @@ def _stats_from_args(args: argparse.Namespace) -> DatasetStats: def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats: """Best-effort: load a dataset from disk via the existing Dataset constructor.""" - try: - from autointent import Dataset - except ImportError: - logger.warning("autointent.Dataset unavailable; falling back to placeholders.") - return DatasetStats.placeholder(multilabel=multilabel) - try: ds = Dataset.from_json(path) if path.endswith(".json") else Dataset.from_hub(path) - except Exception as e: # noqa: BLE001 + except (OSError, ValueError) as e: logger.warning("Failed to load dataset %s: %s", path, e) return DatasetStats.placeholder(multilabel=multilabel) @@ -147,27 +142,24 @@ def cmd_recommend(args: argparse.Namespace) -> int: stats = _stats_from_args(args) results: list[tuple[str, 
PreflightReport]] = [] - from autointent.utils import load_preset for preset in BUNDLED_PRESETS: try: cfg = load_preset(preset) # type: ignore[arg-type] - except Exception as e: # noqa: BLE001 + except (OSError, ValueError, KeyError) as e: logger.debug("Skipping preset %s: %s", preset, e) continue report = run_preflight(cfg, stats, hardware, preset_name=preset) if args.budget_time_h is not None and report.resource.time_hours > args.budget_time_h: report.add( "resource", - report.worst_severity if report.worst_severity.value == "red" else report.worst_severity, # noqa: PLW0125 - explicit + Severity.RED, f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {args.budget_time_h} h.", ) results.append((preset, report)) feasible = [(name, r) for name, r in results if r.is_feasible] - feasible.sort( - key=lambda pair: (-_QUALITY_TIER.get(pair[0], 0), pair[1].resource.time_hours, pair[0]) - ) + feasible.sort(key=lambda pair: (-_QUALITY_TIER.get(pair[0], 0), pair[1].resource.time_hours, pair[0])) chosen = feasible[0][0] if feasible else None if args.json: @@ -175,9 +167,7 @@ def cmd_recommend(args: argparse.Namespace) -> int: out = { "chosen": chosen, - "results": [ - {"preset": name, "report": r.to_dict()} for name, r in results - ], + "results": [{"preset": name, "report": r.to_dict()} for name, r in results], } sys.stdout.write(json.dumps(out, indent=2, default=str)) sys.stdout.write("\n") @@ -206,9 +196,7 @@ def build_parser() -> argparse.ArgumentParser: ) p_inspect.add_argument("target", help="Preset name (e.g. transformers-light) or path to a YAML config.") p_inspect.add_argument("--json", action="store_true", help="Emit a structured JSON report.") - p_inspect.add_argument( - "--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget." 
- ) + p_inspect.add_argument("--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget.") _add_common_dataset_args(p_inspect) p_inspect.set_defaults(func=cmd_inspect) @@ -217,12 +205,8 @@ def build_parser() -> argparse.ArgumentParser: help="Detect hardware and recommend the best-fitting bundled preset.", ) p_rec.add_argument("--json", action="store_true", help="Emit a structured JSON report.") - p_rec.add_argument( - "--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget." - ) - p_rec.add_argument( - "--budget-time-h", type=float, default=None, help="Optional wall-time ceiling in hours." - ) + p_rec.add_argument("--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget.") + p_rec.add_argument("--budget-time-h", type=float, default=None, help="Optional wall-time ceiling in hours.") _add_common_dataset_args(p_rec) p_rec.set_defaults(func=cmd_recommend) diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py index f60f940a6..e8f619303 100644 --- a/src/autointent/_advisor/_estimates.py +++ b/src/autointent/_advisor/_estimates.py @@ -9,7 +9,8 @@ from __future__ import annotations import logging -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any from ._hardware import HardwareProfile from ._hub import ModelMeta, hub_reachable, resolve_model @@ -32,6 +33,16 @@ TRANSFORMER_SCORER_MODULES = {"bert", "lora", "ptuning", "dnnc"} +# Coefficients for the linear / catboost time formulas (proposal §"Algorithm"). +_LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8 +_CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9 +_CATBOOST_GPU_SPEEDUP = 10.0 +# LogisticRegressionCV defaults: Cs=10, cv=3 → 31 inner fits + 1 final refit. +_LOGREG_CV_MULTIPLIER = 31 +_CATBOOST_DEFAULT_BINS = 254 +# Bytes per histogram bucket / tree node — order-of-magnitude constants. 
+_CATBOOST_BYTES_PER_TREE_NODE = 32 + def _extract_model_names(module_entry: dict[str, Any]) -> list[str]: """Pull model name(s) from a search-space module entry.""" @@ -74,11 +85,22 @@ def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dic yield node_type, entry +def _walk_modules_indexed( + search_space: list[dict[str, Any]], +) -> Iterable[tuple[int, str, dict[str, Any]]]: + """Yield (node_index, node_type, module_entry) — index lets us bound per-node max cost.""" + for node_idx, node in enumerate(search_space or []): + node_type = node.get("node_type", "?") + for entry in node.get("search_space", []) or []: + yield node_idx, node_type, entry + + def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) -> float: """VRAM in GB for one trial of a transformer-based module. - Conservative AMP accounting (the proposal flags the prior naive halving - as too generous; keep optimizer state at fp32 even in AMP). + Full fine-tune fp32: weights + grads + Adam (m, v) = 4W. + Full fine-tune AMP: fp16 weights + fp16 grads + fp32 master copy + fp32 Adam = 3W. + (Activations are not modeled separately.) 
""" weights_gb = meta.weights_gb if mode == "inference": @@ -87,11 +109,9 @@ def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) -> return weights_gb * 1.3 + 0.5 if mode == "reranker": return weights_gb * 1.5 - # full fine-tune (bert, ptuning, gcn-with-backbone) if mixed_precision: - # fp16 weights+grads + fp32 master+adam moments - return (weights_gb * 0.5) * 2 + weights_gb * 1 + weights_gb * 2 - return weights_gb * (1 + 1 + 2) + return weights_gb * 3.0 + return weights_gb * 4.0 def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float: @@ -99,6 +119,82 @@ def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float: return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / (1024**3) +def _embedder_dim(meta: ModelMeta | None) -> int: + """Coarse hidden-size guess from parameter count. + + Concrete points: MiniLM (33M) ~384, BERT-base (110M) ~768, BERT-large (350M) ~1024. + """ + if meta is None: + return 768 + params = meta.params_millions + if params >= 300: + return 1024 + if params >= 100: + return 768 + if params >= 50: + return 512 + return 384 + + +def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None: + if not seen_models: + return None + return max(seen_models.values(), key=lambda m: m.params_millions) + + +def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float: + """Float64 design matrix dominates; coefficients and L-BFGS history are small.""" + data_bytes = 8.0 * stats.n_samples * embedder_dim + coef_bytes = 8.0 * max(1, stats.n_classes) * embedder_dim + lbfgs_bytes = 10.0 * 8.0 * embedder_dim + return (data_bytes + coef_bytes + lbfgs_bytes) / (1024**3) + + +def _time_for_linear( + *, + n_trials: int, + n_samples: int, + embedder_dim: int, + max_iter: int, + cv_multiplier: int, + class_multiplier: int, +) -> float: + seconds = ( + n_trials + * _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER + * n_samples + * embedder_dim + * max_iter + * cv_multiplier + * class_multiplier + ) 
+ return seconds / 3600.0 + + +def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int, depth: int) -> float: + data_bytes = 4.0 * stats.n_samples * n_features + histograms_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS + trees_bytes = iterations * (2**depth) * _CATBOOST_BYTES_PER_TREE_NODE + return (data_bytes + histograms_bytes + trees_bytes) / (1024**3) + + +def _time_for_catboost( + *, + n_trials: int, + n_samples: int, + n_features: int, + iterations: int, + depth: int, + class_multiplier: int, + on_gpu: bool, +) -> float: + coeff = _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER + if on_gpu: + coeff /= _CATBOOST_GPU_SPEEDUP + seconds = n_trials * iterations * coeff * n_samples * n_features * depth * class_multiplier + return seconds / 3600.0 + + def _time_for_transformer( *, meta: ModelMeta, @@ -148,10 +244,24 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity if global_embedder: seen_models[global_embedder] = resolve_model(global_embedder) - for node_type, entry in _walk_modules(config.get("search_space") or []): + # First pass: walk transformer-bearing modules (collects seen_models for embedder_dim lookup). + transformer_entries: list[tuple[int, str, dict[str, Any]]] = [] + classic_entries: list[tuple[int, str, dict[str, Any]]] = [] + for node_idx, node_type, entry in _walk_modules_indexed(config.get("search_space") or []): + module = entry.get("module_name", "?") + if module in {"linear", "catboost"}: + classic_entries.append((node_idx, node_type, entry)) + else: + transformer_entries.append((node_idx, node_type, entry)) + + # Track the heaviest module per node so dump_modules accounting is bounded by + # "one selected variant per node × n_trials", not "sum of every candidate". 
+ node_max_weights: dict[int, float] = {} + + for node_idx, node_type, entry in transformer_entries: module = entry.get("module_name", "?") model_names = _extract_model_names(entry) - if not model_names and global_embedder and module in {"linear", "catboost", "knn", "mlknn"}: + if not model_names and global_embedder and module in {"knn", "mlknn"}: model_names = [global_embedder] for name in model_names: @@ -191,6 +301,7 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity estimate.vram_gb = max(estimate.vram_gb, vram) estimate.ram_gb = max(estimate.ram_gb, ram) estimate.time_hours += time_h + node_max_weights[node_idx] = max(node_max_weights.get(node_idx, 0.0), meta.weights_gb) estimate.drivers.append( { "node_type": node_type, @@ -204,6 +315,76 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity } ) + # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint. + embedder_meta = _largest_embedder(seen_models) + embedder_dim = _embedder_dim(embedder_meta) + class_multiplier_classic = max(1, stats.n_classes) if stats.multilabel else 1 + for _node_idx, node_type, entry in classic_entries: + module = entry.get("module_name", "?") + if module == "linear": + max_iter = _max_int(entry.get("max_iter"), 100) + cv_multiplier = 1 if stats.multilabel else _LOGREG_CV_MULTIPLIER + ram = _ram_for_linear(stats=stats, embedder_dim=embedder_dim) + time_h = _time_for_linear( + n_trials=n_trials, + n_samples=stats.n_samples, + embedder_dim=embedder_dim, + max_iter=max_iter, + cv_multiplier=cv_multiplier, + class_multiplier=class_multiplier_classic, + ) + if refit_after: + time_h *= 1 + 1.0 / max(1, n_trials) + vram = 0.0 + mode = "linear-cv" if cv_multiplier > 1 else "linear" + confidence = embedder_meta.confidence if embedder_meta else "heuristic" + elif module == "catboost": + iterations = _max_int(entry.get("iterations"), 1000) + depth = _max_int(entry.get("depth"), 6) + on_gpu = entry.get("task_type") == "GPU" and 
hardware.accelerator == "cuda" + # CatBoost's multiclass MultiClass loss already grows per-class trees. + cb_class_mult = max(1, stats.n_classes) + ram = _ram_for_catboost( + stats=stats, + n_features=embedder_dim, + iterations=iterations, + depth=depth, + ) + time_h = _time_for_catboost( + n_trials=n_trials, + n_samples=stats.n_samples, + n_features=embedder_dim, + iterations=iterations, + depth=depth, + class_multiplier=cb_class_mult, + on_gpu=on_gpu, + ) + if refit_after: + time_h *= 1 + 1.0 / max(1, n_trials) + vram = ram if on_gpu else 0.0 + if on_gpu: + ram = 0.0 + mode = "catboost-gpu" if on_gpu else "catboost" + confidence = embedder_meta.confidence if embedder_meta else "heuristic" + else: + continue + + estimate.vram_gb = max(estimate.vram_gb, vram) + estimate.ram_gb = max(estimate.ram_gb, ram) + estimate.time_hours += time_h + estimate.drivers.append( + { + "node_type": node_type, + "module": module, + "model": embedder_meta.name if embedder_meta else "(no embedder)", + "mode": mode, + "vram_gb": round(vram, 2), + "ram_gb": round(ram, 2), + "time_hours": round(time_h, 2), + "confidence": confidence, + } + ) + for meta in seen_models.values(): if meta.cached_locally: estimate.disk_cached_gb += meta.disk_gb @@ -211,8 +392,10 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity estimate.disk_download_gb += meta.disk_gb if dump_modules: - weights_total = sum(m.weights_gb for m in seen_models.values()) - estimate.disk_dump_gb = weights_total * n_trials + # Each trial selects one variant per node, so per-trial dumped weights + # are bounded by the heaviest module in each node, summed across nodes. 
+ per_trial_dump_gb = sum(node_max_weights.values()) + estimate.disk_dump_gb = per_trial_dump_gb * n_trials if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}: effective_vram = estimate.vram_gb * n_jobs @@ -309,23 +492,17 @@ def _data_phase( ) # rare class × linear-CV - has_linear = any( - e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or []) - ) + has_linear = any(e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or [])) if has_linear and stats.rare_classes: report.add( "data", Severity.RED, - ( - "LogisticRegressionCV (cv=3) will fail: classes " - f"{stats.rare_classes[:5]} have <3 samples." - ), + (f"LogisticRegressionCV (cv=3) will fail: classes {stats.rare_classes[:5]} have <3 samples."), ) # partial descriptions × description scorer has_description = any( - e.get("module_name") == "description" - for _, e in _walk_modules(config.get("search_space") or []) + e.get("module_name") == "description" for _, e in _walk_modules(config.get("search_space") or []) ) if has_description and stats.has_descriptions is False: report.add( diff --git a/src/autointent/_advisor/_hardware.py b/src/autointent/_advisor/_hardware.py index 2bda6120f..9c0cae049 100644 --- a/src/autointent/_advisor/_hardware.py +++ b/src/autointent/_advisor/_hardware.py @@ -14,6 +14,9 @@ from dataclasses import dataclass, field from typing import Literal +import psutil +import torch + logger = logging.getLogger(__name__) Accelerator = Literal["cuda", "mps", "cpu"] @@ -46,13 +49,7 @@ def device_class(self) -> str: def _detect_ram_gb() -> float: - try: - import psutil - - return psutil.virtual_memory().total / (1024**3) - except ImportError: - logger.debug("psutil unavailable; RAM unknown") - return 0.0 + return psutil.virtual_memory().total / (1024**3) def _detect_free_disk_gb(path: str | None = None) -> float: @@ -67,40 +64,24 @@ def _detect_free_disk_gb(path: str | None = None) -> float: def _detect_cuda() -> 
tuple[float, str] | None: - try: - import torch - - if not torch.cuda.is_available(): - return None - idx = 0 - try: - free, total = torch.cuda.mem_get_info(idx) - vram_gb = total / (1024**3) - except (RuntimeError, AttributeError) as e: - logger.debug("torch.cuda.mem_get_info failed: %s", e) - return None - name = torch.cuda.get_device_name(idx) - return vram_gb, name - except ImportError: + if not torch.cuda.is_available(): return None - except Exception as e: # noqa: BLE001 - protect the advisor from torch quirks - logger.debug("CUDA detection raised: %s", e) + idx = 0 + try: + _free, total = torch.cuda.mem_get_info(idx) + vram_gb = total / (1024**3) + except (RuntimeError, AttributeError) as e: + logger.debug("torch.cuda.mem_get_info failed: %s", e) return None + name = torch.cuda.get_device_name(idx) + return vram_gb, name def _detect_mps(ram_gb: float, budget_ratio: float = MPS_DEFAULT_BUDGET_RATIO) -> tuple[float, str] | None: - try: - import torch - - if not (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()): - return None - # apple silicon: unified memory; budget is fraction of total RAM - return ram_gb * budget_ratio, f"Apple Silicon ({platform.machine()})" - except ImportError: - return None - except Exception as e: # noqa: BLE001 - logger.debug("MPS detection raised: %s", e) + if not (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()): return None + # apple silicon: unified memory; budget is fraction of total RAM + return ram_gb * budget_ratio, f"Apple Silicon ({platform.machine()})" def detect_hardware( @@ -133,9 +114,7 @@ def detect_hardware( if mps is not None: vram_gb, device_name = mps accel = "mps" - notes.append( - f"MPS unified memory: VRAM budget = {mps_budget_ratio:.0%} of RAM." 
- ) + notes.append(f"MPS unified memory: VRAM budget = {mps_budget_ratio:.0%} of RAM.") else: vram_gb = 0.0 device_name = platform.processor() or "cpu" @@ -143,9 +122,7 @@ def detect_hardware( if vram_budget_gb is not None: if vram_gb and vram_budget_gb > vram_gb: - notes.append( - f"Manual --budget-vram-gb={vram_budget_gb} exceeds detected {vram_gb:.1f} GB; using override." - ) + notes.append(f"Manual --budget-vram-gb={vram_budget_gb} exceeds detected {vram_gb:.1f} GB; using override.") notes.append(f"Using manual VRAM budget: {vram_budget_gb} GB.") vram_gb = vram_budget_gb diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py index 80ccb7133..613ab6b40 100644 --- a/src/autointent/_advisor/_hub.py +++ b/src/autointent/_advisor/_hub.py @@ -14,6 +14,8 @@ from functools import lru_cache from typing import Any +from huggingface_hub import HfApi, scan_cache_dir, try_to_load_from_cache + logger = logging.getLogger(__name__) # Coarse heuristic estimates keyed on name fragments. Used only when HF Hub @@ -55,17 +57,11 @@ def weights_gb(self) -> float: def hub_reachable(timeout_s: float = 2.0) -> bool: """Single up-front probe. 
Memoized per process.""" try: - from huggingface_hub import HfApi - HfApi().list_models(limit=1) - except ImportError: - logger.debug("huggingface_hub not installed; assuming offline") - return False except Exception as e: # noqa: BLE001 logger.debug("HF Hub probe failed: %s", e) return False - else: - return True + return True def _heuristic_params_millions(model_name: str) -> float: @@ -77,11 +73,6 @@ def _heuristic_params_millions(model_name: str) -> float: def _is_warm_cached(model_name: str) -> bool: """True when the weight shard is present in the local HF cache.""" - try: - from huggingface_hub import scan_cache_dir, try_to_load_from_cache - except ImportError: - return False - weight_files = ["model.safetensors", "pytorch_model.bin", "model.safetensors.index.json"] for fname in weight_files: path = try_to_load_from_cache(model_name, fname) @@ -98,11 +89,6 @@ def _is_warm_cached(model_name: str) -> bool: def _hub_metadata(model_name: str) -> ModelMeta | None: - try: - from huggingface_hub import HfApi - except ImportError: - return None - try: info = HfApi().model_info(model_name, files_metadata=True) except Exception as e: # noqa: BLE001 diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py index 52168aa75..fe0f32dd7 100644 --- a/src/autointent/_advisor/_render.py +++ b/src/autointent/_advisor/_render.py @@ -18,7 +18,7 @@ _PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"} -def render_text(report: "PreflightReport") -> str: +def render_text(report: PreflightReport) -> str: lines: list[str] = [] title = "Compute feasibility check" if report.preset_name: @@ -76,12 +76,12 @@ def render_text(report: "PreflightReport") -> str: return "\n".join(lines) -def render_json(report: "PreflightReport") -> str: +def render_json(report: PreflightReport) -> str: return json.dumps(report.to_dict(), indent=2, default=str) def render_recommendation( - results: list[tuple[str, "PreflightReport"]], + results: list[tuple[str, 
PreflightReport]], chosen: str | None, ) -> str: """Compact table for the ``recommend`` subcommand.""" diff --git a/src/autointent/_advisor/_report.py b/src/autointent/_advisor/_report.py index 0250482a5..6b930db95 100644 --- a/src/autointent/_advisor/_report.py +++ b/src/autointent/_advisor/_report.py @@ -67,7 +67,7 @@ def placeholder( n_classes: int = 10, avg_tokens: int = 32, multilabel: bool = False, - ) -> "DatasetStats": + ) -> DatasetStats: return cls( n_samples=n_samples, n_classes=n_classes, @@ -105,9 +105,7 @@ def is_feasible(self) -> bool: def to_dict(self) -> dict[str, Any]: d = asdict(self) - d["findings"] = [ - {**asdict(f), "severity": f.severity.value} for f in self.findings - ] + d["findings"] = [{**asdict(f), "severity": f.severity.value} for f in self.findings] d["worst_severity"] = self.worst_severity.value d["is_feasible"] = self.is_feasible return d diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py index 18c2615a6..00537226d 100644 --- a/tests/advisor/test_estimates_and_cli.py +++ b/tests/advisor/test_estimates_and_cli.py @@ -162,9 +162,7 @@ def test_partial_descriptions_with_description_scorer_flags_red() -> None: has_descriptions=False, ) report = run_preflight(cfg, stats, _profile(vram_gb=16.0)) - assert any( - f.phase == "data" and "description" in f.message.lower() for f in report.findings - ) + assert any(f.phase == "data" and "description" in f.message.lower() for f in report.findings) def test_long_dataset_triggers_truncation_warning() -> None: @@ -175,9 +173,7 @@ def test_long_dataset_triggers_truncation_warning() -> None: "search_space": [ { "module_name": "bert", - "classification_model_config": [ - {"model_name": "microsoft/deberta-v3-small"} - ], + "classification_model_config": [{"model_name": "microsoft/deberta-v3-small"}], "max_length": [128], } ], @@ -194,5 +190,40 @@ def test_long_dataset_triggers_truncation_warning() -> None: assert any("truncation" in f.message.lower() for f in 
report.findings) +def test_cli_recommend_budget_time_flags_red_for_overbudget_presets( + capsys: pytest.CaptureFixture[str], +) -> None: + """Tight time budget must flag every preset that exceeds it with RED severity. + + Previously the budget path used a tautological severity expression and the + breach never escalated the finding — covers the regression.""" + main( + [ + "recommend", + "--n-samples", + "1000", + "--n-classes", + "10", + "--avg-tokens", + "20", + "--budget-vram-gb", + "48", + "--budget-time-h", + "0.0001", + "--json", + ] + ) + payload = json.loads(capsys.readouterr().out) + flagged = [ + r + for r in payload["results"] + if any(f["severity"] == "red" and "exceeds budget" in f["message"] for f in r["report"]["findings"]) + ] + assert flagged, "budget-time-h breach should produce RED severity findings" + # Any preset above the budget must be marked infeasible. + for r in flagged: + assert r["report"]["is_feasible"] is False + + if __name__ == "__main__": sys.exit(pytest.main([__file__, "-v"])) diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py index 0317ff27b..713db27f7 100644 --- a/tests/advisor/test_estimates_internals.py +++ b/tests/advisor/test_estimates_internals.py @@ -2,6 +2,8 @@ from __future__ import annotations +from typing import Any + import pytest from autointent._advisor import _estimates, _hub @@ -109,22 +111,19 @@ def meta(self) -> ModelMeta: confidence="hub", ) - def test_full_finetune_is_larger_than_lora_is_larger_than_inference( - self, meta: ModelMeta - ) -> None: + def test_full_finetune_is_larger_than_lora_is_larger_than_inference(self, meta: ModelMeta) -> None: inference = _vram_for_transformer(meta, "inference", mixed_precision=False) lora = _vram_for_transformer(meta, "lora", mixed_precision=False) full = _vram_for_transformer(meta, "full-finetune", mixed_precision=False) assert inference < lora < full - def test_amp_does_not_naively_halve(self, meta: ModelMeta) -> None: - 
"""The proposal calls out that AMP doesn't halve total VRAM — fp32 master - weights and Adam moments don't shrink. Weight-side accounting comes out - equal to fp32; the only savings (activations) aren't modeled by us.""" + def test_amp_partially_reduces_full_finetune_vram(self, meta: ModelMeta) -> None: + """AMP saves on fp16 weights+grads (W down from 2W); Adam state stays + fp32 (2W). Total 3W vs fp32's 4W — real but not a full halving.""" full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False) full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True) - assert full_amp / full_fp32 == pytest.approx(1.0) - assert full_amp / full_fp32 > 0.5 # explicit check vs the naive-halving formula + assert full_amp < full_fp32 + assert full_amp / full_fp32 == pytest.approx(0.75) def test_reranker_uses_inference_class(self, meta: ModelMeta) -> None: inference = _vram_for_transformer(meta, "inference", mixed_precision=False) @@ -155,9 +154,7 @@ def test_dump_modules_adds_disk_during_training(self) -> None: "search_space": [ { "module_name": "bert", - "classification_model_config": [ - {"model_name": "microsoft/deberta-v3-small"} - ], + "classification_model_config": [{"model_name": "microsoft/deberta-v3-small"}], "num_train_epochs": [3], "batch_size": [16], } @@ -179,9 +176,7 @@ def test_refit_after_increases_time(self) -> None: "search_space": [ { "module_name": "bert", - "classification_model_config": [ - {"model_name": "microsoft/deberta-v3-small"} - ], + "classification_model_config": [{"model_name": "microsoft/deberta-v3-small"}], "num_train_epochs": [3], "batch_size": [16], } @@ -207,9 +202,7 @@ def test_catboost_gpu_without_cuda_flags_config(self) -> None: ], } report = run_preflight(cfg, DatasetStats.placeholder(), _profile(accelerator="cpu")) - assert any( - f.phase == "config" and "CatBoost" in f.message for f in report.findings - ) + assert any(f.phase == "config" and "CatBoost" in f.message for f in report.findings) def 
test_catboost_gpu_with_cuda_is_silent(self) -> None: cfg = { @@ -223,9 +216,7 @@ def test_catboost_gpu_with_cuda_is_silent(self) -> None: ], } report = run_preflight(cfg, DatasetStats.placeholder(), _profile(accelerator="cuda")) - assert not any( - f.phase == "config" and "CatBoost" in f.message for f in report.findings - ) + assert not any(f.phase == "config" and "CatBoost" in f.message for f in report.findings) def test_offline_flips_low_confidence(self) -> None: cfg = { @@ -277,9 +268,7 @@ def test_truncation_red_when_p95_dominates_max_length(self) -> None: { "module_name": "bert", "max_length": [128], - "classification_model_config": [ - {"model_name": "some/model"} - ], + "classification_model_config": [{"model_name": "some/model"}], } ], } @@ -299,9 +288,7 @@ def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None: { "module_name": "bert", "max_length": [128], - "classification_model_config": [ - {"model_name": "some/model"} - ], + "classification_model_config": [{"model_name": "some/model"}], } ], } @@ -312,8 +299,203 @@ def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None: yellows = [ f for f in report.findings - if f.phase == "data" - and f.severity == Severity.YELLOW - and "truncation" in f.message.lower() + if f.phase == "data" and f.severity == Severity.YELLOW and "truncation" in f.message.lower() ] assert yellows + + +class TestLinearCatboostFormulas: + """Cost surfaces for the classic (sklearn / catboost) scorers.""" + + def _embedder_node(self) -> dict[str, Any]: + return { + "node_type": "embedder", + "search_space": [ + { + "module_name": "sentence_transformer", + "embedder_config": [{"model_name": "sentence-transformers/all-MiniLM-L6-v2"}], + } + ], + } + + def test_linear_contributes_ram_and_time(self) -> None: + cfg = { + "search_space": [ + self._embedder_node(), + { + "node_type": "scoring", + "search_space": [{"module_name": "linear", "max_iter": [200]}], + }, + ], + "hpo_config": {"n_trials": 5}, + } + 
stats = DatasetStats.placeholder(n_samples=100_000, n_classes=10, avg_tokens=24) + report = run_preflight(cfg, stats, _profile()) + linear_drivers = [d for d in report.resource.drivers if d["module"] == "linear"] + assert len(linear_drivers) == 1 + assert report.resource.ram_gb > 0 + assert report.resource.time_hours > 0 + assert linear_drivers[0]["vram_gb"] == 0 # sklearn is CPU-only + + def test_logreg_cv_multiplier_dominates_multiclass_time(self) -> None: + """Multiclass linear uses LogisticRegressionCV (Cs*cv+1 ≈ 31 inner fits); + multilabel uses one LogReg per class (cv_multiplier=1). At equal n_classes, + multiclass must be much slower than the per-class multilabel path.""" + base = { + "search_space": [ + self._embedder_node(), + { + "node_type": "scoring", + "search_space": [{"module_name": "linear", "max_iter": [1000]}], + }, + ], + "hpo_config": {"n_trials": 1}, + } + multiclass = run_preflight( + base, + DatasetStats.placeholder(n_samples=100_000, n_classes=10, multilabel=False), + _profile(), + ) + multilabel = run_preflight( + base, + DatasetStats.placeholder(n_samples=100_000, n_classes=10, multilabel=True), + _profile(), + ) + # multiclass: 31 inner fits x 1 model; multilabel: 1 fit x n_classes=10 models. + # 31 > 10 => multiclass is the slower path. 
+ assert multiclass.resource.time_hours > multilabel.resource.time_hours + + def test_catboost_contributes_ram_and_time_on_cpu(self) -> None: + cfg = { + "search_space": [ + self._embedder_node(), + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "catboost", + "iterations": [1000], + "depth": [6], + } + ], + }, + ], + "hpo_config": {"n_trials": 3}, + } + stats = DatasetStats.placeholder(n_samples=100_000, n_classes=8, avg_tokens=24) + report = run_preflight(cfg, stats, _profile(accelerator="cpu")) + cb = next(d for d in report.resource.drivers if d["module"] == "catboost") + assert report.resource.ram_gb > 0 + assert report.resource.time_hours > 0 + assert cb["vram_gb"] == 0 + assert cb["mode"] == "catboost" + + def test_catboost_gpu_moves_cost_to_vram(self) -> None: + cfg = { + "search_space": [ + self._embedder_node(), + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "catboost", + "iterations": [1000], + "depth": [6], + "task_type": "GPU", + } + ], + }, + ], + "hpo_config": {"n_trials": 2}, + } + stats = DatasetStats.placeholder(n_samples=100_000, n_classes=8, avg_tokens=24) + report = run_preflight(cfg, stats, _profile(accelerator="cuda")) + cb = next(d for d in report.resource.drivers if d["module"] == "catboost") + assert report.resource.vram_gb > 0 + assert cb["ram_gb"] == 0 + assert cb["mode"] == "catboost-gpu" + + def test_linear_scales_with_n_samples(self) -> None: + cfg = { + "search_space": [ + self._embedder_node(), + { + "node_type": "scoring", + "search_space": [{"module_name": "linear"}], + }, + ], + } + small = run_preflight(cfg, DatasetStats.placeholder(n_samples=500), _profile()) + big = run_preflight(cfg, DatasetStats.placeholder(n_samples=500_000), _profile()) + assert big.resource.time_hours > small.resource.time_hours + assert big.resource.ram_gb > small.resource.ram_gb + + +class TestDumpModulesBounding: + """`dump_modules=True` writes one selected variant per node per trial — not + every candidate. 
The estimate must be bounded by sum-of-max-per-node x n_trials.""" + + def test_dump_disk_is_bounded_by_per_node_max_not_sum_of_all_variants(self) -> None: + # Two BERT candidates in the same node: only one is selected per trial. + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "classification_model_config": [ + {"model_name": "microsoft/deberta-v3-small"}, + {"model_name": "microsoft/deberta-v3-large"}, + ], + "num_train_epochs": [3], + "batch_size": [16], + } + ], + } + ], + "hpo_config": {"n_trials": 4}, + "dump_modules": True, + } + report = run_preflight(cfg, DatasetStats.placeholder(), _profile()) + # Per-node max ~ deberta-v3-large weights (~350M x 4 ~ 1.3 GB). Two-candidate + # sum would be roughly doubled. Verify we used the per-node-max bound. + small_meta = _hub.resolve_model("microsoft/deberta-v3-small") + large_meta = _hub.resolve_model("microsoft/deberta-v3-large") + expected = large_meta.weights_gb * 4 + naive_sum = (small_meta.weights_gb + large_meta.weights_gb) * 4 + assert report.resource.disk_dump_gb == pytest.approx(expected, rel=0.01) + assert report.resource.disk_dump_gb < naive_sum + + def test_dump_disk_sums_across_nodes(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "embedder", + "search_space": [ + { + "module_name": "sentence_transformer", + "embedder_config": [{"model_name": "sentence-transformers/all-MiniLM-L6-v2"}], + } + ], + }, + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "classification_model_config": [{"model_name": "microsoft/deberta-v3-small"}], + "num_train_epochs": [3], + "batch_size": [16], + } + ], + }, + ], + "hpo_config": {"n_trials": 2}, + "dump_modules": True, + } + report = run_preflight(cfg, DatasetStats.placeholder(), _profile()) + embedder = _hub.resolve_model("sentence-transformers/all-MiniLM-L6-v2") + bert = _hub.resolve_model("microsoft/deberta-v3-small") + expected = (embedder.weights_gb + 
bert.weights_gb) * 2 + assert report.resource.disk_dump_gb == pytest.approx(expected, rel=0.01) diff --git a/tests/advisor/test_hub_heuristics.py b/tests/advisor/test_hub_heuristics.py index 54a03431d..b43b95522 100644 --- a/tests/advisor/test_hub_heuristics.py +++ b/tests/advisor/test_hub_heuristics.py @@ -32,9 +32,7 @@ def _offline(monkeypatch: pytest.MonkeyPatch) -> None: ("bert-base-uncased", 70, 200), ], ) -def test_name_heuristic_picks_reasonable_bucket( - name: str, expected_min_m: int, expected_max_m: int -) -> None: +def test_name_heuristic_picks_reasonable_bucket(name: str, expected_min_m: int, expected_max_m: int) -> None: meta = _hub.resolve_model(name) assert meta.confidence == "heuristic" assert expected_min_m <= meta.params_millions <= expected_max_m, ( diff --git a/tests/advisor/test_render.py b/tests/advisor/test_render.py index e82d7573b..55a2b0ce3 100644 --- a/tests/advisor/test_render.py +++ b/tests/advisor/test_render.py @@ -4,8 +4,6 @@ import json -import pytest - from autointent._advisor._render import render_json, render_recommendation, render_text from autointent._advisor._report import ( DatasetStats, @@ -140,12 +138,14 @@ def test_shows_status_per_preset(self) -> None: def test_dataset_stats_in_text_block() -> None: stats = DatasetStats.placeholder(n_samples=777, n_classes=4) - r = PreflightReport(dataset={ - "n_samples": stats.n_samples, - "n_classes": stats.n_classes, - "avg_tokens": stats.avg_tokens, - "source": stats.source, - }) + r = PreflightReport( + dataset={ + "n_samples": stats.n_samples, + "n_classes": stats.n_classes, + "avg_tokens": stats.avg_tokens, + "source": stats.source, + } + ) out = render_text(r) assert "777" in out assert "n_classes=4" in out diff --git a/user_guides/advanced/02_embedder_configuration.py b/user_guides/advanced/02_embedder_configuration.py index 32118fed2..43ce27278 100644 --- a/user_guides/advanced/02_embedder_configuration.py +++ b/user_guides/advanced/02_embedder_configuration.py @@ -261,9 +261,7 
@@ ) # Example (does not run training here): construct an embedder and call train when you have data. -_embedder_for_ft = Embedder( - SentenceTransformerEmbeddingConfig(model_name="sentence-transformers/all-MiniLM-L6-v2") -) +_embedder_for_ft = Embedder(SentenceTransformerEmbeddingConfig(model_name="sentence-transformers/all-MiniLM-L6-v2")) # _embedder_for_ft.train(utterances=[...], labels=[...], config=ft_cfg) # %% From f927729e2c0d736023d7e7c88bbb1d84bcaf5a57 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 01:45:25 +0300 Subject: [PATCH 05/16] add more handling --- src/autointent/_advisor/_cli.py | 2 +- src/autointent/_advisor/_estimates.py | 171 +++++++++++++++--- src/autointent/_advisor/_render.py | 76 ++++++-- src/autointent/_advisor/_report.py | 17 +- .../_presets/transformers-heavy.yaml | 7 + tests/advisor/test_estimates_and_cli.py | 6 +- tests/advisor/test_estimates_internals.py | 131 ++++++++++++-- tests/advisor/test_render.py | 41 +++-- tests/advisor/test_report.py | 26 +-- 9 files changed, 385 insertions(+), 92 deletions(-) diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py index b3f43aab5..d300ad6fa 100644 --- a/src/autointent/_advisor/_cli.py +++ b/src/autointent/_advisor/_cli.py @@ -153,7 +153,7 @@ def cmd_recommend(args: argparse.Namespace) -> int: if args.budget_time_h is not None and report.resource.time_hours > args.budget_time_h: report.add( "resource", - Severity.RED, + Severity.OVER, f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {args.budget_time_h} h.", ) results.append((preset, report)) diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py index e8f619303..06d0e4fe1 100644 --- a/src/autointent/_advisor/_estimates.py +++ b/src/autointent/_advisor/_estimates.py @@ -33,6 +33,10 @@ TRANSFORMER_SCORER_MODULES = {"bert", "lora", "ptuning", "dnnc"} +# Fallback max_length when the search-space entry 
doesn't pin it. Used both as +# the default in _vram_for_transformer and in the entry-walk seq_len resolution. +_DEFAULT_SEQ_LEN = 128 + # Coefficients for the linear / catboost time formulas (proposal §"Algorithm"). _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8 _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9 @@ -95,12 +99,12 @@ def _walk_modules_indexed( yield node_idx, node_type, entry -def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) -> float: - """VRAM in GB for one trial of a transformer-based module. +def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float: + """Weight-side VRAM in GB — weights + grads + Adam optimizer state. Excludes activations. - Full fine-tune fp32: weights + grads + Adam (m, v) = 4W. - Full fine-tune AMP: fp16 weights + fp16 grads + fp32 master copy + fp32 Adam = 3W. - (Activations are not modeled separately.) + Full fine-tune fp32: W + W + 2W (Adam m, v) = 4W. + Full fine-tune AMP: 0.5W (fp16 weights) + 0.5W (fp16 grads) + W (fp32 master) + 2W (fp32 Adam) = 4W. + AMP's savings live in activations, not the optimizer — the weight side is identical. """ weights_gb = meta.weights_gb if mode == "inference": @@ -109,16 +113,111 @@ def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) -> return weights_gb * 1.3 + 0.5 if mode == "reranker": return weights_gb * 1.5 - if mixed_precision: - return weights_gb * 3.0 return weights_gb * 4.0 +def _vram_for_transformer( + meta: ModelMeta, + mode: str, + mixed_precision: bool, + *, + batch_size: int = 0, + seq_len: int = _DEFAULT_SEQ_LEN, +) -> float: + """Total VRAM in GB: weights + grads + optimizer state + activations × batch. + + Activation accounting differs by mode — training keeps per-layer outputs for + backward; inference only needs one or two layers in flight. 
+ """ + base = _weights_vram_for_transformer(meta, mode) + if batch_size <= 0: + return base + per_sample = _activations_gb_per_sample( + meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference" + ) + return base + per_sample * batch_size + + def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float: """RAM in GB. Loose upper bound.""" return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / (1024**3) +def _floor_to_power_of_two(n: int) -> int: + """Largest power of two ≤ n; returns 0 when n < 1.""" + if n < 1: + return 0 + power = 1 + while power * 2 <= n: + power *= 2 + return power + + +def _n_layers(meta: ModelMeta | None) -> int: + """Coarse layer-count guess from parameter count. + + MiniLM (33M) ~6, BERT-base (110M) ~12, BERT-large (350M) ~24. + """ + if meta is None: + return 12 + params = meta.params_millions + if params >= 300: + return 24 + if params >= 100: + return 12 + if params >= 50: + return 8 + return 6 + + +def _activations_gb_per_sample( + meta: ModelMeta | None, + seq_len: int, + *, + mixed_precision: bool, + is_training: bool, +) -> float: + """Heuristic activation memory per sample. + + Training: ``seq_len × hidden × layers × const`` — per-layer outputs are kept + for backward. + Inference: ``seq_len × hidden × const`` — only one or two layers' outputs in + flight at once. + Mixed precision halves activation bytes. + """ + hidden = _embedder_dim(meta) + if is_training: + # Training keeps every layer's outputs for backward → scales × n_layers. + # The 16-byte/token/layer coefficient bundles fp32 activation + ~4× backward overhead. + bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16 + else: + # Inference only holds ~1-2 layers' outputs in flight at once. 
+ bytes_per_sample = seq_len * hidden * 8 + if mixed_precision: + bytes_per_sample //= 2 + return bytes_per_sample / (1024**3) + + +def _max_fitting_batch_size( + *, + weight_vram_gb: float, + vram_budget_gb: float, + per_sample_gb: float, +) -> int: + """Largest batch that keeps total VRAM under the AMPLE/TIGHT threshold. + + Returns 0 when even the weights blow the budget. Result is rounded down to + the nearest power of two. + """ + if per_sample_gb <= 0: + return 0 + target_vram = vram_budget_gb * _YELLOW + available_for_activations = target_vram - weight_vram_gb + if available_for_activations <= 0: + return 0 + return _floor_to_power_of_two(int(available_for_activations / per_sample_gb)) + + def _embedder_dim(meta: ModelMeta | None) -> int: """Coarse hidden-size guess from parameter count. @@ -211,13 +310,13 @@ def _time_for_transformer( def _classify_severity(estimate: float, budget: float) -> Severity: if budget <= 0: - return Severity.YELLOW + return Severity.TIGHT ratio = estimate / budget if ratio >= _RED: - return Severity.RED + return Severity.OVER if ratio >= _YELLOW: - return Severity.YELLOW - return Severity.GREEN + return Severity.TIGHT + return Severity.AMPLE def _resource_phase( # noqa: PLR0912 - kept linear for clarity @@ -281,20 +380,31 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity batch_size = _max_int(entry.get("batch_size"), 32) epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10) + seq_len = _max_int(entry.get("max_length"), _DEFAULT_SEQ_LEN) - vram = _vram_for_transformer(meta, mode, mixed_precision) + vram = _vram_for_transformer(meta, mode, mixed_precision, batch_size=batch_size, seq_len=seq_len) ram = _ram_for_module(meta, stats) - time_h = 0.0 - if mode != "inference": - time_h = _time_for_transformer( - meta=meta, - n_trials=n_trials, - epochs=epochs, - batch_size=batch_size, - n_samples=stats.n_samples, - device_class=hardware.device_class, + driver_max_batch: int | None = None + if 
hardware.vram_gb > 0: + weights_vram = _weights_vram_for_transformer(meta, mode) + per_sample_gb = _activations_gb_per_sample( + meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference" ) + driver_max_batch = _max_fitting_batch_size( + weight_vram_gb=weights_vram, + vram_budget_gb=hardware.vram_gb, + per_sample_gb=per_sample_gb, + ) + + time_h = _time_for_transformer( + meta=meta, + n_trials=n_trials, + epochs=epochs, + batch_size=batch_size, + n_samples=stats.n_samples, + device_class=hardware.device_class, + ) if refit_after and mode != "inference": time_h *= 1 + 1.0 / max(1, n_trials) @@ -311,6 +421,8 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity "vram_gb": round(vram, 2), "ram_gb": round(ram, 2), "time_hours": round(time_h, 2), + "batch_size": batch_size, + "max_batch_size": driver_max_batch, "confidence": meta.confidence, } ) @@ -381,6 +493,8 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity "vram_gb": round(vram, 2), "ram_gb": round(ram, 2), "time_hours": round(time_h, 2), + "batch_size": None, + "max_batch_size": None, "confidence": confidence, } ) @@ -409,7 +523,7 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity if hardware.accelerator == "cpu" and effective_vram > 0: report.add( "resource", - Severity.YELLOW, + Severity.TIGHT, f"No GPU detected; transformer modules will be very slow (worst case ~{estimate.time_hours:.1f} h).", metric="vram", ) @@ -420,6 +534,7 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity msg += f" vs available {hardware.vram_gb:.1f} GB" report.add("resource", vram_sev, msg, metric="vram") + ram_sev = _classify_severity(estimate.ram_gb, hardware.ram_gb) report.add( "resource", @@ -440,7 +555,7 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity if estimate.time_hours > 0: time_msg = f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)" - report.add("resource", Severity.GREEN, time_msg, metric="time") + 
report.add("resource", Severity.AMPLE, time_msg, metric="time") def _config_phase( @@ -454,7 +569,7 @@ def _config_phase( if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}: report.add( "config", - Severity.YELLOW, + Severity.TIGHT, f"hpo_config.n_jobs={n_jobs} on a single GPU multiplies VRAM demand by {n_jobs}×.", ) @@ -466,7 +581,7 @@ def _config_phase( if uses_catboost_gpu and hardware.accelerator != "cuda": report.add( "config", - Severity.YELLOW, + Severity.TIGHT, "CatBoost task_type=GPU configured but no CUDA detected — will fall back to CPU.", ) @@ -484,7 +599,7 @@ def _data_phase( continue max_len = _max_int(max_len_value, 512) if p95 > max_len: - severity = Severity.RED if p95 > max_len * 1.5 else Severity.YELLOW + severity = Severity.OVER if p95 > max_len * 1.5 else Severity.TIGHT report.add( "data", severity, @@ -496,7 +611,7 @@ def _data_phase( if has_linear and stats.rare_classes: report.add( "data", - Severity.RED, + Severity.OVER, (f"LogisticRegressionCV (cv=3) will fail: classes {stats.rare_classes[:5]} have <3 samples."), ) @@ -507,7 +622,7 @@ def _data_phase( if has_description and stats.has_descriptions is False: report.add( "data", - Severity.RED, + Severity.OVER, "description scorer present but intent descriptions are missing — fill them in or drop the scorer.", ) diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py index fe0f32dd7..a3778d307 100644 --- a/src/autointent/_advisor/_render.py +++ b/src/autointent/_advisor/_render.py @@ -13,11 +13,69 @@ if TYPE_CHECKING: from ._report import PreflightReport -_SEVERITY_TAG = {"green": "✓", "yellow": "⚠", "red": "✗"} +_SEVERITY_TAG = {"ample": "✓", "tight": "⚠", "over": "✗"} _PHASE_ORDER = ("resource", "data", "config") _PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"} +def _batch_hint(driver: dict) -> str: + """Per-driver batch annotation: '64 → 32', '64', '64 (no fit)', or ''.""" + bs = driver.get("batch_size") + if bs is None: + 
return "" + mx = driver.get("max_batch_size") + if mx is None: + return str(bs) + if mx == 0: + return f"{bs} (no fit)" + if mx == bs: + return str(bs) + return f"{bs} → {mx}" + + +_DRIVERS_LIMIT = 8 +_DRIVERS_HEADERS = ("Node", "Model", "Mode", "VRAM", "Time", "Batch", "Source") + + +def _render_drivers_table(drivers: list[dict]) -> list[str]: + """Format the Drivers of cost section as an aligned table.""" + visible = drivers[:_DRIVERS_LIMIT] + rows: list[tuple[str, ...]] = [] + for d in visible: + rows.append(( + f"{d['node_type']}.{d['module']}", + str(d["model"]), + str(d["mode"]), + f"{d['vram_gb']:.2f} GB", + f"{d['time_hours']:.2f} h", + _batch_hint(d), + f"[{d['confidence']}]", + )) + + widths = [len(h) for h in _DRIVERS_HEADERS] + for row in rows: + for i, cell in enumerate(row): + widths[i] = max(widths[i], len(cell)) + + # Right-align numeric columns (VRAM @ idx 3, Time @ idx 4); left-align the rest. + right_align = {3, 4} + + def fmt(row: tuple[str, ...]) -> str: + cells = [] + for i, cell in enumerate(row): + if i in right_align: + cells.append(cell.rjust(widths[i])) + else: + cells.append(cell.ljust(widths[i])) + return " " + " ".join(cells).rstrip() + + lines = ["Drivers of cost:", fmt(_DRIVERS_HEADERS), " " + " ".join("─" * w for w in widths)] + lines.extend(fmt(r) for r in rows) + if len(drivers) > _DRIVERS_LIMIT: + lines.append(f" … and {len(drivers) - _DRIVERS_LIMIT} more") + return lines + + def render_text(report: PreflightReport) -> str: lines: list[str] = [] title = "Compute feasibility check" @@ -50,15 +108,7 @@ def render_text(report: PreflightReport) -> str: lines.append("") if report.resource.drivers: - lines.append("Drivers of cost:") - for d in report.resource.drivers[:8]: - lines.append( - f" {d['node_type']}.{d['module']:<10} {d['model']:<48}" - f" {d['mode']:<14} VRAM ~{d['vram_gb']} GB, time ~{d['time_hours']} h" - f" [{d['confidence']}]" - ) - if len(report.resource.drivers) > 8: - lines.append(f" … and 
{len(report.resource.drivers) - 8} more") + lines.extend(_render_drivers_table(report.resource.drivers)) lines.append("") if report.notes: @@ -68,7 +118,7 @@ def render_text(report: PreflightReport) -> str: lines.append("") summary = f"Verdict: {'feasible' if report.is_feasible else 'INFEASIBLE'} " - summary += f"(worst severity: {report.worst_severity.value})" + summary += f"(headroom: {report.headroom.value})" if report.low_confidence: summary += " — low-confidence (heuristic fallback in use)" lines.append(summary) @@ -91,7 +141,7 @@ def render_recommendation( else: lines.append(" → none of the bundled presets fit your hardware as-is.") lines.append("") - lines.append(f"{'Preset':<24} {'Status':<14} {'VRAM':<10} {'Time':<10} {'Worst':<8}") + lines.append(f"{'Preset':<24} {'Status':<14} {'VRAM':<10} {'Time':<10} {'Headroom':<10}") lines.append("-" * 68) for name, report in results: verdict = "feasible" if report.is_feasible else "infeasible" @@ -99,6 +149,6 @@ def render_recommendation( f"{name:<24} {verdict:<14} " f"{report.resource.vram_gb:>4.1f} GB " f"{report.resource.time_hours:>4.1f} h " - f"{report.worst_severity.value:<8}" + f"{report.headroom.value:<8}" ) return "\n".join(lines) diff --git a/src/autointent/_advisor/_report.py b/src/autointent/_advisor/_report.py index 6b930db95..9b4a319c8 100644 --- a/src/autointent/_advisor/_report.py +++ b/src/autointent/_advisor/_report.py @@ -8,9 +8,9 @@ class Severity(str, Enum): - GREEN = "green" - YELLOW = "yellow" - RED = "red" + AMPLE = "ample" + TIGHT = "tight" + OVER = "over" Phase = Literal["resource", "data", "config"] @@ -93,19 +93,20 @@ def add(self, phase: Phase, severity: Severity, message: str, metric: str | None self.findings.append(Finding(phase=phase, severity=severity, message=message, metric=metric)) @property - def worst_severity(self) -> Severity: - order = {Severity.GREEN: 0, Severity.YELLOW: 1, Severity.RED: 2} + def headroom(self) -> Severity: + """Worst headroom level across all findings — the 
column shown in CLI reports.""" + order = {Severity.AMPLE: 0, Severity.TIGHT: 1, Severity.OVER: 2} if not self.findings: - return Severity.GREEN + return Severity.AMPLE return max((f.severity for f in self.findings), key=lambda s: order[s]) @property def is_feasible(self) -> bool: - return self.worst_severity != Severity.RED + return self.headroom != Severity.OVER def to_dict(self) -> dict[str, Any]: d = asdict(self) d["findings"] = [{**asdict(f), "severity": f.severity.value} for f in self.findings] - d["worst_severity"] = self.worst_severity.value + d["headroom"] = self.headroom.value d["is_feasible"] = self.is_feasible return d diff --git a/src/autointent/_presets/transformers-heavy.yaml b/src/autointent/_presets/transformers-heavy.yaml index 2576fbc82..cd15d791e 100644 --- a/src/autointent/_presets/transformers-heavy.yaml +++ b/src/autointent/_presets/transformers-heavy.yaml @@ -5,12 +5,19 @@ search_space: - module_name: bert classification_model_config: - model_name: microsoft/deberta-v3-large + - model_name: intfloat/multilingual-e5-large-instruct + - model_name: microsoft/harrier-oss-v1-27b num_train_epochs: [30] batch_size: [32, 64] learning_rate: low: 1.0e-5 high: 1.0e-4 log: True + - module_name: description_bi + embedder_config: + - model_name: microsoft/deberta-v3-large + - model_name: intfloat/multilingual-e5-large-instruct + - model_name: microsoft/harrier-oss-v1-27b - node_type: decision target_metric: decision_accuracy search_space: diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py index 00537226d..15a087e07 100644 --- a/tests/advisor/test_estimates_and_cli.py +++ b/tests/advisor/test_estimates_and_cli.py @@ -98,7 +98,7 @@ def test_cli_inspect_json_is_parseable(capsys: pytest.CaptureFixture[str]) -> No payload = json.loads(captured.out) assert payload["preset_name"] == "transformers-light" assert "findings" in payload - assert payload["worst_severity"] in {"green", "yellow", "red"} + assert 
payload["headroom"] in {"ample", "tight", "over"} # rc is 0 on feasible, 1 otherwise assert rc in (0, 1) @@ -217,9 +217,9 @@ def test_cli_recommend_budget_time_flags_red_for_overbudget_presets( flagged = [ r for r in payload["results"] - if any(f["severity"] == "red" and "exceeds budget" in f["message"] for f in r["report"]["findings"]) + if any(f["severity"] == "over" and "exceeds budget" in f["message"] for f in r["report"]["findings"]) ] - assert flagged, "budget-time-h breach should produce RED severity findings" + assert flagged, "budget-time-h breach should produce OVER severity findings" # Any preset above the budget must be marked infeasible. for r in flagged: assert r["report"]["is_feasible"] is False diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py index 713db27f7..5ac66af2f 100644 --- a/tests/advisor/test_estimates_internals.py +++ b/tests/advisor/test_estimates_internals.py @@ -86,17 +86,17 @@ def test_empty_entry(self) -> None: class TestClassifySeverity: def test_below_yellow_is_green(self) -> None: - assert _classify_severity(estimate=1.0, budget=10.0) == Severity.GREEN + assert _classify_severity(estimate=1.0, budget=10.0) == Severity.AMPLE def test_above_yellow_threshold(self) -> None: - assert _classify_severity(estimate=8.0, budget=10.0) == Severity.YELLOW + assert _classify_severity(estimate=8.0, budget=10.0) == Severity.TIGHT def test_at_or_above_red_threshold(self) -> None: - assert _classify_severity(estimate=10.0, budget=10.0) == Severity.RED - assert _classify_severity(estimate=12.0, budget=10.0) == Severity.RED + assert _classify_severity(estimate=10.0, budget=10.0) == Severity.OVER + assert _classify_severity(estimate=12.0, budget=10.0) == Severity.OVER def test_zero_budget_returns_yellow(self) -> None: - assert _classify_severity(estimate=1.0, budget=0.0) == Severity.YELLOW + assert _classify_severity(estimate=1.0, budget=0.0) == Severity.TIGHT class TestVramForTransformer: @@ -117,13 
+117,34 @@ def test_full_finetune_is_larger_than_lora_is_larger_than_inference(self, meta: full = _vram_for_transformer(meta, "full-finetune", mixed_precision=False) assert inference < lora < full - def test_amp_partially_reduces_full_finetune_vram(self, meta: ModelMeta) -> None: - """AMP saves on fp16 weights+grads (W down from 2W); Adam state stays - fp32 (2W). Total 3W vs fp32's 4W — real but not a full halving.""" - full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False) - full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True) - assert full_amp < full_fp32 - assert full_amp / full_fp32 == pytest.approx(0.75) + def test_inference_activations_are_smaller_than_training(self, meta: ModelMeta) -> None: + """Inference doesn't store per-layer outputs for backward — activation memory + should be many times smaller than training at the same batch_size.""" + train_total = _vram_for_transformer(meta, "full-finetune", False, batch_size=64, seq_len=128) + train_weights = _vram_for_transformer(meta, "full-finetune", False, batch_size=0) + inf_total = _vram_for_transformer(meta, "inference", False, batch_size=64, seq_len=128) + inf_weights = _vram_for_transformer(meta, "inference", False, batch_size=0) + train_acts = train_total - train_weights + inf_acts = inf_total - inf_weights + assert inf_acts > 0 + assert train_acts > inf_acts + # 12-layer model: training activations should be at least ~5× inference. + assert train_acts / inf_acts > 5 + + def test_amp_does_not_reduce_weight_side_vram(self, meta: ModelMeta) -> None: + """Weight-side AMP accounting: fp16 weights+grads (W) + fp32 master copy (W) + + fp32 Adam moments (2W) = 4W, identical to pure fp32. 
AMP's savings live + in activations, not the optimizer.""" + full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False, batch_size=0) + full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=0) + assert full_amp == pytest.approx(full_fp32) + + def test_amp_does_reduce_activation_side_vram(self, meta: ModelMeta) -> None: + """When a batch is configured, AMP halves activation bytes — total VRAM + with batch should be strictly smaller under AMP than fp32.""" + fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False, batch_size=64, seq_len=128) + amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=64, seq_len=128) + assert amp < fp32 def test_reranker_uses_inference_class(self, meta: ModelMeta) -> None: inference = _vram_for_transformer(meta, "inference", mixed_precision=False) @@ -255,7 +276,7 @@ def test_rare_classes_with_linear_scorer_flag_red(self) -> None: ) report = run_preflight(cfg, stats, _profile()) assert any( - f.phase == "data" and "LogisticRegressionCV" in f.message and f.severity == Severity.RED + f.phase == "data" and "LogisticRegressionCV" in f.message and f.severity == Severity.OVER for f in report.findings ) @@ -276,7 +297,7 @@ def test_truncation_red_when_p95_dominates_max_length(self) -> None: } stats = DatasetStats(n_samples=500, n_classes=5, avg_tokens=50, p95_tokens=400) report = run_preflight(cfg, stats, _profile()) - red = [f for f in report.findings if f.phase == "data" and f.severity == Severity.RED] + red = [f for f in report.findings if f.phase == "data" and f.severity == Severity.OVER] assert red, "p95=400 > 1.5 * max_length=128 should be red" def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None: @@ -299,7 +320,7 @@ def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None: yellows = [ f for f in report.findings - if f.phase == "data" and f.severity == Severity.YELLOW and "truncation" in 
f.message.lower() + if f.phase == "data" and f.severity == Severity.TIGHT and "truncation" in f.message.lower() ] assert yellows @@ -431,6 +452,86 @@ def test_linear_scales_with_n_samples(self) -> None: assert big.resource.ram_gb > small.resource.ram_gb +class TestPerDriverBatchHint: + """Each transformer driver carries its own (batch_size, max_batch_size) for rendering.""" + + def _bert_cfg(self, model_name: str, batch_size: int) -> dict[str, Any]: + return { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "classification_model_config": [{"model_name": model_name}], + "num_train_epochs": [3], + "batch_size": [batch_size], + } + ], + } + ], + "hpo_config": {"n_trials": 1}, + } + + def test_driver_records_current_and_max_batch(self) -> None: + report = run_preflight( + self._bert_cfg("microsoft/deberta-v3-large", batch_size=64), + DatasetStats.placeholder(), + _profile(vram_gb=10.0), + ) + drivers = [d for d in report.resource.drivers if d["module"] == "bert"] + assert drivers + d = drivers[0] + assert d["batch_size"] == 64 + # vram_gb=10 + 5 GB weights → some room for activations, max < 64. 
+ assert d["max_batch_size"] is not None + assert 0 < d["max_batch_size"] < 64 + + def test_max_batch_zero_when_weights_alone_overflow(self) -> None: + report = run_preflight( + self._bert_cfg("microsoft/deberta-v3-large", batch_size=64), + DatasetStats.placeholder(), + _profile(vram_gb=2.0), + ) + d = next(d for d in report.resource.drivers if d["module"] == "bert") + assert d["max_batch_size"] == 0 + + def test_max_batch_can_be_larger_than_current(self) -> None: + report = run_preflight( + self._bert_cfg("microsoft/deberta-v3-large", batch_size=32), + DatasetStats.placeholder(), + _profile(vram_gb=64.0), + ) + d = next(d for d in report.resource.drivers if d["module"] == "bert") + assert d["max_batch_size"] is not None and d["max_batch_size"] > 32 + + def test_multiple_drivers_carry_independent_max_batch(self) -> None: + cfg = { + "search_space": [ + { + "node_type": "scoring", + "search_space": [ + { + "module_name": "bert", + "classification_model_config": [ + {"model_name": "microsoft/deberta-v3-small"}, + {"model_name": "microsoft/deberta-v3-large"}, + ], + "num_train_epochs": [3], + "batch_size": [64], + } + ], + } + ], + "hpo_config": {"n_trials": 1}, + } + report = run_preflight(cfg, DatasetStats.placeholder(), _profile(vram_gb=10.0)) + small = next(d for d in report.resource.drivers if "small" in d["model"]) + large = next(d for d in report.resource.drivers if "large" in d["model"]) + # The smaller model has more headroom → larger max batch (or equal-cap when both saturate). + assert small["max_batch_size"] >= large["max_batch_size"] + + class TestDumpModulesBounding: """`dump_modules=True` writes one selected variant per node per trial — not every candidate. 
The estimate must be bounded by sum-of-max-per-node x n_trials.""" diff --git a/tests/advisor/test_render.py b/tests/advisor/test_render.py index 55a2b0ce3..2c0604a11 100644 --- a/tests/advisor/test_render.py +++ b/tests/advisor/test_render.py @@ -4,7 +4,7 @@ import json -from autointent._advisor._render import render_json, render_recommendation, render_text +from autointent._advisor._render import _batch_hint, render_json, render_recommendation, render_text from autointent._advisor._report import ( DatasetStats, PreflightReport, @@ -46,8 +46,8 @@ def _populated_report() -> PreflightReport: ), notes=["MPS unified memory note"], ) - r.add("resource", Severity.YELLOW, "VRAM ~6 GB vs available 8 GB") - r.add("data", Severity.RED, "rare classes blocked") + r.add("resource", Severity.TIGHT, "VRAM ~6 GB vs available 8 GB") + r.add("data", Severity.OVER, "rare classes blocked") return r @@ -64,10 +64,10 @@ def test_includes_drivers_block(self) -> None: assert "Drivers of cost:" in out assert "x/y" in out - def test_verdict_reflects_worst_severity(self) -> None: + def test_verdict_reflects_headroom(self) -> None: out = render_text(_populated_report()) assert "Verdict: INFEASIBLE" in out - assert "worst severity: red" in out + assert "headroom: over" in out def test_disclaimer_always_present(self) -> None: out = render_text(_populated_report()) @@ -96,25 +96,25 @@ def test_is_valid_json(self) -> None: def test_findings_have_string_severity(self) -> None: d = json.loads(render_json(_populated_report())) for f in d["findings"]: - assert f["severity"] in {"green", "yellow", "red"} + assert f["severity"] in {"ample", "tight", "over"} - def test_worst_severity_and_feasibility_serialized(self) -> None: + def test_headroom_and_feasibility_serialized(self) -> None: d = json.loads(render_json(_populated_report())) - assert d["worst_severity"] == "red" + assert d["headroom"] == "over" assert d["is_feasible"] is False def test_empty_report_serializes(self) -> None: d = 
json.loads(render_json(PreflightReport())) - assert d["worst_severity"] == "green" + assert d["headroom"] == "ample" assert d["is_feasible"] is True class TestRenderRecommendation: def _two_reports(self) -> list[tuple[str, PreflightReport]]: a = PreflightReport(preset_name="a", resource=ResourceEstimate(vram_gb=2.0, time_hours=0.5)) - a.add("resource", Severity.GREEN, "ok") + a.add("resource", Severity.AMPLE, "ok") b = PreflightReport(preset_name="b", resource=ResourceEstimate(vram_gb=8.0, time_hours=4.0)) - b.add("resource", Severity.RED, "too big") + b.add("resource", Severity.OVER, "too big") return [("a", a), ("b", b)] def test_lists_chosen_preset_when_present(self) -> None: @@ -136,6 +136,25 @@ def test_shows_status_per_preset(self) -> None: assert "infeasible" in out +class TestBatchHint: + """Per-driver batch cell rendered in the Drivers-of-cost table.""" + + def test_arrow_when_max_differs(self) -> None: + assert _batch_hint({"batch_size": 64, "max_batch_size": 32}) == "64 → 32" + + def test_plain_when_max_equals_current(self) -> None: + assert _batch_hint({"batch_size": 64, "max_batch_size": 64}) == "64" + + def test_no_fit_label_when_max_zero(self) -> None: + assert _batch_hint({"batch_size": 64, "max_batch_size": 0}) == "64 (no fit)" + + def test_empty_when_no_batch(self) -> None: + assert _batch_hint({"batch_size": None, "max_batch_size": None}) == "" + + def test_increase_arrow(self) -> None: + assert _batch_hint({"batch_size": 32, "max_batch_size": 128}) == "32 → 128" + + def test_dataset_stats_in_text_block() -> None: stats = DatasetStats.placeholder(n_samples=777, n_classes=4) r = PreflightReport( diff --git a/tests/advisor/test_report.py b/tests/advisor/test_report.py index 52f2e675e..28adbfc34 100644 --- a/tests/advisor/test_report.py +++ b/tests/advisor/test_report.py @@ -14,22 +14,22 @@ class TestSeverityOrdering: - def test_worst_severity_on_empty_report_is_green(self) -> None: - assert PreflightReport().worst_severity == Severity.GREEN + def 
test_headroom_on_empty_report_is_green(self) -> None: + assert PreflightReport().headroom == Severity.AMPLE def test_red_beats_yellow_beats_green(self) -> None: r = PreflightReport() - r.add("resource", Severity.GREEN, "ok") - r.add("data", Severity.YELLOW, "warn") - assert r.worst_severity == Severity.YELLOW - r.add("config", Severity.RED, "fail") - assert r.worst_severity == Severity.RED + r.add("resource", Severity.AMPLE, "ok") + r.add("data", Severity.TIGHT, "warn") + assert r.headroom == Severity.TIGHT + r.add("config", Severity.OVER, "fail") + assert r.headroom == Severity.OVER def test_is_feasible_flips_on_any_red(self) -> None: r = PreflightReport() - r.add("resource", Severity.YELLOW, "warn") + r.add("resource", Severity.TIGHT, "warn") assert r.is_feasible is True - r.add("data", Severity.RED, "fail") + r.add("data", Severity.OVER, "fail") assert r.is_feasible is False @@ -62,12 +62,12 @@ def test_total_disk_ignores_cached(self) -> None: class TestToDictSerialization: def test_findings_round_trip_severity_as_string(self) -> None: r = PreflightReport() - r.add("resource", Severity.RED, "boom") + r.add("resource", Severity.OVER, "boom") d = r.to_dict() - assert d["worst_severity"] == "red" + assert d["headroom"] == "over" assert d["is_feasible"] is False assert d["findings"] == [ - {"phase": "resource", "severity": "red", "message": "boom", "metric": None}, + {"phase": "resource", "severity": "over", "message": "boom", "metric": None}, ] def test_hardware_and_dataset_pass_through(self) -> None: @@ -80,6 +80,6 @@ def test_hardware_and_dataset_pass_through(self) -> None: assert d["dataset"]["n_samples"] == 100 def test_finding_is_frozen(self) -> None: - f = Finding(phase="resource", severity=Severity.GREEN, message="ok") + f = Finding(phase="resource", severity=Severity.AMPLE, message="ok") with pytest.raises(Exception): # noqa: PT011 - dataclass.FrozenInstanceError varies f.message = "changed" # type: ignore[misc] From 82a78287bdbb1bec9e4bce6dda9804f7a801fe19 
Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 02:26:01 +0300 Subject: [PATCH 06/16] add more handling --- pyproject.toml | 1 + src/autointent/_advisor/_cli.py | 101 +++++++++++++++--- src/autointent/_advisor/_estimates.py | 135 +++++++++++++++--------- src/autointent/_advisor/_hub.py | 10 +- tests/advisor/test_estimates_and_cli.py | 2 +- 5 files changed, 183 insertions(+), 66 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8675e7ea3..4ee34b9c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ dependencies = [ "aiometer (>=1.0.0,<2.0.0)", "aiofiles (>=24.1.0,<25.0.0)", "threadpoolctl (>=3.0.0,<4.0.0)", + "psutil (>=5.9.0,<8.0.0)", ] [project.optional-dependencies] diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py index d300ad6fa..c9485cc81 100644 --- a/src/autointent/_advisor/_cli.py +++ b/src/autointent/_advisor/_cli.py @@ -5,9 +5,10 @@ * ``inspect`` — show what a given preset / config will cost on this machine. * ``recommend`` — pick the best-fitting bundled preset for this machine. -Both subcommands accept either a real ``--dataset`` (path to load with -``Dataset.from_*`` constructors) or ``--n-samples / --n-classes / --avg-tokens`` -placeholders so the script is useful before the user has built a dataset. +Both subcommands accept either a real ``--dataset`` (Hub id or local +csv/json/jsonl/parquet path loaded via ``datasets.load_dataset``) or +``--n-samples / --n-classes / --avg-tokens`` placeholders so the script is +useful before the user has built a dataset. 
""" from __future__ import annotations @@ -19,8 +20,8 @@ from typing import Any import yaml +from datasets import ClassLabel, Sequence, load_dataset -from autointent import Dataset from autointent.utils import load_preset from ._estimates import run_preflight @@ -79,35 +80,107 @@ def _stats_from_args(args: argparse.Namespace) -> DatasetStats: ) +_UTTERANCE_COLS = ("utterance", "text", "sentence", "query", "input") +_LABEL_COLS = ("label", "labels", "intent", "target") +# Map file extension → datasets builder name. Anything else is treated as a Hub +# repo id or a directory and passed to load_dataset directly. +_FILE_BUILDERS = {".csv": "csv", ".tsv": "csv", ".json": "json", ".jsonl": "json", ".parquet": "parquet"} + + def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats: - """Best-effort: load a dataset from disk via the existing Dataset constructor.""" + """Best-effort: load via HF ``datasets.load_dataset``. + + Accepts a Hub repo id ('DeepPavlov/clinc150') or a local file path + (.csv / .json / .jsonl / .parquet) / dataset directory. Falls back to a + placeholder on any loader error so the advisor stays best-effort. 
+ """ + builder = _FILE_BUILDERS.get(Path(path).suffix.lower()) try: - ds = Dataset.from_json(path) if path.endswith(".json") else Dataset.from_hub(path) - except (OSError, ValueError) as e: + ds = load_dataset(builder, data_files=path) if builder else load_dataset(path) + except (OSError, ValueError, FileNotFoundError) as e: logger.warning("Failed to load dataset %s: %s", path, e) return DatasetStats.placeholder(multilabel=multilabel) - train = ds.get("train") or next(iter(ds.values()), None) + train = ds["train"] if "train" in ds else next(iter(ds.values()), None) if train is None: return DatasetStats.placeholder(multilabel=multilabel) - utt_col = getattr(ds, "utterance_feature", "utterance") + cols = train.column_names + utt_col = next((c for c in _UTTERANCE_COLS if c in cols), cols[0] if cols else None) + label_col = next((c for c in _LABEL_COLS if c in cols), None) + + detected_multilabel, n_classes = _label_shape(train, label_col, fallback_multilabel=multilabel) + sample = train[:1000] if len(train) > 1000 else train[:] - lengths = [len(str(s).split()) for s in sample.get(utt_col, [])] + lengths = [len(str(s).split()) for s in (sample.get(utt_col, []) if utt_col else [])] avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32 - p95 = sorted(lengths)[int(len(lengths) * 0.95)] if lengths else avg_tokens * 2 + if lengths: + sorted_lengths = sorted(lengths) + idx = max(0, min(len(sorted_lengths) - 1, int(round((len(sorted_lengths) - 1) * 0.95)))) + p95 = sorted_lengths[idx] + else: + p95 = avg_tokens * 2 return DatasetStats( n_samples=len(train), - n_classes=getattr(ds, "n_classes", 0) or 0, + n_classes=n_classes, avg_tokens=avg_tokens, p95_tokens=p95, - multilabel=getattr(ds, "multilabel", multilabel), - has_descriptions=getattr(ds, "has_descriptions", None), + multilabel=detected_multilabel, + has_descriptions=None, + rare_classes=_rare_classes(train, label_col, detected_multilabel, n_classes) if label_col else [], source=f"dataset:{path}", ) 
+def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]: + """Derive (multilabel, n_classes) from the HF feature schema, with a value-based fallback.""" + if label_col is None: + return fallback_multilabel, 0 + feature = train.features.get(label_col) + if isinstance(feature, Sequence): + inner = feature.feature + if isinstance(inner, ClassLabel): + return True, inner.num_classes + # Sequence of plain ints — n_classes = max label index + 1. + max_idx = max((max(row) for row in train[label_col] if row), default=-1) + return True, max_idx + 1 + if isinstance(feature, ClassLabel): + return False, feature.num_classes + # Plain int/string column. Detect multilabel from the first non-empty row, then count uniques. + is_multi = len(train) > 0 and isinstance(train[0][label_col], (list, tuple)) + if is_multi: + max_idx = max((max(row) for row in train[label_col] if row), default=-1) + return True, max_idx + 1 + return False, len({label for label in train[label_col] if label is not None}) + + +def _rare_classes(train: Any, label_col: str, multilabel: bool, n_classes: int, min_count: int = 3) -> list[str]: + """Return labels with fewer than ``min_count`` samples in the train split. + + Used to surface the LogisticRegressionCV(cv=3) failure case before fit. + Returns an empty list on any error so the advisor stays best-effort. 
+ """ + try: + labels = train[label_col] + except (KeyError, AttributeError, TypeError): + return [] + counts: dict[str, int] = {} + if multilabel: + for row in labels: + if not row: + continue + for i, v in enumerate(row): + if v: + counts[str(i)] = counts.get(str(i), 0) + 1 + for i in range(n_classes): + counts.setdefault(str(i), 0) + else: + for label in labels: + counts[str(label)] = counts.get(str(label), 0) + 1 + return sorted(name for name, c in counts.items() if c < min_count) + + def _add_common_dataset_args(p: argparse.ArgumentParser) -> None: p.add_argument("--dataset", help="Path or hub id of a dataset; overrides placeholders.") p.add_argument("--n-samples", type=int, default=1_000, help="Placeholder training set size.") diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py index 06d0e4fe1..e88fffd0a 100644 --- a/src/autointent/_advisor/_estimates.py +++ b/src/autointent/_advisor/_estimates.py @@ -12,15 +12,49 @@ from collections.abc import Iterable from typing import Any +from pydantic import BaseModel, ConfigDict, Field, ValidationError + +from autointent.configs._optimization import HPOConfig + from ._hardware import HardwareProfile from ._hub import ModelMeta, hub_reachable, resolve_model from ._report import DatasetStats, PreflightReport, ResourceEstimate, Severity logger = logging.getLogger(__name__) -# yellow / red thresholds as fraction of available budget -_YELLOW = 0.7 -_RED = 1.0 + +class _AdvisorConfig(BaseModel): + """Validated view of the advisor's input config. + + Wraps the four top-level keys the phase helpers read. Unknown top-level + keys are ignored (preset YAMLs carry extra metadata the advisor doesn't model). 
+ """ + + model_config = ConfigDict(extra="ignore") + + hpo_config: HPOConfig = Field(default_factory=HPOConfig) + search_space: list[dict[str, Any]] = Field(default_factory=list) + refit_after: bool = False + dump_modules: bool = False + embedder_config: dict[str, Any] | None = None + + +def _validated_config(config: dict[str, Any]) -> _AdvisorConfig: + """Validate ``config`` against ``_AdvisorConfig``; fall back to defaults on any error. + + The advisor is best-effort: a malformed user config should still produce a + report (with placeholder costs) rather than crashing. + """ + try: + return _AdvisorConfig.model_validate(config) + except ValidationError as e: + logger.warning("Advisor config failed validation; falling back to defaults: %s", e) + return _AdvisorConfig() + +# Severity thresholds as a fraction of available budget: at or above _TIGHT +# downgrades to Severity.TIGHT; at or above _OVER downgrades to Severity.OVER. +_TIGHT_RATIO = 0.7 +_OVER_RATIO = 1.0 # rough per-step seconds, keyed on device class. Scaled by params_millions / 100. _PER_STEP_BASELINE_S = { @@ -31,7 +65,14 @@ "apple-silicon": 0.08, } -TRANSFORMER_SCORER_MODULES = {"bert", "lora", "ptuning", "dnnc"} +# Maps each fine-tunable transformer module to its training-mode label. +# Modules not listed are treated as inference-only. +_TRANSFORMER_TRAINING_MODE = { + "bert": "full-finetune", + "ptuning": "lora", + "lora": "lora", + "dnnc": "reranker", +} # Fallback max_length when the search-space entry doesn't pin it. Used both as # the default in _vram_for_transformer and in the entry-walk seq_len resolution. 
@@ -81,14 +122,6 @@ def _max_int(value: Any, default: int) -> int: return default -def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dict[str, Any]]]: - """Yield (node_type, module_entry) pairs.""" - for node in search_space or []: - node_type = node.get("node_type", "?") - for entry in node.get("search_space", []) or []: - yield node_type, entry - - def _walk_modules_indexed( search_space: list[dict[str, Any]], ) -> Iterable[tuple[int, str, dict[str, Any]]]: @@ -99,6 +132,12 @@ def _walk_modules_indexed( yield node_idx, node_type, entry +def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dict[str, Any]]]: + """Yield (node_type, module_entry) pairs — index-agnostic view over `_walk_modules_indexed`.""" + for _, node_type, entry in _walk_modules_indexed(search_space): + yield node_type, entry + + def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float: """Weight-side VRAM in GB — weights + grads + Adam optimizer state. Excludes activations. 
@@ -211,7 +250,7 @@ def _max_fitting_batch_size( """ if per_sample_gb <= 0: return 0 - target_vram = vram_budget_gb * _YELLOW + target_vram = vram_budget_gb * _TIGHT_RATIO available_for_activations = target_vram - weight_vram_gb if available_for_activations <= 0: return 0 @@ -309,12 +348,14 @@ def _time_for_transformer( def _classify_severity(estimate: float, budget: float) -> Severity: + if estimate <= 0: + return Severity.AMPLE if budget <= 0: return Severity.TIGHT ratio = estimate / budget - if ratio >= _RED: + if ratio >= _OVER_RATIO: return Severity.OVER - if ratio >= _YELLOW: + if ratio >= _TIGHT_RATIO: return Severity.TIGHT return Severity.AMPLE @@ -325,28 +366,27 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity hardware: HardwareProfile, report: PreflightReport, ) -> None: - hpo = config.get("hpo_config") or {} - n_trials = int(hpo.get("n_trials", 1)) - n_jobs = int(hpo.get("n_jobs", 1)) - refit_after = bool(config.get("refit_after", False)) - dump_modules = bool(config.get("dump_modules", False)) + cfg = _validated_config(config) + n_trials = max(1, cfg.hpo_config.n_trials) + n_jobs = max(1, cfg.hpo_config.n_jobs) + refit_after = cfg.refit_after + dump_modules = cfg.dump_modules if not hub_reachable(): report.low_confidence = True report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.") seen_models: dict[str, ModelMeta] = {} - estimate = ResourceEstimate(parallel_factor=max(1, n_jobs)) + estimate = ResourceEstimate(parallel_factor=n_jobs) - embedder_cfg = config.get("embedder_config") or {} - global_embedder = embedder_cfg.get("model_name") if isinstance(embedder_cfg, dict) else None + global_embedder = (cfg.embedder_config or {}).get("model_name") if global_embedder: seen_models[global_embedder] = resolve_model(global_embedder) # First pass: walk transformer-bearing modules (collects seen_models for embedder_dim lookup). 
transformer_entries: list[tuple[int, str, dict[str, Any]]] = [] classic_entries: list[tuple[int, str, dict[str, Any]]] = [] - for node_idx, node_type, entry in _walk_modules_indexed(config.get("search_space") or []): + for node_idx, node_type, entry in _walk_modules_indexed(cfg.search_space): module = entry.get("module_name", "?") if module in {"linear", "catboost"}: classic_entries.append((node_idx, node_type, entry)) @@ -367,16 +407,7 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity meta = seen_models.setdefault(name, resolve_model(name)) mixed_precision = entry.get("dtype") in {"fp16", "bf16"} - if module == "bert": - mode = "full-finetune" - elif module == "lora": - mode = "lora" - elif module == "dnnc": - mode = "reranker" - elif module == "ptuning": - mode = "full-finetune" - else: - mode = "inference" + mode = _TRANSFORMER_TRAINING_MODE.get(module, "inference") batch_size = _max_int(entry.get("batch_size"), 32) epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10) @@ -430,7 +461,11 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint. embedder_meta = _largest_embedder(seen_models) embedder_dim = _embedder_dim(embedder_meta) - class_multiplier_classic = max(1, stats.n_classes) if stats.multilabel else 1 + # Both multinomial (multiclass) and one-vs-rest (multilabel) LR scale linearly in n_classes; + # the multiclass path additionally pays the LogisticRegressionCV inner-fit multiplier. 
+ class_multiplier_classic = max(1, stats.n_classes) + confidence = embedder_meta.confidence if embedder_meta else "heuristic" + embedder_label = embedder_meta.name if embedder_meta else "(no embedder)" for _node_idx, node_type, entry in classic_entries: module = entry.get("module_name", "?") if module == "linear": @@ -449,14 +484,14 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity time_h *= 1 + 1.0 / max(1, n_trials) vram = 0.0 mode = "linear-cv" if cv_multiplier > 1 else "linear" - confidence = embedder_meta.confidence if embedder_meta else "heuristic" elif module == "catboost": iterations = _max_int(entry.get("iterations"), 1000) depth = _max_int(entry.get("depth"), 6) on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda" - # CatBoost's multiclass MultiClass loss already grows per-class trees. - cb_class_mult = max(1, stats.n_classes) - ram = _ram_for_catboost( + # CatBoost's MultiClass loss grows per-class trees only above binary; + # binary uses Logloss with one tree per iteration. 
+ cb_class_mult = max(1, stats.n_classes) if stats.n_classes > 2 or stats.multilabel else 1 + ram_total = _ram_for_catboost( stats=stats, n_features=embedder_dim, iterations=iterations, @@ -473,11 +508,8 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity ) if refit_after: time_h *= 1 + 1.0 / max(1, n_trials) - vram = ram if on_gpu else 0.0 - if on_gpu: - ram = 0.0 + vram, ram = (ram_total, 0.0) if on_gpu else (0.0, ram_total) mode = "catboost-gpu" if on_gpu else "catboost" - confidence = embedder_meta.confidence if embedder_meta else "heuristic" else: continue @@ -488,7 +520,7 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity { "node_type": node_type, "module": module, - "model": embedder_meta.name if embedder_meta else "(no embedder)", + "model": embedder_label, "mode": mode, "vram_gb": round(vram, 2), "ram_gb": round(ram, 2), @@ -515,6 +547,9 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity effective_vram = estimate.vram_gb * n_jobs else: effective_vram = estimate.vram_gb + # MPS shares one unified pool: parallel workers each allocate weights+activations + # in RAM, so peak RAM also scales with n_jobs on Apple Silicon. 
+ effective_ram = estimate.ram_gb * n_jobs if n_jobs > 1 and hardware.accelerator == "mps" else estimate.ram_gb report.resource = estimate @@ -535,11 +570,11 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity report.add("resource", vram_sev, msg, metric="vram") - ram_sev = _classify_severity(estimate.ram_gb, hardware.ram_gb) + ram_sev = _classify_severity(effective_ram, hardware.ram_gb) report.add( "resource", ram_sev, - f"RAM ~{estimate.ram_gb:.1f} GB vs available {hardware.ram_gb:.1f} GB", + f"RAM ~{effective_ram:.1f} GB vs available {hardware.ram_gb:.1f} GB", metric="ram", ) @@ -606,9 +641,10 @@ def _data_phase( f"Train tokens p95~{p95} exceeds {entry.get('module_name', '?')}.max_length={max_len}; expect silent truncation.", ) - # rare class × linear-CV + # rare class × linear-CV (LogisticRegressionCV cv=3 needs ≥3 samples/class; + # multilabel path uses one-vs-rest without CV so the failure can't occur there) has_linear = any(e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or [])) - if has_linear and stats.rare_classes: + if has_linear and stats.rare_classes and not stats.multilabel: report.add( "data", Severity.OVER, @@ -616,8 +652,9 @@ def _data_phase( ) # partial descriptions × description scorer + description_modules = {"description_bi", "description_cross", "description_llm"} has_description = any( - e.get("module_name") == "description" for _, e in _walk_modules(config.get("search_space") or []) + e.get("module_name") in description_modules for _, e in _walk_modules(config.get("search_space") or []) ) if has_description and stats.has_descriptions is False: report.add( diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py index 613ab6b40..1c559ee2f 100644 --- a/src/autointent/_advisor/_hub.py +++ b/src/autointent/_advisor/_hub.py @@ -76,7 +76,7 @@ def _is_warm_cached(model_name: str) -> bool: weight_files = ["model.safetensors", "pytorch_model.bin", 
"model.safetensors.index.json"] for fname in weight_files: path = try_to_load_from_cache(model_name, fname) - if path is not None and path is not False: + if isinstance(path, str): return True # sharded models won't match the single-file probe; fall back to a scan @@ -114,11 +114,17 @@ def _hub_metadata(model_name: str) -> ModelMeta | None: if size: total_file_bytes += int(size) + # Track whether either size came from the Hub or from the name-pattern fallback; + # if any field was filled by heuristic, downgrade confidence so the report flips + # low_confidence rather than misreporting hub-grade accuracy. + confidence = "hub" if params_millions == 0: params_millions = _heuristic_params_millions(model_name) + confidence = "heuristic" if total_file_bytes == 0: total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param) + confidence = "heuristic" return ModelMeta( name=model_name, @@ -126,7 +132,7 @@ def _hub_metadata(model_name: str) -> ModelMeta | None: weight_bytes_per_param=weight_bytes_per_param, total_file_bytes=total_file_bytes, cached_locally=_is_warm_cached(model_name), - confidence="hub", + confidence=confidence, ) diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py index 15a087e07..3092dce9e 100644 --- a/tests/advisor/test_estimates_and_cli.py +++ b/tests/advisor/test_estimates_and_cli.py @@ -150,7 +150,7 @@ def test_partial_descriptions_with_description_scorer_flags_red() -> None: { "node_type": "scoring", "search_space": [ - {"module_name": "description"}, + {"module_name": "description_bi"}, ], } ], From bbb039e576467b479e8148a87bbfd4b34299976e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 17:41:08 +0300 Subject: [PATCH 07/16] fix typing & lint --- pyproject.toml | 6 ++ src/autointent/_advisor/_cli.py | 26 ++++++-- src/autointent/_advisor/_estimates.py | 78 ++++++++++++----------- src/autointent/_advisor/_hardware.py | 14 ++-- 
src/autointent/_advisor/_hub.py | 6 +- src/autointent/_advisor/_render.py | 18 +++--- tests/advisor/test_estimates_internals.py | 5 +- tests/advisor/test_report.py | 4 +- 8 files changed, 94 insertions(+), 63 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4ee34b9c5..971de2655 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -296,6 +296,12 @@ module = [ "dspy.evaluate.auto_evaluation", "codecarbon", "catboost", + "openai", + "openai.*", + "tiktoken", + "peft", + "sentence_transformers", + "psutil", ] ignore_missing_imports = true diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py index c9485cc81..8c8b7b9d2 100644 --- a/src/autointent/_advisor/_cli.py +++ b/src/autointent/_advisor/_cli.py @@ -17,7 +17,7 @@ import logging import sys from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any import yaml from datasets import ClassLabel, Sequence, load_dataset @@ -27,10 +27,16 @@ from ._estimates import run_preflight from ._hardware import detect_hardware from ._render import render_json, render_recommendation, render_text -from ._report import DatasetStats, PreflightReport, Severity +from ._report import DatasetStats, Severity + +if TYPE_CHECKING: + from ._report import PreflightReport logger = logging.getLogger("autointent.advisor") +_SAMPLE_LIMIT = 1000 +_P95_PERCENTILE = 0.95 + BUNDLED_PRESETS = [ "transformers-heavy", "transformers-light", @@ -111,12 +117,12 @@ def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats: detected_multilabel, n_classes = _label_shape(train, label_col, fallback_multilabel=multilabel) - sample = train[:1000] if len(train) > 1000 else train[:] + sample = train[:_SAMPLE_LIMIT] if len(train) > _SAMPLE_LIMIT else train[:] lengths = [len(str(s).split()) for s in (sample.get(utt_col, []) if utt_col else [])] avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32 if lengths: sorted_lengths = sorted(lengths) - idx = max(0, 
min(len(sorted_lengths) - 1, int(round((len(sorted_lengths) - 1) * 0.95)))) + idx = max(0, min(len(sorted_lengths) - 1, round((len(sorted_lengths) - 1) * _P95_PERCENTILE))) p95 = sorted_lengths[idx] else: p95 = avg_tokens * 2 @@ -133,7 +139,7 @@ def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats: ) -def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]: +def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]: # noqa: ANN401 """Derive (multilabel, n_classes) from the HF feature schema, with a value-based fallback.""" if label_col is None: return fallback_multilabel, 0 @@ -155,7 +161,13 @@ def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool return False, len({label for label in train[label_col] if label is not None}) -def _rare_classes(train: Any, label_col: str, multilabel: bool, n_classes: int, min_count: int = 3) -> list[str]: +def _rare_classes( + train: Any, # noqa: ANN401 + label_col: str, + multilabel: bool, + n_classes: int, + min_count: int = 3, +) -> list[str]: """Return labels with fewer than ``min_count`` samples in the train split. Used to surface the LogisticRegressionCV(cv=3) failure case before fit. 
@@ -293,7 +305,7 @@ def main(argv: list[str] | None = None) -> int: level=logging.DEBUG if args.verbose else logging.WARNING, format="%(levelname)s %(name)s: %(message)s", ) - return args.func(args) + return int(args.func(args)) if __name__ == "__main__": diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py index e88fffd0a..93dfcedaf 100644 --- a/src/autointent/_advisor/_estimates.py +++ b/src/autointent/_advisor/_estimates.py @@ -9,16 +9,26 @@ from __future__ import annotations import logging -from collections.abc import Iterable -from typing import Any +from typing import TYPE_CHECKING, Any from pydantic import BaseModel, ConfigDict, Field, ValidationError from autointent.configs._optimization import HPOConfig -from ._hardware import HardwareProfile -from ._hub import ModelMeta, hub_reachable, resolve_model -from ._report import DatasetStats, PreflightReport, ResourceEstimate, Severity +from ._hub import hub_reachable, resolve_model +from ._report import PreflightReport, ResourceEstimate, Severity + +if TYPE_CHECKING: + from collections.abc import Iterable + + from ._hardware import HardwareProfile + from ._hub import ModelMeta + from ._report import DatasetStats + +_MULTICLASS_THRESHOLD = 2 +_PARAMS_LARGE = 300 +_PARAMS_BASE = 100 +_PARAMS_SMALL = 50 logger = logging.getLogger(__name__) @@ -51,6 +61,7 @@ def _validated_config(config: dict[str, Any]) -> _AdvisorConfig: logger.warning("Advisor config failed validation; falling back to defaults: %s", e) return _AdvisorConfig() + # Severity thresholds as a fraction of available budget: at or above _TIGHT # downgrades to Severity.TIGHT; at or above _OVER downgrades to Severity.OVER. 
_TIGHT_RATIO = 0.7 @@ -94,22 +105,18 @@ def _extract_model_names(module_entry: dict[str, Any]) -> list[str]: candidates: list[str] = [] cfg = module_entry.get("classification_model_config") if isinstance(cfg, list): - for c in cfg: - if isinstance(c, dict) and c.get("model_name"): - candidates.append(c["model_name"]) + candidates.extend(c["model_name"] for c in cfg if isinstance(c, dict) and c.get("model_name")) elif isinstance(cfg, dict) and cfg.get("model_name"): candidates.append(cfg["model_name"]) embedder_cfg = module_entry.get("embedder_config") if isinstance(embedder_cfg, list): - for c in embedder_cfg: - if isinstance(c, dict) and c.get("model_name"): - candidates.append(c["model_name"]) + candidates.extend(c["model_name"] for c in embedder_cfg if isinstance(c, dict) and c.get("model_name")) elif isinstance(embedder_cfg, dict) and embedder_cfg.get("model_name"): candidates.append(embedder_cfg["model_name"]) return candidates -def _max_int(value: Any, default: int) -> int: +def _max_int(value: Any, default: int) -> int: # noqa: ANN401 if value is None: return default if isinstance(value, list) and value: @@ -163,7 +170,7 @@ def _vram_for_transformer( batch_size: int = 0, seq_len: int = _DEFAULT_SEQ_LEN, ) -> float: - """Total VRAM in GB: weights + grads + optimizer state + activations × batch. + """Total VRAM in GB: weights + grads + optimizer state + activations x batch. Activation accounting differs by mode — training keeps per-layer outputs for backward; inference only needs one or two layers in flight. @@ -200,11 +207,11 @@ def _n_layers(meta: ModelMeta | None) -> int: if meta is None: return 12 params = meta.params_millions - if params >= 300: + if params >= _PARAMS_LARGE: return 24 - if params >= 100: + if params >= _PARAMS_BASE: return 12 - if params >= 50: + if params >= _PARAMS_SMALL: return 8 return 6 @@ -218,20 +225,17 @@ def _activations_gb_per_sample( ) -> float: """Heuristic activation memory per sample. 
- Training: ``seq_len × hidden × layers × const`` — per-layer outputs are kept + Training: ``seq_len x hidden x layers x const`` — per-layer outputs are kept for backward. - Inference: ``seq_len × hidden × const`` — only one or two layers' outputs in + Inference: ``seq_len x hidden x const`` — only one or two layers' outputs in flight at once. Mixed precision halves activation bytes. """ hidden = _embedder_dim(meta) - if is_training: - # Training keeps every layer's outputs for backward → scales × n_layers. - # The 16-byte/token/layer coefficient bundles fp32 activation + ~4× backward overhead. - bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16 - else: - # Inference only holds ~1-2 layers' outputs in flight at once. - bytes_per_sample = seq_len * hidden * 8 + # Training keeps every layer's outputs for backward -> scales x n_layers. + # The 16-byte/token/layer coefficient bundles fp32 activation + ~4x backward overhead. + # Inference only holds ~1-2 layers' outputs in flight at once. 
+ bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16 if is_training else seq_len * hidden * 8 if mixed_precision: bytes_per_sample //= 2 return bytes_per_sample / (1024**3) @@ -265,11 +269,11 @@ def _embedder_dim(meta: ModelMeta | None) -> int: if meta is None: return 768 params = meta.params_millions - if params >= 300: + if params >= _PARAMS_LARGE: return 1024 - if params >= 100: + if params >= _PARAMS_BASE: return 768 - if params >= 50: + if params >= _PARAMS_SMALL: return 512 return 384 @@ -313,7 +317,7 @@ def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int, data_bytes = 4.0 * stats.n_samples * n_features histograms_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS trees_bytes = iterations * (2**depth) * _CATBOOST_BYTES_PER_TREE_NODE - return (data_bytes + histograms_bytes + trees_bytes) / (1024**3) + return float((data_bytes + histograms_bytes + trees_bytes) / (1024**3)) def _time_for_catboost( @@ -360,7 +364,7 @@ def _classify_severity(estimate: float, budget: float) -> Severity: return Severity.AMPLE -def _resource_phase( # noqa: PLR0912 - kept linear for clarity +def _resource_phase( # noqa: PLR0912, C901, PLR0915 - kept linear for clarity config: dict[str, Any], stats: DatasetStats, hardware: HardwareProfile, @@ -394,7 +398,7 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity transformer_entries.append((node_idx, node_type, entry)) # Track the heaviest module per node so dump_modules accounting is bounded by - # "one selected variant per node × n_trials", not "sum of every candidate". + # "one selected variant per node x n_trials", not "sum of every candidate". 
node_max_weights: dict[int, float] = {} for node_idx, node_type, entry in transformer_entries: @@ -490,7 +494,9 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda" # CatBoost's MultiClass loss grows per-class trees only above binary; # binary uses Logloss with one tree per iteration. - cb_class_mult = max(1, stats.n_classes) if stats.n_classes > 2 or stats.multilabel else 1 + cb_class_mult = ( + max(1, stats.n_classes) if stats.n_classes > _MULTICLASS_THRESHOLD or stats.multilabel else 1 + ) ram_total = _ram_for_catboost( stats=stats, n_features=embedder_dim, @@ -569,7 +575,6 @@ def _resource_phase( # noqa: PLR0912 - kept linear for clarity msg += f" vs available {hardware.vram_gb:.1f} GB" report.add("resource", vram_sev, msg, metric="vram") - ram_sev = _classify_severity(effective_ram, hardware.ram_gb) report.add( "resource", @@ -635,13 +640,14 @@ def _data_phase( max_len = _max_int(max_len_value, 512) if p95 > max_len: severity = Severity.OVER if p95 > max_len * 1.5 else Severity.TIGHT + module_name = entry.get("module_name", "?") report.add( "data", severity, - f"Train tokens p95~{p95} exceeds {entry.get('module_name', '?')}.max_length={max_len}; expect silent truncation.", + f"Train tokens p95~{p95} exceeds {module_name}.max_length={max_len}; expect silent truncation.", ) - # rare class × linear-CV (LogisticRegressionCV cv=3 needs ≥3 samples/class; + # rare class x linear-CV (LogisticRegressionCV cv=3 needs >=3 samples/class; # multilabel path uses one-vs-rest without CV so the failure can't occur there) has_linear = any(e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or [])) if has_linear and stats.rare_classes and not stats.multilabel: @@ -651,7 +657,7 @@ def _data_phase( (f"LogisticRegressionCV (cv=3) will fail: classes {stats.rare_classes[:5]} have <3 samples."), ) - # partial descriptions × description scorer + # partial 
descriptions x description scorer description_modules = {"description_bi", "description_cross", "description_llm"} has_description = any( e.get("module_name") in description_modules for _, e in _walk_modules(config.get("search_space") or []) diff --git a/src/autointent/_advisor/_hardware.py b/src/autointent/_advisor/_hardware.py index 9c0cae049..e959b6ebf 100644 --- a/src/autointent/_advisor/_hardware.py +++ b/src/autointent/_advisor/_hardware.py @@ -12,6 +12,7 @@ import platform import shutil from dataclasses import dataclass, field +from pathlib import Path from typing import Literal import psutil @@ -24,6 +25,9 @@ # matches macOS PYTORCH_MPS_HIGH_WATERMARK_RATIO default MPS_DEFAULT_BUDGET_RATIO = 0.7 +_HIGH_GPU_VRAM_GB = 24 +_MID_GPU_VRAM_GB = 12 + @dataclass class HardwareProfile: @@ -41,20 +45,20 @@ def device_class(self) -> str: return "cpu" if self.accelerator == "mps": return "apple-silicon" - if self.vram_gb >= 24: + if self.vram_gb >= _HIGH_GPU_VRAM_GB: return "high-gpu" - if self.vram_gb >= 12: + if self.vram_gb >= _MID_GPU_VRAM_GB: return "mid-gpu" return "low-gpu" def _detect_ram_gb() -> float: - return psutil.virtual_memory().total / (1024**3) + return float(psutil.virtual_memory().total) / (1024**3) def _detect_free_disk_gb(path: str | None = None) -> float: - cache = path or os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface") - probe_path = cache if os.path.exists(cache) else os.path.expanduser("~") + cache = Path(path or os.environ.get("HF_HOME") or Path("~/.cache/huggingface").expanduser()) + probe_path = cache if cache.exists() else Path("~").expanduser() try: usage = shutil.disk_usage(probe_path) return usage.free / (1024**3) diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py index 1c559ee2f..9b351952a 100644 --- a/src/autointent/_advisor/_hub.py +++ b/src/autointent/_advisor/_hub.py @@ -8,10 +8,10 @@ from __future__ import annotations import logging -import os import re from dataclasses import 
dataclass from functools import lru_cache +from pathlib import Path from typing import Any from huggingface_hub import HfApi, scan_cache_dir, try_to_load_from_cache @@ -54,7 +54,7 @@ def weights_gb(self) -> float: @lru_cache(maxsize=1) -def hub_reachable(timeout_s: float = 2.0) -> bool: +def hub_reachable() -> bool: """Single up-front probe. Memoized per process.""" try: HfApi().list_models(limit=1) @@ -157,7 +157,7 @@ def resolve_model(model_name: str) -> ModelMeta: Always returns a value — never raises — so the advisor can keep going on offline machines or for unknown checkpoints. """ - if model_name.startswith("local:") or os.path.isabs(model_name): + if model_name.startswith("local:") or Path(model_name).is_absolute(): return ModelMeta( name=model_name, params_millions=_heuristic_params_millions(model_name), diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py index a3778d307..82771ef9f 100644 --- a/src/autointent/_advisor/_render.py +++ b/src/autointent/_advisor/_render.py @@ -8,7 +8,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from ._report import PreflightReport @@ -18,7 +18,7 @@ _PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"} -def _batch_hint(driver: dict) -> str: +def _batch_hint(driver: dict[str, Any]) -> str: """Per-driver batch annotation: '64 → 32', '64', '64 (no fit)', or ''.""" bs = driver.get("batch_size") if bs is None: @@ -37,12 +37,11 @@ def _batch_hint(driver: dict) -> str: _DRIVERS_HEADERS = ("Node", "Model", "Mode", "VRAM", "Time", "Batch", "Source") -def _render_drivers_table(drivers: list[dict]) -> list[str]: +def _render_drivers_table(drivers: list[dict[str, Any]]) -> list[str]: """Format the Drivers of cost section as an aligned table.""" visible = drivers[:_DRIVERS_LIMIT] - rows: list[tuple[str, ...]] = [] - for d in visible: - rows.append(( + rows: list[tuple[str, ...]] = [ + ( 
f"{d['node_type']}.{d['module']}", str(d["model"]), str(d["mode"]), @@ -50,7 +49,9 @@ def _render_drivers_table(drivers: list[dict]) -> list[str]: f"{d['time_hours']:.2f} h", _batch_hint(d), f"[{d['confidence']}]", - )) + ) + for d in visible + ] widths = [len(h) for h in _DRIVERS_HEADERS] for row in rows: @@ -113,8 +114,7 @@ def render_text(report: PreflightReport) -> str: if report.notes: lines.append("Notes:") - for note in report.notes: - lines.append(f" • {note}") + lines.extend(f" • {note}" for note in report.notes) lines.append("") summary = f"Verdict: {'feasible' if report.is_feasible else 'INFEASIBLE'} " diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py index 5ac66af2f..9b4881611 100644 --- a/tests/advisor/test_estimates_internals.py +++ b/tests/advisor/test_estimates_internals.py @@ -128,7 +128,7 @@ def test_inference_activations_are_smaller_than_training(self, meta: ModelMeta) inf_acts = inf_total - inf_weights assert inf_acts > 0 assert train_acts > inf_acts - # 12-layer model: training activations should be at least ~5× inference. + # 12-layer model: training activations should be at least ~5x inference. 
assert train_acts / inf_acts > 5 def test_amp_does_not_reduce_weight_side_vram(self, meta: ModelMeta) -> None: @@ -503,7 +503,8 @@ def test_max_batch_can_be_larger_than_current(self) -> None: _profile(vram_gb=64.0), ) d = next(d for d in report.resource.drivers if d["module"] == "bert") - assert d["max_batch_size"] is not None and d["max_batch_size"] > 32 + assert d["max_batch_size"] is not None + assert d["max_batch_size"] > 32 def test_multiple_drivers_carry_independent_max_batch(self) -> None: cfg = { diff --git a/tests/advisor/test_report.py b/tests/advisor/test_report.py index 28adbfc34..dbfc7adf6 100644 --- a/tests/advisor/test_report.py +++ b/tests/advisor/test_report.py @@ -2,6 +2,8 @@ from __future__ import annotations +import dataclasses + import pytest from autointent._advisor._report import ( @@ -81,5 +83,5 @@ def test_hardware_and_dataset_pass_through(self) -> None: def test_finding_is_frozen(self) -> None: f = Finding(phase="resource", severity=Severity.AMPLE, message="ok") - with pytest.raises(Exception): # noqa: PT011 - dataclass.FrozenInstanceError varies + with pytest.raises(dataclasses.FrozenInstanceError): f.message = "changed" # type: ignore[misc] From 334783c77f02b084a47d2e504782b55b296565ac Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 18:01:40 +0300 Subject: [PATCH 08/16] try to fix typing --- tests/advisor/test_estimates_and_cli.py | 6 +++--- tests/advisor/test_report.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py index 3092dce9e..2f16555ae 100644 --- a/tests/advisor/test_estimates_and_cli.py +++ b/tests/advisor/test_estimates_and_cli.py @@ -56,21 +56,21 @@ def test_every_preset_inspects_without_raising(preset: str) -> None: def test_heavy_preset_is_infeasible_on_2gb_budget() -> None: - cfg = load_preset("transformers-heavy") # type: ignore[arg-type] + cfg = 
load_preset("transformers-heavy") stats = DatasetStats.placeholder(n_samples=5000, n_classes=20, avg_tokens=40) report = run_preflight(cfg, stats, _profile(vram_gb=2.0), preset_name="transformers-heavy") assert not report.is_feasible, "deberta-v3-large should not fit in 2 GB" def test_light_preset_is_feasible_on_8gb_budget() -> None: - cfg = load_preset("transformers-light") # type: ignore[arg-type] + cfg = load_preset("transformers-light") stats = DatasetStats.placeholder(n_samples=1000, n_classes=10, avg_tokens=24) report = run_preflight(cfg, stats, _profile(vram_gb=8.0), preset_name="transformers-light") assert report.is_feasible def test_n_jobs_doubles_vram_findings() -> None: - cfg = load_preset("transformers-light") # type: ignore[arg-type] + cfg = load_preset("transformers-light") cfg = {**cfg, "hpo_config": {**(cfg.get("hpo_config") or {}), "n_jobs": 4}} stats = DatasetStats.placeholder() report = run_preflight(cfg, stats, _profile(vram_gb=4.0)) diff --git a/tests/advisor/test_report.py b/tests/advisor/test_report.py index dbfc7adf6..acb2b5bf8 100644 --- a/tests/advisor/test_report.py +++ b/tests/advisor/test_report.py @@ -25,7 +25,7 @@ def test_red_beats_yellow_beats_green(self) -> None: r.add("data", Severity.TIGHT, "warn") assert r.headroom == Severity.TIGHT r.add("config", Severity.OVER, "fail") - assert r.headroom == Severity.OVER + assert r.headroom == Severity.OVER # type: ignore[comparison-overlap] def test_is_feasible_flips_on_any_red(self) -> None: r = PreflightReport() From 4e4da91edc5712017ad134b22db8b8260a1b3ca9 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 18:10:21 +0300 Subject: [PATCH 09/16] roll back config changes --- src/autointent/_presets/transformers-heavy.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/autointent/_presets/transformers-heavy.yaml b/src/autointent/_presets/transformers-heavy.yaml index cd15d791e..2576fbc82 100644 --- 
a/src/autointent/_presets/transformers-heavy.yaml +++ b/src/autointent/_presets/transformers-heavy.yaml @@ -5,19 +5,12 @@ search_space: - module_name: bert classification_model_config: - model_name: microsoft/deberta-v3-large - - model_name: intfloat/multilingual-e5-large-instruct - - model_name: microsoft/harrier-oss-v1-27b num_train_epochs: [30] batch_size: [32, 64] learning_rate: low: 1.0e-5 high: 1.0e-4 log: True - - module_name: description_bi - embedder_config: - - model_name: microsoft/deberta-v3-large - - model_name: intfloat/multilingual-e5-large-instruct - - model_name: microsoft/harrier-oss-v1-27b - node_type: decision target_metric: decision_accuracy search_space: From 8bd0b018ede3c89175cca41e6d71f0676d3f58d9 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 18:26:19 +0300 Subject: [PATCH 10/16] move cli logic --- src/autointent/_advisor/__init__.py | 9 +- src/autointent/_advisor/_cli.py | 231 +++----------------------- src/autointent/_advisor/_report.py | 18 ++ src/autointent/_advisor/_workflows.py | 231 ++++++++++++++++++++++++++ src/autointent/custom_types/_types.py | 17 +- 5 files changed, 294 insertions(+), 212 deletions(-) create mode 100644 src/autointent/_advisor/_workflows.py diff --git a/src/autointent/_advisor/__init__.py b/src/autointent/_advisor/__init__.py index 3ff898816..28422c78d 100644 --- a/src/autointent/_advisor/__init__.py +++ b/src/autointent/_advisor/__init__.py @@ -9,15 +9,22 @@ from ._estimates import run_preflight from ._hardware import HardwareProfile, detect_hardware -from ._report import DatasetStats, Finding, PreflightReport, ResourceEstimate, Severity +from ._report import DatasetStats, Finding, PreflightReport, RecommendationResult, ResourceEstimate, Severity +from ._workflows import BUNDLED_PRESETS, inspect, load_config, recommend, stats_from_dataset __all__ = [ + "BUNDLED_PRESETS", "DatasetStats", "Finding", "HardwareProfile", "PreflightReport", + 
"RecommendationResult", "ResourceEstimate", "Severity", "detect_hardware", + "inspect", + "load_config", + "recommend", "run_preflight", + "stats_from_dataset", ] diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py index 8c8b7b9d2..315d5c443 100644 --- a/src/autointent/_advisor/_cli.py +++ b/src/autointent/_advisor/_cli.py @@ -9,190 +9,40 @@ csv/json/jsonl/parquet path loaded via ``datasets.load_dataset``) or ``--n-samples / --n-classes / --avg-tokens`` placeholders so the script is useful before the user has built a dataset. + +The CLI is a thin wrapper around :func:`autointent._advisor.inspect` and +:func:`autointent._advisor.recommend`; callers that don't need argparse can +import those helpers directly. """ from __future__ import annotations import argparse +import json import logging import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import yaml -from datasets import ClassLabel, Sequence, load_dataset - -from autointent.utils import load_preset -from ._estimates import run_preflight -from ._hardware import detect_hardware from ._render import render_json, render_recommendation, render_text -from ._report import DatasetStats, Severity +from ._report import DatasetStats +from ._workflows import BUNDLED_PRESETS, inspect, recommend, stats_from_dataset -if TYPE_CHECKING: - from ._report import PreflightReport +__all__ = ["BUNDLED_PRESETS", "build_parser", "cmd_inspect", "cmd_recommend", "main"] logger = logging.getLogger("autointent.advisor") -_SAMPLE_LIMIT = 1000 -_P95_PERCENTILE = 0.95 - -BUNDLED_PRESETS = [ - "transformers-heavy", - "transformers-light", - "transformers-no-hpo", - "nn-heavy", - "nn-medium", - "classic-heavy", - "classic-medium", - "classic-light", - "zero-shot-encoders", - "zero-shot-llm", -] - -# rough quality tiering used by `recommend` -_QUALITY_TIER = { - "transformers-heavy": 5, - "nn-heavy": 4, - "transformers-light": 4, - "nn-medium": 3, - "classic-heavy": 3, - "transformers-no-hpo": 
3, - "classic-medium": 2, - "classic-light": 1, - "zero-shot-encoders": 2, - "zero-shot-llm": 4, -} - - -def _load_config(target: str) -> tuple[dict[str, Any], str]: - """Return (config_dict, friendly_name) for either a preset or a path.""" - path = Path(target) - if path.is_file(): - with path.open(encoding="utf-8") as f: - return yaml.safe_load(f), path.stem - # treat as a bundled preset name - return load_preset(target), target # type: ignore[arg-type] - def _stats_from_args(args: argparse.Namespace) -> DatasetStats: + multilabel = args.task == "multilabel" if args.dataset: - return _stats_from_dataset(args.dataset, multilabel=args.task == "multilabel") + return stats_from_dataset(args.dataset, multilabel=multilabel) return DatasetStats.placeholder( n_samples=args.n_samples, n_classes=args.n_classes, avg_tokens=args.avg_tokens, - multilabel=args.task == "multilabel", - ) - - -_UTTERANCE_COLS = ("utterance", "text", "sentence", "query", "input") -_LABEL_COLS = ("label", "labels", "intent", "target") -# Map file extension → datasets builder name. Anything else is treated as a Hub -# repo id or a directory and passed to load_dataset directly. -_FILE_BUILDERS = {".csv": "csv", ".tsv": "csv", ".json": "json", ".jsonl": "json", ".parquet": "parquet"} - - -def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats: - """Best-effort: load via HF ``datasets.load_dataset``. - - Accepts a Hub repo id ('DeepPavlov/clinc150') or a local file path - (.csv / .json / .jsonl / .parquet) / dataset directory. Falls back to a - placeholder on any loader error so the advisor stays best-effort. 
- """ - builder = _FILE_BUILDERS.get(Path(path).suffix.lower()) - try: - ds = load_dataset(builder, data_files=path) if builder else load_dataset(path) - except (OSError, ValueError, FileNotFoundError) as e: - logger.warning("Failed to load dataset %s: %s", path, e) - return DatasetStats.placeholder(multilabel=multilabel) - - train = ds["train"] if "train" in ds else next(iter(ds.values()), None) - if train is None: - return DatasetStats.placeholder(multilabel=multilabel) - - cols = train.column_names - utt_col = next((c for c in _UTTERANCE_COLS if c in cols), cols[0] if cols else None) - label_col = next((c for c in _LABEL_COLS if c in cols), None) - - detected_multilabel, n_classes = _label_shape(train, label_col, fallback_multilabel=multilabel) - - sample = train[:_SAMPLE_LIMIT] if len(train) > _SAMPLE_LIMIT else train[:] - lengths = [len(str(s).split()) for s in (sample.get(utt_col, []) if utt_col else [])] - avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32 - if lengths: - sorted_lengths = sorted(lengths) - idx = max(0, min(len(sorted_lengths) - 1, round((len(sorted_lengths) - 1) * _P95_PERCENTILE))) - p95 = sorted_lengths[idx] - else: - p95 = avg_tokens * 2 - - return DatasetStats( - n_samples=len(train), - n_classes=n_classes, - avg_tokens=avg_tokens, - p95_tokens=p95, - multilabel=detected_multilabel, - has_descriptions=None, - rare_classes=_rare_classes(train, label_col, detected_multilabel, n_classes) if label_col else [], - source=f"dataset:{path}", + multilabel=multilabel, ) -def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]: # noqa: ANN401 - """Derive (multilabel, n_classes) from the HF feature schema, with a value-based fallback.""" - if label_col is None: - return fallback_multilabel, 0 - feature = train.features.get(label_col) - if isinstance(feature, Sequence): - inner = feature.feature - if isinstance(inner, ClassLabel): - return True, inner.num_classes - # Sequence of 
plain ints — n_classes = max label index + 1. - max_idx = max((max(row) for row in train[label_col] if row), default=-1) - return True, max_idx + 1 - if isinstance(feature, ClassLabel): - return False, feature.num_classes - # Plain int/string column. Detect multilabel from the first non-empty row, then count uniques. - is_multi = len(train) > 0 and isinstance(train[0][label_col], (list, tuple)) - if is_multi: - max_idx = max((max(row) for row in train[label_col] if row), default=-1) - return True, max_idx + 1 - return False, len({label for label in train[label_col] if label is not None}) - - -def _rare_classes( - train: Any, # noqa: ANN401 - label_col: str, - multilabel: bool, - n_classes: int, - min_count: int = 3, -) -> list[str]: - """Return labels with fewer than ``min_count`` samples in the train split. - - Used to surface the LogisticRegressionCV(cv=3) failure case before fit. - Returns an empty list on any error so the advisor stays best-effort. - """ - try: - labels = train[label_col] - except (KeyError, AttributeError, TypeError): - return [] - counts: dict[str, int] = {} - if multilabel: - for row in labels: - if not row: - continue - for i, v in enumerate(row): - if v: - counts[str(i)] = counts.get(str(i), 0) + 1 - for i in range(n_classes): - counts.setdefault(str(i), 0) - else: - for label in labels: - counts[str(label)] = counts.get(str(label), 0) + 1 - return sorted(name for name, c in counts.items() if c < min_count) - - def _add_common_dataset_args(p: argparse.ArgumentParser) -> None: p.add_argument("--dataset", help="Path or hub id of a dataset; overrides placeholders.") p.add_argument("--n-samples", type=int, default=1_000, help="Placeholder training set size.") @@ -207,63 +57,36 @@ def _add_common_dataset_args(p: argparse.ArgumentParser) -> None: def cmd_inspect(args: argparse.Namespace) -> int: - config, name = _load_config(args.target) - hardware = detect_hardware( - vram_budget_gb=args.budget_vram_gb, + report = inspect( + args.target, + 
stats=_stats_from_args(args), + budget_vram_gb=args.budget_vram_gb, ) - stats = _stats_from_args(args) - report = run_preflight(config, stats, hardware, preset_name=name) if args.json: sys.stdout.write(render_json(report)) - sys.stdout.write("\n") else: sys.stdout.write(render_text(report)) - sys.stdout.write("\n") + sys.stdout.write("\n") return 0 if report.is_feasible else 1 def cmd_recommend(args: argparse.Namespace) -> int: - hardware = detect_hardware(vram_budget_gb=args.budget_vram_gb) - stats = _stats_from_args(args) - - results: list[tuple[str, PreflightReport]] = [] - - for preset in BUNDLED_PRESETS: - try: - cfg = load_preset(preset) # type: ignore[arg-type] - except (OSError, ValueError, KeyError) as e: - logger.debug("Skipping preset %s: %s", preset, e) - continue - report = run_preflight(cfg, stats, hardware, preset_name=preset) - if args.budget_time_h is not None and report.resource.time_hours > args.budget_time_h: - report.add( - "resource", - Severity.OVER, - f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {args.budget_time_h} h.", - ) - results.append((preset, report)) - - feasible = [(name, r) for name, r in results if r.is_feasible] - feasible.sort(key=lambda pair: (-_QUALITY_TIER.get(pair[0], 0), pair[1].resource.time_hours, pair[0])) - chosen = feasible[0][0] if feasible else None - + result = recommend( + stats=_stats_from_args(args), + budget_vram_gb=args.budget_vram_gb, + budget_time_h=args.budget_time_h, + ) if args.json: - import json - - out = { - "chosen": chosen, - "results": [{"preset": name, "report": r.to_dict()} for name, r in results], - } - sys.stdout.write(json.dumps(out, indent=2, default=str)) + sys.stdout.write(json.dumps(result.to_dict(), indent=2, default=str)) sys.stdout.write("\n") else: - sys.stdout.write(render_recommendation(results, chosen)) + sys.stdout.write(render_recommendation(result.results, result.chosen)) sys.stdout.write("\n") - if chosen: + if result.chosen: sys.stdout.write("\n") - 
sys.stdout.write(render_text(dict(results)[chosen])) + sys.stdout.write(render_text(dict(result.results)[result.chosen])) sys.stdout.write("\n") - return 0 if chosen else 1 + return 0 if result.chosen else 1 def build_parser() -> argparse.ArgumentParser: @@ -309,4 +132,4 @@ def main(argv: list[str] | None = None) -> int: if __name__ == "__main__": - raise SystemExit(main()) + main() diff --git a/src/autointent/_advisor/_report.py b/src/autointent/_advisor/_report.py index 9b4a319c8..c9fd920f4 100644 --- a/src/autointent/_advisor/_report.py +++ b/src/autointent/_advisor/_report.py @@ -110,3 +110,21 @@ def to_dict(self) -> dict[str, Any]: d["headroom"] = self.headroom.value d["is_feasible"] = self.is_feasible return d + + +@dataclass +class RecommendationResult: + """Output of the recommend workflow: ranked per-preset reports plus the pick. + + ``chosen`` is the best feasible preset name, or ``None`` if none fit. + ``results`` is the full per-preset report list in evaluation order. + """ + + chosen: str | None + results: list[tuple[str, PreflightReport]] + + def to_dict(self) -> dict[str, Any]: + return { + "chosen": self.chosen, + "results": [{"preset": name, "report": r.to_dict()} for name, r in self.results], + } diff --git a/src/autointent/_advisor/_workflows.py b/src/autointent/_advisor/_workflows.py new file mode 100644 index 000000000..0bd7ee5bf --- /dev/null +++ b/src/autointent/_advisor/_workflows.py @@ -0,0 +1,231 @@ +"""High-level advisor workflows: ``inspect`` and ``recommend``. + +Each workflow orchestrates the lower-level pieces (``load_config``, +``detect_hardware``, ``stats_from_dataset``, ``run_preflight``) into a single +typed call. They expose the same logic the CLI uses but accept Python +arguments instead of an ``argparse.Namespace`` — useful from notebooks, +integration tests, or any caller that wants a ``PreflightReport`` / +``RecommendationResult`` directly. 
+""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import TYPE_CHECKING, Any, get_args + +import yaml +from datasets import ClassLabel, Sequence, load_dataset + +from autointent.custom_types import SearchSpacePreset +from autointent.utils import load_preset + +from ._estimates import run_preflight +from ._hardware import detect_hardware +from ._report import DatasetStats, RecommendationResult, Severity + +if TYPE_CHECKING: + from collections.abc import Iterable + + from ._report import PreflightReport + + +logger = logging.getLogger("autointent.advisor") + +_SAMPLE_LIMIT = 1000 +_P95_PERCENTILE = 0.95 +BUNDLED_PRESETS: tuple[str, ...] = get_args(SearchSpacePreset) + + +def load_config(target: str) -> tuple[dict[str, Any], str]: + """Return ``(config_dict, friendly_name)`` for either a preset name or a YAML path.""" + path = Path(target) + if path.is_file(): + with path.open(encoding="utf-8") as f: + return yaml.safe_load(f), path.stem + return load_preset(target), target # type: ignore[arg-type] + + +def stats_from_dataset(path: str, *, multilabel: bool = False) -> DatasetStats: + """Best-effort: load a dataset via HF ``datasets.load_dataset`` and derive advisor stats. + + Accepts a Hub repo id (``DeepPavlov/clinc150``) or a local file path + (``.csv`` / ``.json`` / ``.jsonl`` / ``.parquet``) / dataset directory. Falls + back to a placeholder on any loader error so callers stay best-effort. + """ + # Anything not in this map (no suffix, unknown suffix) is treated as a Hub + # repo id or a dataset directory and passed to load_dataset directly. 
+ file_builders = {".csv": "csv", ".tsv": "csv", ".json": "json", ".jsonl": "json", ".parquet": "parquet"} + builder = file_builders.get(Path(path).suffix.lower()) + try: + ds = load_dataset(builder, data_files=path) if builder else load_dataset(path) + except (OSError, ValueError, FileNotFoundError) as e: + logger.warning("Failed to load dataset %s: %s", path, e) + return DatasetStats.placeholder(multilabel=multilabel) + + train = ds["train"] if "train" in ds else next(iter(ds.values()), None) + if train is None: + return DatasetStats.placeholder(multilabel=multilabel) + + cols = train.column_names + utt_col = next( + (c for c in ("utterance", "text", "sentence", "query", "input") if c in cols), cols[0] if cols else None + ) + label_col = next((c for c in ("label", "labels", "intent", "target") if c in cols), None) + + detected_multilabel, n_classes = _label_shape(train, label_col, fallback_multilabel=multilabel) + + sample = train[:_SAMPLE_LIMIT] if len(train) > _SAMPLE_LIMIT else train[:] + lengths = [len(str(s).split()) for s in (sample.get(utt_col, []) if utt_col else [])] + avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32 + if lengths: + sorted_lengths = sorted(lengths) + idx = max(0, min(len(sorted_lengths) - 1, round((len(sorted_lengths) - 1) * _P95_PERCENTILE))) + p95 = sorted_lengths[idx] + else: + p95 = avg_tokens * 2 + + return DatasetStats( + n_samples=len(train), + n_classes=n_classes, + avg_tokens=avg_tokens, + p95_tokens=p95, + multilabel=detected_multilabel, + has_descriptions=None, + rare_classes=_rare_classes(train, label_col, detected_multilabel, n_classes) if label_col else [], + source=f"dataset:{path}", + ) + + +def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]: # noqa: ANN401 + """Derive ``(multilabel, n_classes)`` from the HF feature schema with a value-based fallback.""" + if label_col is None: + return fallback_multilabel, 0 + feature = 
train.features.get(label_col) + if isinstance(feature, Sequence): + inner = feature.feature + if isinstance(inner, ClassLabel): + return True, inner.num_classes + # Sequence of plain ints — n_classes = max label index + 1. + max_idx = max((max(row) for row in train[label_col] if row), default=-1) + return True, max_idx + 1 + if isinstance(feature, ClassLabel): + return False, feature.num_classes + # Plain int/string column. Detect multilabel from the first row, then count uniques. + is_multi = len(train) > 0 and isinstance(train[0][label_col], (list, tuple)) + if is_multi: + max_idx = max((max(row) for row in train[label_col] if row), default=-1) + return True, max_idx + 1 + return False, len({label for label in train[label_col] if label is not None}) + + +def _rare_classes( + train: Any, # noqa: ANN401 + label_col: str, + multilabel: bool, + n_classes: int, + min_count: int = 3, +) -> list[str]: + """Return labels with fewer than ``min_count`` samples in the train split. + + Used to surface the LogisticRegressionCV(cv=3) failure case before fit. + Returns an empty list if the label column cannot be read, keeping the advisor best-effort. + """ + try: + labels = train[label_col] + except (KeyError, AttributeError, TypeError): + return [] + counts: dict[str, int] = {} + if multilabel: + for row in labels: + if not row: + continue + for i, v in enumerate(row): + if v: + counts[str(i)] = counts.get(str(i), 0) + 1 + for i in range(n_classes): + counts.setdefault(str(i), 0) + else: + for label in labels: + counts[str(label)] = counts.get(str(label), 0) + 1 + return sorted(name for name, c in counts.items() if c < min_count) + + +def inspect( + target: str, + *, + stats: DatasetStats | None = None, + budget_vram_gb: float | None = None, +) -> PreflightReport: + """Inspect a preset (or YAML config path) against the local hardware. + + Args: + target: Bundled preset name (e.g. ``'transformers-light'``) or a YAML + config path.
The friendly name surfaced in the report is the file + stem for paths and the preset name otherwise. + stats: Dataset stats to score against. Defaults to a placeholder if + ``None``. + budget_vram_gb: Optional VRAM-budget override for the hardware probe. + + Returns: + ``PreflightReport`` covering resource / data / config phases. + """ + config, name = load_config(target) + hardware = detect_hardware(vram_budget_gb=budget_vram_gb) + return run_preflight(config, stats or DatasetStats.placeholder(), hardware, preset_name=name) + + +def recommend( + *, + stats: DatasetStats | None = None, + presets: Iterable[str] | None = None, + budget_vram_gb: float | None = None, + budget_time_h: float | None = None, +) -> RecommendationResult: + """Walk bundled presets and return the best feasible fit plus all per-preset reports. + + Args: + stats: Dataset stats to score against. Defaults to a placeholder if ``None``. + presets: Override of the preset list (defaults to ``BUNDLED_PRESETS``). + budget_vram_gb: Optional VRAM-budget override for the hardware probe. + budget_time_h: Optional wall-time ceiling in hours; presets exceeding it + get an extra ``Severity.OVER`` finding so they drop out of the + feasible ranking. + + Returns: + ``RecommendationResult`` with the chosen preset name and full results list. + + Note: + Among feasible presets we pick the one listed earliest in + ``BUNDLED_PRESETS``, which is ordered by descending quality, so the pick + is the highest-quality preset that still fits the hardware. Presets not + in ``BUNDLED_PRESETS`` rank last, with ties broken alphabetically.
+ """ + hardware = detect_hardware(vram_budget_gb=budget_vram_gb) + stats = stats or DatasetStats.placeholder() + preset_iter = list(presets) if presets is not None else BUNDLED_PRESETS + + results: list[tuple[str, PreflightReport]] = [] + for preset in preset_iter: + try: + cfg = load_preset(preset) # type: ignore[arg-type] + except (OSError, ValueError, KeyError) as e: + logger.debug("Skipping preset %s: %s", preset, e) + continue + report = run_preflight(cfg, stats, hardware, preset_name=preset) + if budget_time_h is not None and report.resource.time_hours > budget_time_h: + report.add( + "resource", + Severity.OVER, + f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {budget_time_h} h.", + ) + results.append((preset, report)) + + # Rank by Literal position (lower index = higher quality); presets the user + # passed via the ``presets`` override but not in BUNDLED_PRESETS sort last. + quality_rank = {name: i for i, name in enumerate(BUNDLED_PRESETS)} + feasible = [(name, r) for name, r in results if r.is_feasible] + feasible.sort(key=lambda pair: (quality_rank.get(pair[0], len(BUNDLED_PRESETS)), pair[0])) + chosen = feasible[0][0] if feasible else None + + return RecommendationResult(chosen=chosen, results=results) diff --git a/src/autointent/custom_types/_types.py b/src/autointent/custom_types/_types.py index cbfa82576..a54da368d 100644 --- a/src/autointent/custom_types/_types.py +++ b/src/autointent/custom_types/_types.py @@ -117,18 +117,21 @@ class Split: """ SearchSpacePreset = Literal[ - "classic-heavy", - "classic-light", - "classic-medium", - "nn-heavy", - "nn-medium", "transformers-heavy", "transformers-light", - "transformers-no-hpo", + "nn-heavy", "zero-shot-llm", + "nn-medium", + "classic-heavy", + "transformers-no-hpo", + "classic-medium", "zero-shot-encoders", + "classic-light", ] -"""Some presets that our library supports.""" +"""Bundled search-space presets, listed in descending quality order. 
+ +The order is consumed by ``autointent._advisor.recommend`` to pick the +highest-quality feasible preset (lower index = higher quality).""" class Document(BaseModel): From b77d57586028af16a01b210161af702dcf34e789 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 18:57:07 +0300 Subject: [PATCH 11/16] simplify logic --- src/autointent/_advisor/_estimates.py | 477 ++++++++++++---------- src/autointent/_advisor/_hub.py | 47 ++- tests/advisor/test_estimates_internals.py | 10 +- 3 files changed, 320 insertions(+), 214 deletions(-) diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py index 93dfcedaf..274407c26 100644 --- a/src/autointent/_advisor/_estimates.py +++ b/src/autointent/_advisor/_estimates.py @@ -9,6 +9,7 @@ from __future__ import annotations import logging +from dataclasses import dataclass from typing import TYPE_CHECKING, Any from pydantic import BaseModel, ConfigDict, Field, ValidationError @@ -26,9 +27,11 @@ from ._report import DatasetStats _MULTICLASS_THRESHOLD = 2 -_PARAMS_LARGE = 300 -_PARAMS_BASE = 100 -_PARAMS_SMALL = 50 + +# Fallback architecture shape (BERT-base) used only when the model's actual +# config.json couldn't be fetched from HF Hub — see _hub._shape_from_config. +_DEFAULT_HIDDEN = 768 +_DEFAULT_LAYERS = 12 logger = logging.getLogger(__name__) @@ -77,12 +80,14 @@ def _validated_config(config: dict[str, Any]) -> _AdvisorConfig: } # Maps each fine-tunable transformer module to its training-mode label. -# Modules not listed are treated as inference-only. +# Modules not listed (or listed as "inference") run the encoder forward-only. +# Note: dnnc keeps the cross-encoder frozen and trains an sklearn LogisticRegressionCV +# head on top of its features (see autointent._wrappers.ranker.Ranker._fit), so the +# encoder's VRAM profile matches inference rather than fine-tuning. 
_TRANSFORMER_TRAINING_MODE = { "bert": "full-finetune", "ptuning": "lora", "lora": "lora", - "dnnc": "reranker", } # Fallback max_length when the search-space entry doesn't pin it. Used both as @@ -148,17 +153,16 @@ def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dic def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float: """Weight-side VRAM in GB — weights + grads + Adam optimizer state. Excludes activations. - Full fine-tune fp32: W + W + 2W (Adam m, v) = 4W. - Full fine-tune AMP: 0.5W (fp16 weights) + 0.5W (fp16 grads) + W (fp32 master) + 2W (fp32 Adam) = 4W. - AMP's savings live in activations, not the optimizer — the weight side is identical. + Modes: + * ``inference``: forward only — weights + ~30% intermediate-tensor overhead. + * ``lora``: frozen base + small trainable adapters + their grads/optimizer (~0.5 GB). + * ``full-finetune`` (default): weights + grads + Adam (m, v) = 4x weights. """ weights_gb = meta.weights_gb if mode == "inference": return weights_gb * 1.3 if mode == "lora": return weights_gb * 1.3 + 0.5 - if mode == "reranker": - return weights_gb * 1.5 return weights_gb * 4.0 @@ -200,20 +204,10 @@ def _floor_to_power_of_two(n: int) -> int: def _n_layers(meta: ModelMeta | None) -> int: - """Coarse layer-count guess from parameter count. - - MiniLM (33M) ~6, BERT-base (110M) ~12, BERT-large (350M) ~24. - """ - if meta is None: - return 12 - params = meta.params_millions - if params >= _PARAMS_LARGE: - return 24 - if params >= _PARAMS_BASE: - return 12 - if params >= _PARAMS_SMALL: - return 8 - return 6 + """Layer count from the model's ``config.json``; falls back to BERT-base when absent.""" + if meta is not None and meta.n_layers is not None: + return meta.n_layers + return _DEFAULT_LAYERS def _activations_gb_per_sample( @@ -262,20 +256,10 @@ def _max_fitting_batch_size( def _embedder_dim(meta: ModelMeta | None) -> int: - """Coarse hidden-size guess from parameter count. 
- - Concrete points: MiniLM (33M) ~384, BERT-base (110M) ~768, BERT-large (350M) ~1024. - """ - if meta is None: - return 768 - params = meta.params_millions - if params >= _PARAMS_LARGE: - return 1024 - if params >= _PARAMS_BASE: - return 768 - if params >= _PARAMS_SMALL: - return 512 - return 384 + """Hidden size from the model's ``config.json``; falls back to BERT-base when absent.""" + if meta is not None and meta.hidden_size is not None: + return meta.hidden_size + return _DEFAULT_HIDDEN def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None: @@ -364,146 +348,139 @@ def _classify_severity(estimate: float, budget: float) -> Severity: return Severity.AMPLE -def _resource_phase( # noqa: PLR0912, C901, PLR0915 - kept linear for clarity - config: dict[str, Any], - stats: DatasetStats, - hardware: HardwareProfile, - report: PreflightReport, -) -> None: - cfg = _validated_config(config) - n_trials = max(1, cfg.hpo_config.n_trials) - n_jobs = max(1, cfg.hpo_config.n_jobs) - refit_after = cfg.refit_after - dump_modules = cfg.dump_modules +@dataclass +class _ModuleEstimate: + """Per-module cost contribution + the dict that gets rendered in the report.""" - if not hub_reachable(): - report.low_confidence = True - report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.") + driver: dict[str, Any] + vram_gb: float + ram_gb: float + time_hours: float + model_weights_gb: float = 0.0 - seen_models: dict[str, ModelMeta] = {} - estimate = ResourceEstimate(parallel_factor=n_jobs) - global_embedder = (cfg.embedder_config or {}).get("model_name") - if global_embedder: - seen_models[global_embedder] = resolve_model(global_embedder) +def _refit_factor(*, refit_after: bool, n_trials: int) -> float: + """Wall-time multiplier for ``refit_after=True`` (amortized 1/n_trials extra).""" + return 1 + 1.0 / max(1, n_trials) if refit_after else 1.0 - # First pass: walk transformer-bearing modules (collects seen_models for embedder_dim 
lookup). - transformer_entries: list[tuple[int, str, dict[str, Any]]] = [] - classic_entries: list[tuple[int, str, dict[str, Any]]] = [] - for node_idx, node_type, entry in _walk_modules_indexed(cfg.search_space): - module = entry.get("module_name", "?") - if module in {"linear", "catboost"}: - classic_entries.append((node_idx, node_type, entry)) - else: - transformer_entries.append((node_idx, node_type, entry)) - # Track the heaviest module per node so dump_modules accounting is bounded by - # "one selected variant per node x n_trials", not "sum of every candidate". - node_max_weights: dict[int, float] = {} +def _split_entries( + search_space: list[dict[str, Any]], +) -> tuple[list[tuple[int, str, dict[str, Any]]], list[tuple[int, str, dict[str, Any]]]]: + """Partition search-space entries into (transformer-bearing, classic).""" + transformer, classic = [], [] + for node_idx, node_type, entry in _walk_modules_indexed(search_space): + bucket = classic if entry.get("module_name") in {"linear", "catboost"} else transformer + bucket.append((node_idx, node_type, entry)) + return transformer, classic - for node_idx, node_type, entry in transformer_entries: - module = entry.get("module_name", "?") - model_names = _extract_model_names(entry) - if not model_names and global_embedder and module in {"knn", "mlknn"}: - model_names = [global_embedder] - for name in model_names: - meta = seen_models.setdefault(name, resolve_model(name)) +def _estimate_transformer_model( + *, + meta: ModelMeta, + entry: dict[str, Any], + node_type: str, + module: str, + name: str, + stats: DatasetStats, + hardware: HardwareProfile, + n_trials: int, + refit_after: bool, +) -> _ModuleEstimate: + """One row of cost for a transformer module + a specific model checkpoint.""" + mixed_precision = entry.get("dtype") in {"fp16", "bf16"} + mode = _TRANSFORMER_TRAINING_MODE.get(module, "inference") + batch_size = _max_int(entry.get("batch_size"), 32) + epochs = _max_int(entry.get("num_train_epochs"), 1 if 
mode == "inference" else 10) + seq_len = _max_int(entry.get("max_length"), _DEFAULT_SEQ_LEN) + + vram = _vram_for_transformer(meta, mode, mixed_precision, batch_size=batch_size, seq_len=seq_len) + ram = _ram_for_module(meta, stats) + + driver_max_batch: int | None = None + if hardware.vram_gb > 0: + driver_max_batch = _max_fitting_batch_size( + weight_vram_gb=_weights_vram_for_transformer(meta, mode), + vram_budget_gb=hardware.vram_gb, + per_sample_gb=_activations_gb_per_sample( + meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference" + ), + ) - mixed_precision = entry.get("dtype") in {"fp16", "bf16"} - mode = _TRANSFORMER_TRAINING_MODE.get(module, "inference") - - batch_size = _max_int(entry.get("batch_size"), 32) - epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10) - seq_len = _max_int(entry.get("max_length"), _DEFAULT_SEQ_LEN) - - vram = _vram_for_transformer(meta, mode, mixed_precision, batch_size=batch_size, seq_len=seq_len) - ram = _ram_for_module(meta, stats) - - driver_max_batch: int | None = None - if hardware.vram_gb > 0: - weights_vram = _weights_vram_for_transformer(meta, mode) - per_sample_gb = _activations_gb_per_sample( - meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference" - ) - driver_max_batch = _max_fitting_batch_size( - weight_vram_gb=weights_vram, - vram_budget_gb=hardware.vram_gb, - per_sample_gb=per_sample_gb, - ) - - time_h = _time_for_transformer( - meta=meta, - n_trials=n_trials, - epochs=epochs, - batch_size=batch_size, - n_samples=stats.n_samples, - device_class=hardware.device_class, - ) - if refit_after and mode != "inference": - time_h *= 1 + 1.0 / max(1, n_trials) - - estimate.vram_gb = max(estimate.vram_gb, vram) - estimate.ram_gb = max(estimate.ram_gb, ram) - estimate.time_hours += time_h - node_max_weights[node_idx] = max(node_max_weights.get(node_idx, 0.0), meta.weights_gb) - estimate.drivers.append( - { - "node_type": node_type, - "module": 
module, - "model": name, - "mode": mode, - "vram_gb": round(vram, 2), - "ram_gb": round(ram, 2), - "time_hours": round(time_h, 2), - "batch_size": batch_size, - "max_batch_size": driver_max_batch, - "confidence": meta.confidence, - } - ) + time_h = _time_for_transformer( + meta=meta, + n_trials=n_trials, + epochs=epochs, + batch_size=batch_size, + n_samples=stats.n_samples, + device_class=hardware.device_class, + ) + if mode != "inference": + time_h *= _refit_factor(refit_after=refit_after, n_trials=n_trials) + + return _ModuleEstimate( + driver={ + "node_type": node_type, + "module": module, + "model": name, + "mode": mode, + "vram_gb": round(vram, 2), + "ram_gb": round(ram, 2), + "time_hours": round(time_h, 2), + "batch_size": batch_size, + "max_batch_size": driver_max_batch, + "confidence": meta.confidence, + }, + vram_gb=vram, + ram_gb=ram, + time_hours=time_h, + model_weights_gb=meta.weights_gb, + ) - # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint. - embedder_meta = _largest_embedder(seen_models) - embedder_dim = _embedder_dim(embedder_meta) - # Both multinomial (multiclass) and one-vs-rest (multilabel) LR scale linearly in n_classes; - # the multiclass path additionally pays the LogisticRegressionCV inner-fit multiplier. 
- class_multiplier_classic = max(1, stats.n_classes) - confidence = embedder_meta.confidence if embedder_meta else "heuristic" - embedder_label = embedder_meta.name if embedder_meta else "(no embedder)" - for _node_idx, node_type, entry in classic_entries: - module = entry.get("module_name", "?") - if module == "linear": - max_iter = _max_int(entry.get("max_iter"), 100) - cv_multiplier = 1 if stats.multilabel else _LOGREG_CV_MULTIPLIER - ram = _ram_for_linear(stats=stats, embedder_dim=embedder_dim) - time_h = _time_for_linear( + +def _estimate_classic_entry( + *, + entry: dict[str, Any], + node_type: str, + embedder_meta: ModelMeta | None, + embedder_dim: int, + stats: DatasetStats, + hardware: HardwareProfile, + n_trials: int, + refit_after: bool, +) -> _ModuleEstimate | None: + """Cost row for a linear or catboost scorer (returns ``None`` for any other module).""" + module = entry.get("module_name", "?") + refit = _refit_factor(refit_after=refit_after, n_trials=n_trials) + # Both multinomial (multiclass) and one-vs-rest (multilabel) LR scale linearly in n_classes. + class_multiplier = max(1, stats.n_classes) + + if module == "linear": + cv_multiplier = 1 if stats.multilabel else _LOGREG_CV_MULTIPLIER + ram = _ram_for_linear(stats=stats, embedder_dim=embedder_dim) + time_h = ( + _time_for_linear( n_trials=n_trials, n_samples=stats.n_samples, embedder_dim=embedder_dim, - max_iter=max_iter, + max_iter=_max_int(entry.get("max_iter"), 100), cv_multiplier=cv_multiplier, - class_multiplier=class_multiplier_classic, - ) - if refit_after: - time_h *= 1 + 1.0 / max(1, n_trials) - vram = 0.0 - mode = "linear-cv" if cv_multiplier > 1 else "linear" - elif module == "catboost": - iterations = _max_int(entry.get("iterations"), 1000) - depth = _max_int(entry.get("depth"), 6) - on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda" - # CatBoost's MultiClass loss grows per-class trees only above binary; - # binary uses Logloss with one tree per iteration. 
- cb_class_mult = ( - max(1, stats.n_classes) if stats.n_classes > _MULTICLASS_THRESHOLD or stats.multilabel else 1 - ) - ram_total = _ram_for_catboost( - stats=stats, - n_features=embedder_dim, - iterations=iterations, - depth=depth, + class_multiplier=class_multiplier, ) - time_h = _time_for_catboost( + * refit + ) + vram = 0.0 + mode = "linear-cv" if cv_multiplier > 1 else "linear" + elif module == "catboost": + on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda" + # CatBoost MultiClass loss grows per-class trees only above binary; binary uses + # Logloss with one tree per iteration. + cb_class_mult = class_multiplier if stats.n_classes > _MULTICLASS_THRESHOLD or stats.multilabel else 1 + iterations = _max_int(entry.get("iterations"), 1000) + depth = _max_int(entry.get("depth"), 6) + ram_total = _ram_for_catboost(stats=stats, n_features=embedder_dim, iterations=iterations, depth=depth) + time_h = ( + _time_for_catboost( n_trials=n_trials, n_samples=stats.n_samples, n_features=embedder_dim, @@ -512,55 +489,66 @@ def _resource_phase( # noqa: PLR0912, C901, PLR0915 - kept linear for clarity class_multiplier=cb_class_mult, on_gpu=on_gpu, ) - if refit_after: - time_h *= 1 + 1.0 / max(1, n_trials) - vram, ram = (ram_total, 0.0) if on_gpu else (0.0, ram_total) - mode = "catboost-gpu" if on_gpu else "catboost" - else: - continue - - estimate.vram_gb = max(estimate.vram_gb, vram) - estimate.ram_gb = max(estimate.ram_gb, ram) - estimate.time_hours += time_h - estimate.drivers.append( - { - "node_type": node_type, - "module": module, - "model": embedder_label, - "mode": mode, - "vram_gb": round(vram, 2), - "ram_gb": round(ram, 2), - "time_hours": round(time_h, 2), - "batch_size": None, - "max_batch_size": None, - "confidence": confidence, - } + * refit ) + vram, ram = (ram_total, 0.0) if on_gpu else (0.0, ram_total) + mode = "catboost-gpu" if on_gpu else "catboost" + else: + return None + + return _ModuleEstimate( + driver={ + "node_type": 
node_type, + "module": module, + "model": embedder_meta.name if embedder_meta else "(no embedder)", + "mode": mode, + "vram_gb": round(vram, 2), + "ram_gb": round(ram, 2), + "time_hours": round(time_h, 2), + "batch_size": None, + "max_batch_size": None, + "confidence": embedder_meta.confidence if embedder_meta else "heuristic", + }, + vram_gb=vram, + ram_gb=ram, + time_hours=time_h, + ) + +def _aggregate_disk( + estimate: ResourceEstimate, + seen_models: dict[str, ModelMeta], + node_max_weights: dict[int, float], + *, + dump_modules: bool, + n_trials: int, +) -> None: + """Fold per-model download/cached sizes into ``estimate`` and apply dump-modules accounting.""" for meta in seen_models.values(): if meta.cached_locally: estimate.disk_cached_gb += meta.disk_gb else: estimate.disk_download_gb += meta.disk_gb - if dump_modules: # Each trial selects one variant per node, so per-trial dumped weights # are bounded by the heaviest module in each node, summed across nodes. - per_trial_dump_gb = sum(node_max_weights.values()) - estimate.disk_dump_gb = per_trial_dump_gb * n_trials + estimate.disk_dump_gb = sum(node_max_weights.values()) * n_trials - if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}: - effective_vram = estimate.vram_gb * n_jobs - else: - effective_vram = estimate.vram_gb + +def _emit_resource_findings( + report: PreflightReport, + estimate: ResourceEstimate, + hardware: HardwareProfile, + *, + n_jobs: int, +) -> None: + """Translate aggregated estimates into VRAM/RAM/disk/time findings on the report.""" + parallel_gpu = n_jobs > 1 and hardware.accelerator in {"cuda", "mps"} + effective_vram = estimate.vram_gb * n_jobs if parallel_gpu else estimate.vram_gb # MPS shares one unified pool: parallel workers each allocate weights+activations # in RAM, so peak RAM also scales with n_jobs on Apple Silicon. 
effective_ram = estimate.ram_gb * n_jobs if n_jobs > 1 and hardware.accelerator == "mps" else estimate.ram_gb - report.resource = estimate - - # render findings - vram_sev = _classify_severity(effective_vram, hardware.vram_gb) if hardware.accelerator == "cpu" and effective_vram > 0: report.add( "resource", @@ -573,29 +561,108 @@ def _resource_phase( # noqa: PLR0912, C901, PLR0915 - kept linear for clarity if n_jobs > 1: msg += f" (= per-trial {estimate.vram_gb:.1f} GB × {n_jobs} parallel trials)" msg += f" vs available {hardware.vram_gb:.1f} GB" - report.add("resource", vram_sev, msg, metric="vram") + report.add("resource", _classify_severity(effective_vram, hardware.vram_gb), msg, metric="vram") - ram_sev = _classify_severity(effective_ram, hardware.ram_gb) report.add( "resource", - ram_sev, + _classify_severity(effective_ram, hardware.ram_gb), f"RAM ~{effective_ram:.1f} GB vs available {hardware.ram_gb:.1f} GB", metric="ram", ) disk_total = estimate.disk_download_gb + estimate.disk_dump_gb - disk_sev = _classify_severity(disk_total, hardware.free_disk_gb) disk_msg = f"Disk ~{estimate.disk_download_gb:.1f} GB to download" if estimate.disk_cached_gb > 0: disk_msg += f", {estimate.disk_cached_gb:.1f} GB already cached" if estimate.disk_dump_gb > 0: disk_msg += f", +{estimate.disk_dump_gb:.1f} GB during training (dump_modules=True)" disk_msg += f" vs {hardware.free_disk_gb:.0f} GB free" - report.add("resource", disk_sev, disk_msg, metric="disk") + report.add("resource", _classify_severity(disk_total, hardware.free_disk_gb), disk_msg, metric="disk") if estimate.time_hours > 0: - time_msg = f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)" - report.add("resource", Severity.AMPLE, time_msg, metric="time") + report.add( + "resource", + Severity.AMPLE, + f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)", + metric="time", + ) + + +def _resource_phase( + config: dict[str, Any], + stats: DatasetStats, + hardware: HardwareProfile, + report: 
PreflightReport, +) -> None: + cfg = _validated_config(config) + n_trials = max(1, cfg.hpo_config.n_trials) + n_jobs = max(1, cfg.hpo_config.n_jobs) + + if not hub_reachable(): + report.low_confidence = True + report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.") + + seen_models: dict[str, ModelMeta] = {} + global_embedder = (cfg.embedder_config or {}).get("model_name") + if global_embedder: + seen_models[global_embedder] = resolve_model(global_embedder) + + transformer_entries, classic_entries = _split_entries(cfg.search_space) + + # First pass: transformer modules (also populates seen_models for the classic pass). + module_estimates: list[_ModuleEstimate] = [] + node_max_weights: dict[int, float] = {} + for node_idx, node_type, entry in transformer_entries: + module = entry.get("module_name", "?") + model_names = _extract_model_names(entry) + if not model_names and global_embedder and module in {"knn", "mlknn"}: + model_names = [global_embedder] + for name in model_names: + meta = seen_models.setdefault(name, resolve_model(name)) + me = _estimate_transformer_model( + meta=meta, + entry=entry, + node_type=node_type, + module=module, + name=name, + stats=stats, + hardware=hardware, + n_trials=n_trials, + refit_after=cfg.refit_after, + ) + module_estimates.append(me) + # Track heaviest weight per node so dump_modules is bounded by one + # selected variant per node x n_trials, not the sum of all candidates. + node_max_weights[node_idx] = max(node_max_weights.get(node_idx, 0.0), me.model_weights_gb) + + # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint. 
+ embedder_meta = _largest_embedder(seen_models) + embedder_dim = _embedder_dim(embedder_meta) + for _, node_type, entry in classic_entries: + me = _estimate_classic_entry( + entry=entry, + node_type=node_type, + embedder_meta=embedder_meta, + embedder_dim=embedder_dim, + stats=stats, + hardware=hardware, + n_trials=n_trials, + refit_after=cfg.refit_after, + ) + if me is not None: + module_estimates.append(me) + + estimate = ResourceEstimate(parallel_factor=n_jobs) + for me in module_estimates: + estimate.vram_gb = max(estimate.vram_gb, me.vram_gb) + estimate.ram_gb = max(estimate.ram_gb, me.ram_gb) + estimate.time_hours += me.time_hours + estimate.drivers.append(me.driver) + + _aggregate_disk(estimate, seen_models, node_max_weights, dump_modules=cfg.dump_modules, n_trials=n_trials) + + report.resource = estimate + _emit_resource_findings(report, estimate, hardware, n_jobs=n_jobs) def _config_phase( diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py index 9b351952a..b459adf9c 100644 --- a/src/autointent/_advisor/_hub.py +++ b/src/autointent/_advisor/_hub.py @@ -7,6 +7,7 @@ from __future__ import annotations +import json import logging import re from dataclasses import dataclass @@ -14,7 +15,7 @@ from pathlib import Path from typing import Any -from huggingface_hub import HfApi, scan_cache_dir, try_to_load_from_cache +from huggingface_hub import HfApi, hf_hub_download, scan_cache_dir, try_to_load_from_cache logger = logging.getLogger(__name__) @@ -43,6 +44,11 @@ class ModelMeta: total_file_bytes: int cached_locally: bool confidence: str # "hub" | "heuristic" + # Architecture shape read straight from the model's config.json when reachable; + # None when the file couldn't be fetched/parsed. Estimates fall back to a + # BERT-base default in that case. 
+ hidden_size: int | None = None + n_layers: int | None = None @property def disk_gb(self) -> float: @@ -71,6 +77,30 @@ def _heuristic_params_millions(model_name: str) -> float: return 110.0 # generic BERT-base default +def _shape_from_config(model_name: str) -> tuple[int | None, int | None]: + """Return ``(hidden_size, num_hidden_layers)`` straight from the model's config.json. + + ``hf_hub_download`` caches the file after the first call, so repeated lookups + in the same process (or across CLI invocations) hit local disk. Returns + ``(None, None)`` on any failure — the advisor stays best-effort. + """ + try: + path = hf_hub_download(model_name, "config.json") + except Exception as e: # noqa: BLE001 + logger.debug("config.json download(%s) failed: %s", model_name, e) + return None, None + try: + cfg = json.loads(Path(path).read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError) as e: + logger.debug("config.json parse(%s) failed: %s", model_name, e) + return None, None + # Cover the common HF naming variants: BERT/Llama/Gemma use hidden_size + + # num_hidden_layers; T5/MT5 use d_model + num_layers; GPT-2/Neo use n_embd + n_layer. 
+ hidden = cfg.get("hidden_size") or cfg.get("d_model") or cfg.get("n_embd") + layers = cfg.get("num_hidden_layers") or cfg.get("num_layers") or cfg.get("n_layer") + return (int(hidden) if hidden else None, int(layers) if layers else None) + + def _is_warm_cached(model_name: str) -> bool: """True when the weight shard is present in the local HF cache.""" weight_files = ["model.safetensors", "pytorch_model.bin", "model.safetensors.index.json"] @@ -126,6 +156,14 @@ def _hub_metadata(model_name: str) -> ModelMeta | None: total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param) confidence = "heuristic" + hidden_size, n_layers = _shape_from_config(model_name) + if hidden_size is None or n_layers is None: + logger.warning( + "Could not read hidden_size / num_hidden_layers from config.json for %s; " + "activation-memory estimates will fall back to BERT-base defaults (768 / 12).", + model_name, + ) + return ModelMeta( name=model_name, params_millions=params_millions, @@ -133,10 +171,17 @@ def _hub_metadata(model_name: str) -> ModelMeta | None: total_file_bytes=total_file_bytes, cached_locally=_is_warm_cached(model_name), confidence=confidence, + hidden_size=hidden_size, + n_layers=n_layers, ) def _heuristic_metadata(model_name: str) -> ModelMeta: + logger.warning( + "Falling back to name-pattern heuristic for %s; " + "activation-memory estimates will use BERT-base defaults (hidden=768, layers=12).", + model_name, + ) params_millions = _heuristic_params_millions(model_name) weight_bytes_per_param = 4 total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param) diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py index 9b4881611..c63acde9d 100644 --- a/tests/advisor/test_estimates_internals.py +++ b/tests/advisor/test_estimates_internals.py @@ -146,12 +146,6 @@ def test_amp_does_reduce_activation_side_vram(self, meta: ModelMeta) -> None: amp = _vram_for_transformer(meta, "full-finetune", 
mixed_precision=True, batch_size=64, seq_len=128) assert amp < fp32 - def test_reranker_uses_inference_class(self, meta: ModelMeta) -> None: - inference = _vram_for_transformer(meta, "inference", mixed_precision=False) - reranker = _vram_for_transformer(meta, "reranker", mixed_precision=False) - assert reranker > inference - - def test_ram_scales_with_dataset_size() -> None: meta = ModelMeta( name="x", @@ -477,13 +471,13 @@ def test_driver_records_current_and_max_batch(self) -> None: report = run_preflight( self._bert_cfg("microsoft/deberta-v3-large", batch_size=64), DatasetStats.placeholder(), - _profile(vram_gb=10.0), + _profile(vram_gb=8.0), ) drivers = [d for d in report.resource.drivers if d["module"] == "bert"] assert drivers d = drivers[0] assert d["batch_size"] == 64 - # vram_gb=10 + 5 GB weights → some room for activations, max < 64. + # vram_gb=8 with ~5 GB weights leaves little room for activations → max < 64. assert d["max_batch_size"] is not None assert 0 < d["max_batch_size"] < 64 From 1f1778a509f0f8c4fff8277d43e307ffccbbed41 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:24:25 +0300 Subject: [PATCH 12/16] simplify logic --- src/autointent/_advisor/_estimates.py | 144 +++++++++++--------- src/autointent/_advisor/_hardware.py | 9 +- src/autointent/_advisor/_hub.py | 153 +++++++++++----------- src/autointent/_advisor/_render.py | 10 +- tests/advisor/test_estimates_and_cli.py | 10 +- tests/advisor/test_estimates_internals.py | 58 +++++--- tests/advisor/test_hub_heuristics.py | 51 +++----- tests/advisor/test_render.py | 8 +- 8 files changed, 230 insertions(+), 213 deletions(-) diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py index 274407c26..faeb4b1ee 100644 --- a/src/autointent/_advisor/_estimates.py +++ b/src/autointent/_advisor/_estimates.py @@ -12,11 +12,17 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any -from 
pydantic import BaseModel, ConfigDict, Field, ValidationError +from pydantic import ValidationError -from autointent.configs._optimization import HPOConfig +from autointent._optimization_config import OptimizationConfig +from autointent.configs._embedder import ( + EmbedderConfig, + OpenaiEmbeddingConfig, + SentenceTransformerEmbeddingConfig, + VllmEmbeddingConfig, +) -from ._hub import hub_reachable, resolve_model +from ._hub import resolve_model from ._report import PreflightReport, ResourceEstimate, Severity if TYPE_CHECKING: @@ -27,6 +33,7 @@ from ._report import DatasetStats _MULTICLASS_THRESHOLD = 2 +_BYTES_PER_GB = 1024**3 # binary GiB convention; matches all advisor byte->GB conversions # Fallback architecture shape (BERT-base) used only when the model's actual # config.json couldn't be fetched from HF Hub — see _hub._shape_from_config. @@ -36,48 +43,39 @@ logger = logging.getLogger(__name__) -class _AdvisorConfig(BaseModel): - """Validated view of the advisor's input config. - - Wraps the four top-level keys the phase helpers read. Unknown top-level - keys are ignored (preset YAMLs carry extra metadata the advisor doesn't model). - """ - - model_config = ConfigDict(extra="ignore") - - hpo_config: HPOConfig = Field(default_factory=HPOConfig) - search_space: list[dict[str, Any]] = Field(default_factory=list) - refit_after: bool = False - dump_modules: bool = False - embedder_config: dict[str, Any] | None = None - - -def _validated_config(config: dict[str, Any]) -> _AdvisorConfig: - """Validate ``config`` against ``_AdvisorConfig``; fall back to defaults on any error. +def _validated_config(config: dict[str, Any]) -> OptimizationConfig: + """Validate ``config`` against the project's canonical ``OptimizationConfig``. The advisor is best-effort: a malformed user config should still produce a - report (with placeholder costs) rather than crashing. + report (with placeholder costs) rather than crashing, so any validation + error falls back to the model defaults. 
""" try: - return _AdvisorConfig.model_validate(config) + return OptimizationConfig.model_validate(config) except ValidationError as e: logger.warning("Advisor config failed validation; falling back to defaults: %s", e) - return _AdvisorConfig() + # OptimizationConfig requires `search_space`; build a minimal valid default. + return OptimizationConfig.model_validate({"search_space": []}) -# Severity thresholds as a fraction of available budget: at or above _TIGHT -# downgrades to Severity.TIGHT; at or above _OVER downgrades to Severity.OVER. -_TIGHT_RATIO = 0.7 -_OVER_RATIO = 1.0 +_TIGHT_RATIO = 0.9 + +# Union variants of EmbedderConfig that carry a model_name attribute. +# HashingVectorizerEmbeddingConfig and the bare BaseEmbedderConfig don't have +# one (sklearn vectorizer / abstract base), so we filter them out below. +_MODEL_BACKED_EMBEDDERS = ( + SentenceTransformerEmbeddingConfig, + OpenaiEmbeddingConfig, + VllmEmbeddingConfig, +) + + +def _embedder_model_name(embedder: EmbedderConfig) -> str | None: + """Return the embedder's model_name when the config variant carries one.""" + if isinstance(embedder, _MODEL_BACKED_EMBEDDERS): + return embedder.model_name + return None -# rough per-step seconds, keyed on device class. Scaled by params_millions / 100. -_PER_STEP_BASELINE_S = { - "cpu": 0.5, - "low-gpu": 0.04, - "mid-gpu": 0.02, - "high-gpu": 0.01, - "apple-silicon": 0.08, -} # Maps each fine-tunable transformer module to its training-mode label. # Modules not listed (or listed as "inference") run the encoder forward-only. @@ -98,7 +96,7 @@ def _validated_config(config: dict[str, Any]) -> _AdvisorConfig: _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8 _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9 _CATBOOST_GPU_SPEEDUP = 10.0 -# LogisticRegressionCV defaults: Cs=10, cv=3 → 31 inner fits + 1 final refit. +# LogisticRegressionCV defaults: Cs=10, cv=3 -> 31 inner fits + 1 final refit. 
_LOGREG_CV_MULTIPLIER = 31 _CATBOOST_DEFAULT_BINS = 254 # Bytes per histogram bucket / tree node — order-of-magnitude constants. @@ -190,7 +188,7 @@ def _vram_for_transformer( def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float: """RAM in GB. Loose upper bound.""" - return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / (1024**3) + return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / _BYTES_PER_GB def _floor_to_power_of_two(n: int) -> int: @@ -232,7 +230,7 @@ def _activations_gb_per_sample( bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16 if is_training else seq_len * hidden * 8 if mixed_precision: bytes_per_sample //= 2 - return bytes_per_sample / (1024**3) + return bytes_per_sample / _BYTES_PER_GB def _max_fitting_batch_size( @@ -265,7 +263,7 @@ def _embedder_dim(meta: ModelMeta | None) -> int: def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None: if not seen_models: return None - return max(seen_models.values(), key=lambda m: m.params_millions) + return max(seen_models.values(), key=lambda m: m.total_params) def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float: @@ -273,7 +271,7 @@ def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float: data_bytes = 8.0 * stats.n_samples * embedder_dim coef_bytes = 8.0 * max(1, stats.n_classes) * embedder_dim lbfgs_bytes = 10.0 * 8.0 * embedder_dim - return (data_bytes + coef_bytes + lbfgs_bytes) / (1024**3) + return (data_bytes + coef_bytes + lbfgs_bytes) / _BYTES_PER_GB def _time_for_linear( @@ -301,7 +299,7 @@ def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int, data_bytes = 4.0 * stats.n_samples * n_features histograms_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS trees_bytes = iterations * (2**depth) * _CATBOOST_BYTES_PER_TREE_NODE - return float((data_bytes + histograms_bytes + trees_bytes) / (1024**3)) + return float((data_bytes + histograms_bytes + trees_bytes) / _BYTES_PER_GB) 
def _time_for_catboost( @@ -323,16 +321,20 @@ def _time_for_catboost( def _time_for_transformer( *, - meta: ModelMeta, n_trials: int, epochs: int, batch_size: int, n_samples: int, - device_class: str, ) -> float: - per_step = _PER_STEP_BASELINE_S[device_class] * (meta.params_millions / 100.0) + """Transformer training time in hours, assuming a flat 1 second per step. + + The advisor has no real wall-time calibration across hardware tiers / model + sizes, so the report uses ``time_hours`` as a step-count proxy rather than + pretending to estimate seconds. Users should treat the number as ordering / + ballpark information, not a budget. + """ steps = max(1, (n_samples // max(1, batch_size))) * epochs - return (n_trials * steps * per_step) / 3600.0 + return (n_trials * steps) / 3600.0 def _classify_severity(estimate: float, budget: float) -> Severity: @@ -341,7 +343,7 @@ def _classify_severity(estimate: float, budget: float) -> Severity: if budget <= 0: return Severity.TIGHT ratio = estimate / budget - if ratio >= _OVER_RATIO: + if ratio >= 1: return Severity.OVER if ratio >= _TIGHT_RATIO: return Severity.TIGHT @@ -368,7 +370,8 @@ def _split_entries( search_space: list[dict[str, Any]], ) -> tuple[list[tuple[int, str, dict[str, Any]]], list[tuple[int, str, dict[str, Any]]]]: """Partition search-space entries into (transformer-bearing, classic).""" - transformer, classic = [], [] + transformer: list[tuple[int, str, dict[str, Any]]] = [] + classic: list[tuple[int, str, dict[str, Any]]] = [] for node_idx, node_type, entry in _walk_modules_indexed(search_space): bucket = classic if entry.get("module_name") in {"linear", "catboost"} else transformer bucket.append((node_idx, node_type, entry)) @@ -408,12 +411,10 @@ def _estimate_transformer_model( ) time_h = _time_for_transformer( - meta=meta, n_trials=n_trials, epochs=epochs, batch_size=batch_size, n_samples=stats.n_samples, - device_class=hardware.device_class, ) if mode != "inference": time_h *= 
_refit_factor(refit_after=refit_after, n_trials=n_trials) @@ -593,17 +594,16 @@ def _resource_phase( stats: DatasetStats, hardware: HardwareProfile, report: PreflightReport, + *, + refit_after: bool = False, ) -> None: cfg = _validated_config(config) - n_trials = max(1, cfg.hpo_config.n_trials) - n_jobs = max(1, cfg.hpo_config.n_jobs) - - if not hub_reachable(): - report.low_confidence = True - report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.") + n_trials = cfg.hpo_config.n_trials + n_jobs = cfg.hpo_config.n_jobs + dump_modules = cfg.logging_config.dump_modules seen_models: dict[str, ModelMeta] = {} - global_embedder = (cfg.embedder_config or {}).get("model_name") + global_embedder = _embedder_model_name(cfg.embedder_config) if global_embedder: seen_models[global_embedder] = resolve_model(global_embedder) @@ -628,7 +628,7 @@ def _resource_phase( stats=stats, hardware=hardware, n_trials=n_trials, - refit_after=cfg.refit_after, + refit_after=refit_after, ) module_estimates.append(me) # Track heaviest weight per node so dump_modules is bounded by one @@ -639,7 +639,7 @@ def _resource_phase( embedder_meta = _largest_embedder(seen_models) embedder_dim = _embedder_dim(embedder_meta) for _, node_type, entry in classic_entries: - me = _estimate_classic_entry( + classic_estimate = _estimate_classic_entry( entry=entry, node_type=node_type, embedder_meta=embedder_meta, @@ -647,10 +647,10 @@ def _resource_phase( stats=stats, hardware=hardware, n_trials=n_trials, - refit_after=cfg.refit_after, + refit_after=refit_after, ) - if me is not None: - module_estimates.append(me) + if classic_estimate is not None: + module_estimates.append(classic_estimate) estimate = ResourceEstimate(parallel_factor=n_jobs) for me in module_estimates: @@ -659,7 +659,17 @@ def _resource_phase( estimate.time_hours += me.time_hours estimate.drivers.append(me.driver) - _aggregate_disk(estimate, seen_models, node_max_weights, dump_modules=cfg.dump_modules, 
n_trials=n_trials) + _aggregate_disk(estimate, seen_models, node_max_weights, dump_modules=dump_modules, n_trials=n_trials) + + # Flip low_confidence if any model fell back to the heuristic path (Hub + # unreachable, repo missing safetensors metadata, local-path checkpoint). + heuristic_models = [m.name for m in seen_models.values() if m.confidence == "heuristic"] + if heuristic_models: + report.low_confidence = True + report.notes.append( + f"Heuristic fallback used for {len(heuristic_models)} model(s) — sizes are BERT-base " + f"defaults: {', '.join(heuristic_models[:3])}{'...' if len(heuristic_models) > 3 else ''}", # noqa: PLR2004 + ) report.resource = estimate _emit_resource_findings(report, estimate, hardware, n_jobs=n_jobs) @@ -743,15 +753,19 @@ def run_preflight( hardware: HardwareProfile, *, preset_name: str | None = None, + refit_after: bool = False, ) -> PreflightReport: """Run all three phases and return one report. Args: config: parsed preset / OptimizationConfig dict (top-level keys: - ``search_space``, ``hpo_config``, optional ``embedder_config``). + ``search_space``, ``hpo_config``, optional ``embedder_config``, + optional ``logging_config.dump_modules``). stats: dataset statistics (real or placeholder). hardware: detected hardware profile. preset_name: optional friendly name for the report header. + refit_after: matches the ``Pipeline.fit(refit_after=...)`` argument. + When True, time estimates include the extra refit-on-full-data pass. Returns: PreflightReport with findings across resource/data/config phases. 
@@ -777,7 +791,7 @@ def run_preflight( ) report.notes.extend(hardware.notes) - _resource_phase(config, stats, hardware, report) + _resource_phase(config, stats, hardware, report, refit_after=refit_after) _data_phase(config, stats, report) _config_phase(config, hardware, report) diff --git a/src/autointent/_advisor/_hardware.py b/src/autointent/_advisor/_hardware.py index e959b6ebf..6aa9741ee 100644 --- a/src/autointent/_advisor/_hardware.py +++ b/src/autointent/_advisor/_hardware.py @@ -1,7 +1,7 @@ """Local hardware detection. Probes CPU / RAM / disk and the highest-priority accelerator available -(CUDA → MPS → CPU). All probes are wrapped to fall back safely on a +(CUDA -> MPS -> CPU). All probes are wrapped to fall back safely on a broken install (e.g. CUDA driver mismatch) rather than crash the advisor. """ @@ -27,6 +27,7 @@ _HIGH_GPU_VRAM_GB = 24 _MID_GPU_VRAM_GB = 12 +_BYTES_PER_GB = 1024**3 # binary GiB convention; matches all advisor byte->GB conversions @dataclass @@ -53,7 +54,7 @@ def device_class(self) -> str: def _detect_ram_gb() -> float: - return float(psutil.virtual_memory().total) / (1024**3) + return float(psutil.virtual_memory().total) / _BYTES_PER_GB def _detect_free_disk_gb(path: str | None = None) -> float: @@ -61,7 +62,7 @@ def _detect_free_disk_gb(path: str | None = None) -> float: probe_path = cache if cache.exists() else Path("~").expanduser() try: usage = shutil.disk_usage(probe_path) - return usage.free / (1024**3) + return usage.free / _BYTES_PER_GB except OSError as e: logger.debug("disk usage probe failed at %s: %s", probe_path, e) return 0.0 @@ -73,7 +74,7 @@ def _detect_cuda() -> tuple[float, str] | None: idx = 0 try: _free, total = torch.cuda.mem_get_info(idx) - vram_gb = total / (1024**3) + vram_gb = total / _BYTES_PER_GB except (RuntimeError, AttributeError) as e: logger.debug("torch.cuda.mem_get_info failed: %s", e) return None diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py index 
b459adf9c..a5b6126a5 100644 --- a/src/autointent/_advisor/_hub.py +++ b/src/autointent/_advisor/_hub.py @@ -9,72 +9,40 @@ import json import logging -import re from dataclasses import dataclass from functools import lru_cache from pathlib import Path -from typing import Any +from typing import Literal from huggingface_hub import HfApi, hf_hub_download, scan_cache_dir, try_to_load_from_cache +Confidence = Literal["hub", "heuristic"] + logger = logging.getLogger(__name__) -# Coarse heuristic estimates keyed on name fragments. Used only when HF Hub -# is unreachable and we can't get safetensors metadata. Values in millions. -_NAME_HEURISTICS = [ - (re.compile(r"(?i)(deberta|roberta|bert).*(xxlarge|huge)"), 1_500), - (re.compile(r"(?i)(deberta|roberta|bert).*xlarge"), 750), - (re.compile(r"(?i)(deberta|roberta|bert).*large"), 350), - (re.compile(r"(?i)e5.*large"), 560), - (re.compile(r"(?i)e5.*small"), 33), - (re.compile(r"(?i)mpnet"), 110), - (re.compile(r"(?i)minilm"), 33), - (re.compile(r"(?i)distil"), 66), - (re.compile(r"(?i)small"), 60), - (re.compile(r"(?i)base"), 110), - (re.compile(r"(?i)large"), 350), -] +_DEFAULT_HEURISTIC_PARAMS = 110_000_000 +_DEFAULT_BYTES_PER_PARAM = 4 +_BYTES_PER_GB = 1024**3 # using the binary GiB convention everywhere in the advisor @dataclass class ModelMeta: name: str - params_millions: float - weight_bytes_per_param: int + total_params: int + weight_bytes_per_param: float total_file_bytes: int cached_locally: bool - confidence: str # "hub" | "heuristic" - # Architecture shape read straight from the model's config.json when reachable; - # None when the file couldn't be fetched/parsed. Estimates fall back to a - # BERT-base default in that case. 
+ confidence: Confidence hidden_size: int | None = None n_layers: int | None = None @property def disk_gb(self) -> float: - return self.total_file_bytes / (1024**3) + return self.total_file_bytes / _BYTES_PER_GB @property def weights_gb(self) -> float: - return (self.params_millions * 1_000_000 * self.weight_bytes_per_param) / (1024**3) - - -@lru_cache(maxsize=1) -def hub_reachable() -> bool: - """Single up-front probe. Memoized per process.""" - try: - HfApi().list_models(limit=1) - except Exception as e: # noqa: BLE001 - logger.debug("HF Hub probe failed: %s", e) - return False - return True - - -def _heuristic_params_millions(model_name: str) -> float: - for pattern, m in _NAME_HEURISTICS: - if pattern.search(model_name): - return float(m) - return 110.0 # generic BERT-base default + return (self.total_params * self.weight_bytes_per_param) / _BYTES_PER_GB def _shape_from_config(model_name: str) -> tuple[int | None, int | None]: @@ -98,7 +66,7 @@ def _shape_from_config(model_name: str) -> tuple[int | None, int | None]: # num_hidden_layers; T5/MT5 use d_model + num_layers; GPT-2/Neo use n_embd + n_layer. 
hidden = cfg.get("hidden_size") or cfg.get("d_model") or cfg.get("n_embd") layers = cfg.get("num_hidden_layers") or cfg.get("num_layers") or cfg.get("n_layer") - return (int(hidden) if hidden else None, int(layers) if layers else None) + return int(hidden) if hidden else None, int(layers) if layers else None def _is_warm_cached(model_name: str) -> bool: @@ -124,36 +92,46 @@ def _hub_metadata(model_name: str) -> ModelMeta | None: except Exception as e: # noqa: BLE001 logger.debug("model_info(%s) failed: %s", model_name, e) return None - - params_millions = 0.0 - weight_bytes_per_param = 4 - safetensors = getattr(info, "safetensors", None) - if safetensors is not None: - params_total = getattr(safetensors, "total", None) or sum( - getattr(safetensors, "parameters", {}).values() or [0] - ) - if params_total: - params_millions = params_total / 1_000_000 - params_map: dict[str, Any] = getattr(safetensors, "parameters", {}) or {} - if any("F16" in k or "BF16" in k for k in params_map): - weight_bytes_per_param = 2 - - total_file_bytes = 0 - for sibling in getattr(info, "siblings", []) or []: - size = getattr(sibling, "size", None) - if size: - total_file_bytes += int(size) + # Bytes-per-element for safetensors dtype strings. Used to convert the per-dtype + # parameter counts (info.safetensors.parameters) into a weighted average + # bytes-per-param for mixed-precision repos. 
+ _dtype_bytes: dict[str, int] = { + "F64": 8, + "F32": 4, + "F16": 2, + "BF16": 2, + "I64": 8, + "I32": 4, + "I16": 2, + "I8": 1, + "U8": 1, + "BOOL": 1, + } + + total_params = 0 + weight_bytes_per_param: float = _DEFAULT_BYTES_PER_PARAM + if info.safetensors is not None: + params_by_dtype = info.safetensors.parameters or {} + total_params = info.safetensors.total or sum(params_by_dtype.values()) + if total_params: + total_weight_bytes = sum( + _dtype_bytes.get(dtype, _DEFAULT_BYTES_PER_PARAM) * count for dtype, count in params_by_dtype.items() + ) + if total_weight_bytes: + weight_bytes_per_param = total_weight_bytes / total_params + + total_file_bytes = sum(s.size for s in (info.siblings or []) if s.size) # Track whether either size came from the Hub or from the name-pattern fallback; # if any field was filled by heuristic, downgrade confidence so the report flips # low_confidence rather than misreporting hub-grade accuracy. - confidence = "hub" - if params_millions == 0: - params_millions = _heuristic_params_millions(model_name) + confidence: Confidence = "hub" + if total_params == 0: + total_params = _DEFAULT_HEURISTIC_PARAMS confidence = "heuristic" if total_file_bytes == 0: - total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param) + total_file_bytes = int(total_params * weight_bytes_per_param) confidence = "heuristic" hidden_size, n_layers = _shape_from_config(model_name) @@ -166,7 +144,7 @@ def _hub_metadata(model_name: str) -> ModelMeta | None: return ModelMeta( name=model_name, - params_millions=params_millions, + total_params=total_params, weight_bytes_per_param=weight_bytes_per_param, total_file_bytes=total_file_bytes, cached_locally=_is_warm_cached(model_name), @@ -182,19 +160,33 @@ def _heuristic_metadata(model_name: str) -> ModelMeta: "activation-memory estimates will use BERT-base defaults (hidden=768, layers=12).", model_name, ) - params_millions = _heuristic_params_millions(model_name) - weight_bytes_per_param = 4 - 
total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param) + total_file_bytes = _DEFAULT_HEURISTIC_PARAMS * _DEFAULT_BYTES_PER_PARAM return ModelMeta( name=model_name, - params_millions=params_millions, - weight_bytes_per_param=weight_bytes_per_param, + total_params=_DEFAULT_HEURISTIC_PARAMS, + weight_bytes_per_param=_DEFAULT_BYTES_PER_PARAM, total_file_bytes=total_file_bytes, cached_locally=_is_warm_cached(model_name), confidence="heuristic", ) +def _looks_like_local_path(model_name: str) -> bool: + """True when ``model_name`` is a filesystem path rather than an HF Hub repo id. + + Hub repo ids match ``org/repo``; anything that starts with a path separator, + ``~``, a relative-path prefix, or a Windows drive letter, or contains a + backslash, is treated as a local path. We can't rely on ``Path.is_absolute()`` + alone because POSIX-style absolute paths (``/tmp/...``) are *not* absolute + on Windows. + """ + if model_name.startswith(("local:", "/", "~", "./", "../", "\\\\")): + return True + if "\\" in model_name: + return True + return len(model_name) >= 2 and model_name[1] == ":" and model_name[0].isalpha() # noqa: PLR2004 + + @lru_cache(maxsize=64) def resolve_model(model_name: str) -> ModelMeta: """Resolve metadata for a single model name. Memoized per process. @@ -202,19 +194,20 @@ def resolve_model(model_name: str) -> ModelMeta: Always returns a value — never raises — so the advisor can keep going on offline machines or for unknown checkpoints. 
""" - if model_name.startswith("local:") or Path(model_name).is_absolute(): + if _looks_like_local_path(model_name): return ModelMeta( name=model_name, - params_millions=_heuristic_params_millions(model_name), - weight_bytes_per_param=4, + total_params=_DEFAULT_HEURISTIC_PARAMS, + weight_bytes_per_param=_DEFAULT_BYTES_PER_PARAM, total_file_bytes=0, cached_locally=True, confidence="heuristic", ) - if hub_reachable(): - meta = _hub_metadata(model_name) - if meta is not None: - return meta + # _hub_metadata returns None on any failure (network outage, missing repo, + # SDK exception) so we don't need a separate up-front probe. + meta = _hub_metadata(model_name) + if meta is not None: + return meta return _heuristic_metadata(model_name) diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py index 82771ef9f..afd541e2b 100644 --- a/src/autointent/_advisor/_render.py +++ b/src/autointent/_advisor/_render.py @@ -13,13 +13,13 @@ if TYPE_CHECKING: from ._report import PreflightReport -_SEVERITY_TAG = {"ample": "✓", "tight": "⚠", "over": "✗"} +_SEVERITY_TAG = {"ample": "✓", "tight": "⚠", "over": "x"} _PHASE_ORDER = ("resource", "data", "config") _PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"} def _batch_hint(driver: dict[str, Any]) -> str: - """Per-driver batch annotation: '64 → 32', '64', '64 (no fit)', or ''.""" + """Per-driver batch annotation: '64 -> 32', '64', '64 (no fit)', or ''.""" bs = driver.get("batch_size") if bs is None: return "" @@ -30,7 +30,7 @@ def _batch_hint(driver: dict[str, Any]) -> str: return f"{bs} (no fit)" if mx == bs: return str(bs) - return f"{bs} → {mx}" + return f"{bs} -> {mx}" _DRIVERS_LIMIT = 8 @@ -137,9 +137,9 @@ def render_recommendation( """Compact table for the ``recommend`` subcommand.""" lines = ["", "Recommendation:"] if chosen: - lines.append(f" → {chosen}") + lines.append(f" -> {chosen}") else: - lines.append(" → none of the bundled presets fit your hardware as-is.") + 
lines.append(" -> none of the bundled presets fit your hardware as-is.") lines.append("") lines.append(f"{'Preset':<24} {'Status':<14} {'VRAM':<10} {'Time':<10} {'Headroom':<10}") lines.append("-" * 68) diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py index 2f16555ae..d87f90740 100644 --- a/tests/advisor/test_estimates_and_cli.py +++ b/tests/advisor/test_estimates_and_cli.py @@ -23,14 +23,11 @@ @pytest.fixture(autouse=True) def _force_offline(monkeypatch: pytest.MonkeyPatch) -> None: - """Pin the HF Hub probe to "offline" so tests don't hit the network.""" - from autointent._advisor import _estimates, _hub + """Force HF Hub lookups to fail so tests don't hit the network.""" + from autointent._advisor import _hub - _hub.hub_reachable.cache_clear() _hub.resolve_model.cache_clear() - offline = lambda *_a, **_kw: False # noqa: E731 - monkeypatch.setattr(_hub, "hub_reachable", offline) - monkeypatch.setattr(_estimates, "hub_reachable", offline) + monkeypatch.setattr(_hub, "_hub_metadata", lambda _name: None) def _profile(vram_gb: float = 16.0) -> HardwareProfile: @@ -50,7 +47,6 @@ def test_every_preset_inspects_without_raising(preset: str) -> None: stats = DatasetStats.placeholder(n_samples=500, n_classes=10, avg_tokens=24) report = run_preflight(cfg, stats, _profile(vram_gb=16.0), preset_name=preset) assert report.preset_name == preset - assert report.low_confidence is True # we forced offline # always at least one resource-phase finding assert any(f.phase == "resource" for f in report.findings) diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py index c63acde9d..3fa293b7e 100644 --- a/tests/advisor/test_estimates_internals.py +++ b/tests/advisor/test_estimates_internals.py @@ -19,15 +19,41 @@ from autointent._advisor._hub import ModelMeta from autointent._advisor._report import DatasetStats, Severity +# Per-name ModelMeta fixtures used by the offline tests. 
Production resolution +# (HF Hub config.json + safetensors metadata) is mocked away so the batch-fit +# math doesn't depend on whatever fallback the heuristic path returns. +_FAKE_SHAPES: dict[str, tuple[int, int, int]] = { + # (total_params, hidden_size, n_layers) + "microsoft/deberta-v3-large": (350_000_000, 1024, 24), + "microsoft/deberta-v3-small": (140_000_000, 768, 6), + "sentence-transformers/all-MiniLM-L6-v2": (33_000_000, 384, 6), + "intfloat/multilingual-e5-large-instruct": (560_000_000, 1024, 24), +} + + +def _fake_resolve(model_name: str) -> ModelMeta: + known = _FAKE_SHAPES.get(model_name) + params, hidden, layers = known or (110_000_000, 768, 12) + return ModelMeta( + name=model_name, + total_params=params, + weight_bytes_per_param=4, + total_file_bytes=params * 4, + cached_locally=False, + confidence="hub" if known else "heuristic", + hidden_size=hidden, + n_layers=layers, + ) + @pytest.fixture(autouse=True) def _offline(monkeypatch: pytest.MonkeyPatch) -> None: - _hub.hub_reachable.cache_clear() _hub.resolve_model.cache_clear() - offline = lambda *_a, **_kw: False # noqa: E731 - monkeypatch.setattr(_hub, "hub_reachable", offline) - monkeypatch.setattr(_estimates, "hub_reachable", offline) monkeypatch.setattr(_hub, "_is_warm_cached", lambda _name: False) + # Inject deterministic ModelMeta per name; both the _hub re-export and the + # _estimates rebinding need to be replaced for run_preflight to pick it up. 
+ monkeypatch.setattr(_hub, "resolve_model", _fake_resolve) + monkeypatch.setattr(_estimates, "resolve_model", _fake_resolve) def _profile(vram_gb: float = 16.0, accelerator: str = "cuda") -> HardwareProfile: @@ -89,7 +115,7 @@ def test_below_yellow_is_green(self) -> None: assert _classify_severity(estimate=1.0, budget=10.0) == Severity.AMPLE def test_above_yellow_threshold(self) -> None: - assert _classify_severity(estimate=8.0, budget=10.0) == Severity.TIGHT + assert _classify_severity(estimate=9.5, budget=10.0) == Severity.TIGHT def test_at_or_above_red_threshold(self) -> None: assert _classify_severity(estimate=10.0, budget=10.0) == Severity.OVER @@ -104,7 +130,7 @@ class TestVramForTransformer: def meta(self) -> ModelMeta: return ModelMeta( name="x", - params_millions=100.0, + total_params=100_000_000, weight_bytes_per_param=4, total_file_bytes=0, cached_locally=False, @@ -146,10 +172,11 @@ def test_amp_does_reduce_activation_side_vram(self, meta: ModelMeta) -> None: amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=64, seq_len=128) assert amp < fp32 + def test_ram_scales_with_dataset_size() -> None: meta = ModelMeta( name="x", - params_millions=100.0, + total_params=100_000_000, weight_bytes_per_param=4, total_file_bytes=0, cached_locally=False, @@ -177,7 +204,7 @@ def test_dump_modules_adds_disk_during_training(self) -> None: } ], "hpo_config": {"n_trials": 5}, - "dump_modules": True, + "logging_config": {"dump_modules": True}, } report = run_preflight(cfg, DatasetStats.placeholder(), _profile()) assert report.resource.disk_dump_gb > 0 @@ -201,8 +228,7 @@ def test_refit_after_increases_time(self) -> None: "hpo_config": {"n_trials": 10}, } baseline = run_preflight(cfg, DatasetStats.placeholder(), _profile()) - cfg_refit = {**cfg, "refit_after": True} - bumped = run_preflight(cfg_refit, DatasetStats.placeholder(), _profile()) + bumped = run_preflight(cfg, DatasetStats.placeholder(), _profile(), refit_after=True) assert 
bumped.resource.time_hours > baseline.resource.time_hours def test_catboost_gpu_without_cuda_flags_config(self) -> None: @@ -249,7 +275,7 @@ def test_offline_flips_low_confidence(self) -> None: } report = run_preflight(cfg, DatasetStats.placeholder(), _profile()) assert report.low_confidence is True - assert any("HF Hub unreachable" in n for n in report.notes) + assert any("Heuristic fallback" in n for n in report.notes) def test_rare_classes_with_linear_scorer_flag_red(self) -> None: cfg = { @@ -471,13 +497,13 @@ def test_driver_records_current_and_max_batch(self) -> None: report = run_preflight( self._bert_cfg("microsoft/deberta-v3-large", batch_size=64), DatasetStats.placeholder(), - _profile(vram_gb=8.0), + _profile(vram_gb=6.5), ) drivers = [d for d in report.resource.drivers if d["module"] == "bert"] assert drivers d = drivers[0] assert d["batch_size"] == 64 - # vram_gb=8 with ~5 GB weights leaves little room for activations → max < 64. + # vram_gb=6.5 against ~5 GB weights x 0.9 tight ratio -> little activation room, max < 64. assert d["max_batch_size"] is not None assert 0 < d["max_batch_size"] < 64 @@ -523,7 +549,7 @@ def test_multiple_drivers_carry_independent_max_batch(self) -> None: report = run_preflight(cfg, DatasetStats.placeholder(), _profile(vram_gb=10.0)) small = next(d for d in report.resource.drivers if "small" in d["model"]) large = next(d for d in report.resource.drivers if "large" in d["model"]) - # The smaller model has more headroom → larger max batch (or equal-cap when both saturate). + # The smaller model has more headroom -> larger max batch (or equal-cap when both saturate). 
assert small["max_batch_size"] >= large["max_batch_size"] @@ -551,7 +577,7 @@ def test_dump_disk_is_bounded_by_per_node_max_not_sum_of_all_variants(self) -> N } ], "hpo_config": {"n_trials": 4}, - "dump_modules": True, + "logging_config": {"dump_modules": True}, } report = run_preflight(cfg, DatasetStats.placeholder(), _profile()) # Per-node max ~ deberta-v3-large weights (~350M x 4 ~ 1.3 GB). Two-candidate @@ -588,7 +614,7 @@ def test_dump_disk_sums_across_nodes(self) -> None: }, ], "hpo_config": {"n_trials": 2}, - "dump_modules": True, + "logging_config": {"dump_modules": True}, } report = run_preflight(cfg, DatasetStats.placeholder(), _profile()) embedder = _hub.resolve_model("sentence-transformers/all-MiniLM-L6-v2") diff --git a/tests/advisor/test_hub_heuristics.py b/tests/advisor/test_hub_heuristics.py index b43b95522..c19018235 100644 --- a/tests/advisor/test_hub_heuristics.py +++ b/tests/advisor/test_hub_heuristics.py @@ -1,8 +1,8 @@ -"""Tests for the offline name-pattern heuristics in `_hub`. +"""Tests for the offline heuristic fallback in `_hub`. -The advisor must produce a sensible estimate even when HF Hub is -unreachable, so these tests pin the public `hub_reachable` to False and -exercise the heuristic path directly. +The advisor must produce a sensible estimate even when HF Hub is unreachable. +Without a per-name heuristic, every offline lookup collapses to a single +BERT-base-sized default — these tests pin that contract. """ from __future__ import annotations @@ -14,41 +14,28 @@ @pytest.fixture(autouse=True) def _offline(monkeypatch: pytest.MonkeyPatch) -> None: - _hub.hub_reachable.cache_clear() _hub.resolve_model.cache_clear() - monkeypatch.setattr(_hub, "hub_reachable", lambda *_a, **_kw: False) + # Force `_hub_metadata` to behave as if the live Hub were unreachable so + # resolve_model falls through to `_heuristic_metadata`. 
+ monkeypatch.setattr(_hub, "_hub_metadata", lambda _name: None) monkeypatch.setattr(_hub, "_is_warm_cached", lambda _name: False) -@pytest.mark.parametrize( - ("name", "expected_min_m", "expected_max_m"), - [ - ("microsoft/deberta-v3-large", 200, 500), - ("microsoft/deberta-v3-small", 30, 200), - ("sentence-transformers/all-MiniLM-L6-v2", 20, 80), - ("intfloat/multilingual-e5-large-instruct", 300, 700), - ("intfloat/e5-small", 20, 80), - ("distilbert-base-uncased", 40, 150), - ("bert-base-uncased", 70, 200), - ], -) -def test_name_heuristic_picks_reasonable_bucket(name: str, expected_min_m: int, expected_max_m: int) -> None: - meta = _hub.resolve_model(name) - assert meta.confidence == "heuristic" - assert expected_min_m <= meta.params_millions <= expected_max_m, ( - f"{name} got {meta.params_millions}M; expected [{expected_min_m}, {expected_max_m}]" - ) - - -def test_unknown_name_falls_back_to_bert_base() -> None: - meta = _hub.resolve_model("totally-made-up/no-such-model") - assert meta.confidence == "heuristic" - assert meta.params_millions == pytest.approx(110.0) +def test_offline_lookup_uses_bert_base_default() -> None: + """Every offline lookup returns the same BERT-base-sized fallback.""" + for name in ( + "microsoft/deberta-v3-large", + "sentence-transformers/all-MiniLM-L6-v2", + "totally-made-up/no-such-model", + ): + meta = _hub.resolve_model(name) + assert meta.confidence == "heuristic" + assert meta.total_params == _hub._DEFAULT_HEURISTIC_PARAMS def test_weights_gb_matches_params_times_bytes() -> None: meta = _hub.resolve_model("microsoft/deberta-v3-large") - expected_gb = meta.params_millions * 1_000_000 * meta.weight_bytes_per_param / (1024**3) + expected_gb = meta.total_params * meta.weight_bytes_per_param / (1024**3) assert meta.weights_gb == pytest.approx(expected_gb) @@ -75,5 +62,5 @@ def test_metadata_fallback_uses_heuristic_when_hub_unreachable() -> None: the live Hub is unreachable (autouse fixture forces offline).""" meta = 
_hub.resolve_model("microsoft/deberta-v3-large") assert meta.confidence == "heuristic" - assert meta.params_millions > 0 + assert meta.total_params > 0 assert meta.disk_gb > 0 diff --git a/tests/advisor/test_render.py b/tests/advisor/test_render.py index 2c0604a11..7a806c7f2 100644 --- a/tests/advisor/test_render.py +++ b/tests/advisor/test_render.py @@ -56,7 +56,7 @@ def test_contains_phase_blocks(self) -> None: out = render_text(_populated_report()) assert "Resource:" in out assert "Data:" in out - # Config phase has no findings → block omitted + # Config phase has no findings -> block omitted assert "Config:" not in out def test_includes_drivers_block(self) -> None: @@ -119,7 +119,7 @@ def _two_reports(self) -> list[tuple[str, PreflightReport]]: def test_lists_chosen_preset_when_present(self) -> None: out = render_recommendation(self._two_reports(), chosen="a") - assert "→ a" in out + assert "-> a" in out def test_handles_no_chosen(self) -> None: out = render_recommendation(self._two_reports(), chosen=None) @@ -140,7 +140,7 @@ class TestBatchHint: """Per-driver batch cell rendered in the Drivers-of-cost table.""" def test_arrow_when_max_differs(self) -> None: - assert _batch_hint({"batch_size": 64, "max_batch_size": 32}) == "64 → 32" + assert _batch_hint({"batch_size": 64, "max_batch_size": 32}) == "64 -> 32" def test_plain_when_max_equals_current(self) -> None: assert _batch_hint({"batch_size": 64, "max_batch_size": 64}) == "64" @@ -152,7 +152,7 @@ def test_empty_when_no_batch(self) -> None: assert _batch_hint({"batch_size": None, "max_batch_size": None}) == "" def test_increase_arrow(self) -> None: - assert _batch_hint({"batch_size": 32, "max_batch_size": 128}) == "32 → 128" + assert _batch_hint({"batch_size": 32, "max_batch_size": 128}) == "32 -> 128" def test_dataset_stats_in_text_block() -> None: From e0f14866a898714df279f73ff1dccd9f288725ed Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 
21:26:39 +0300 Subject: [PATCH 13/16] remove from init --- src/autointent/_advisor/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/autointent/_advisor/__init__.py b/src/autointent/_advisor/__init__.py index 28422c78d..d8eb6f5ba 100644 --- a/src/autointent/_advisor/__init__.py +++ b/src/autointent/_advisor/__init__.py @@ -10,10 +10,9 @@ from ._estimates import run_preflight from ._hardware import HardwareProfile, detect_hardware from ._report import DatasetStats, Finding, PreflightReport, RecommendationResult, ResourceEstimate, Severity -from ._workflows import BUNDLED_PRESETS, inspect, load_config, recommend, stats_from_dataset +from ._workflows import inspect, load_config, recommend, stats_from_dataset __all__ = [ - "BUNDLED_PRESETS", "DatasetStats", "Finding", "HardwareProfile", From 6496b4e1873d93ef5e5cc53cdb43b2612611f4d5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:29:13 +0300 Subject: [PATCH 14/16] revert pyproject.toml --- pyproject.toml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ace1a3c77..276361669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -298,12 +298,6 @@ module = [ "dspy.evaluate.auto_evaluation", "codecarbon", "catboost", - "openai", - "openai.*", - "tiktoken", - "peft", - "sentence_transformers", - "psutil", ] ignore_missing_imports = true From bc3df74af98ae1e63aac3b3f69391b223d03585d Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:37:55 +0300 Subject: [PATCH 15/16] update typing --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 276361669..7677ac2aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,6 +120,7 @@ typing = [ "joblib-stubs (>=1.4.2.5.20240918,<2.0.0)", "pandas-stubs (>= 2.2.3.250527, <3.0.0)", "types-aiofiles (>=24.1.0.20250606)", + 
"types-psutil>=7.2.2.20260518", ] docs = [ "sphinx (>=8.1.3,<9.0.0)", From 7cb0f53e9929465637290dbfce27c7661ee9bd61 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:10:14 +0300 Subject: [PATCH 16/16] refactor --- src/autointent/_advisor/_estimates.py | 798 ------------------ src/autointent/_advisor/_hub.py | 2 +- .../_advisor/{_workflows.py => workflows.py} | 17 +- src/autointent/custom_types/_types.py | 12 +- tests/advisor/test_estimates_internals.py | 33 +- 5 files changed, 26 insertions(+), 836 deletions(-) delete mode 100644 src/autointent/_advisor/_estimates.py rename src/autointent/_advisor/{_workflows.py => workflows.py} (92%) diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py deleted file mode 100644 index faeb4b1ee..000000000 --- a/src/autointent/_advisor/_estimates.py +++ /dev/null @@ -1,798 +0,0 @@ -"""Resource-phase estimation: walk the search space and aggregate cost. - -Implements an honest worst-case for the modules the proposal lists as -in-scope. Formulas are intentionally coarse — the advisor's contract is -"heuristic upper bound, not measurement". Time and VRAM are the noisiest; -treat them as ballparks, not budgets. 
-""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any - -from pydantic import ValidationError - -from autointent._optimization_config import OptimizationConfig -from autointent.configs._embedder import ( - EmbedderConfig, - OpenaiEmbeddingConfig, - SentenceTransformerEmbeddingConfig, - VllmEmbeddingConfig, -) - -from ._hub import resolve_model -from ._report import PreflightReport, ResourceEstimate, Severity - -if TYPE_CHECKING: - from collections.abc import Iterable - - from ._hardware import HardwareProfile - from ._hub import ModelMeta - from ._report import DatasetStats - -_MULTICLASS_THRESHOLD = 2 -_BYTES_PER_GB = 1024**3 # binary GiB convention; matches all advisor byte->GB conversions - -# Fallback architecture shape (BERT-base) used only when the model's actual -# config.json couldn't be fetched from HF Hub — see _hub._shape_from_config. -_DEFAULT_HIDDEN = 768 -_DEFAULT_LAYERS = 12 - -logger = logging.getLogger(__name__) - - -def _validated_config(config: dict[str, Any]) -> OptimizationConfig: - """Validate ``config`` against the project's canonical ``OptimizationConfig``. - - The advisor is best-effort: a malformed user config should still produce a - report (with placeholder costs) rather than crashing, so any validation - error falls back to the model defaults. - """ - try: - return OptimizationConfig.model_validate(config) - except ValidationError as e: - logger.warning("Advisor config failed validation; falling back to defaults: %s", e) - # OptimizationConfig requires `search_space`; build a minimal valid default. - return OptimizationConfig.model_validate({"search_space": []}) - - -_TIGHT_RATIO = 0.9 - -# Union variants of EmbedderConfig that carry a model_name attribute. -# HashingVectorizerEmbeddingConfig and the bare BaseEmbedderConfig don't have -# one (sklearn vectorizer / abstract base), so we filter them out below. 
-_MODEL_BACKED_EMBEDDERS = ( - SentenceTransformerEmbeddingConfig, - OpenaiEmbeddingConfig, - VllmEmbeddingConfig, -) - - -def _embedder_model_name(embedder: EmbedderConfig) -> str | None: - """Return the embedder's model_name when the config variant carries one.""" - if isinstance(embedder, _MODEL_BACKED_EMBEDDERS): - return embedder.model_name - return None - - -# Maps each fine-tunable transformer module to its training-mode label. -# Modules not listed (or listed as "inference") run the encoder forward-only. -# Note: dnnc keeps the cross-encoder frozen and trains an sklearn LogisticRegressionCV -# head on top of its features (see autointent._wrappers.ranker.Ranker._fit), so the -# encoder's VRAM profile matches inference rather than fine-tuning. -_TRANSFORMER_TRAINING_MODE = { - "bert": "full-finetune", - "ptuning": "lora", - "lora": "lora", -} - -# Fallback max_length when the search-space entry doesn't pin it. Used both as -# the default in _vram_for_transformer and in the entry-walk seq_len resolution. -_DEFAULT_SEQ_LEN = 128 - -# Coefficients for the linear / catboost time formulas (proposal §"Algorithm"). -_LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8 -_CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9 -_CATBOOST_GPU_SPEEDUP = 10.0 -# LogisticRegressionCV defaults: Cs=10, cv=3 -> 31 inner fits + 1 final refit. -_LOGREG_CV_MULTIPLIER = 31 -_CATBOOST_DEFAULT_BINS = 254 -# Bytes per histogram bucket / tree node — order-of-magnitude constants. 
-_CATBOOST_BYTES_PER_TREE_NODE = 32 - - -def _extract_model_names(module_entry: dict[str, Any]) -> list[str]: - """Pull model name(s) from a search-space module entry.""" - candidates: list[str] = [] - cfg = module_entry.get("classification_model_config") - if isinstance(cfg, list): - candidates.extend(c["model_name"] for c in cfg if isinstance(c, dict) and c.get("model_name")) - elif isinstance(cfg, dict) and cfg.get("model_name"): - candidates.append(cfg["model_name"]) - embedder_cfg = module_entry.get("embedder_config") - if isinstance(embedder_cfg, list): - candidates.extend(c["model_name"] for c in embedder_cfg if isinstance(c, dict) and c.get("model_name")) - elif isinstance(embedder_cfg, dict) and embedder_cfg.get("model_name"): - candidates.append(embedder_cfg["model_name"]) - return candidates - - -def _max_int(value: Any, default: int) -> int: # noqa: ANN401 - if value is None: - return default - if isinstance(value, list) and value: - return max(int(x) for x in value) - if isinstance(value, dict): - return int(value.get("high", default)) - try: - return int(value) - except (TypeError, ValueError): - return default - - -def _walk_modules_indexed( - search_space: list[dict[str, Any]], -) -> Iterable[tuple[int, str, dict[str, Any]]]: - """Yield (node_index, node_type, module_entry) — index lets us bound per-node max cost.""" - for node_idx, node in enumerate(search_space or []): - node_type = node.get("node_type", "?") - for entry in node.get("search_space", []) or []: - yield node_idx, node_type, entry - - -def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dict[str, Any]]]: - """Yield (node_type, module_entry) pairs — index-agnostic view over `_walk_modules_indexed`.""" - for _, node_type, entry in _walk_modules_indexed(search_space): - yield node_type, entry - - -def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float: - """Weight-side VRAM in GB — weights + grads + Adam optimizer state. Excludes activations. 
- - Modes: - * ``inference``: forward only — weights + ~30% intermediate-tensor overhead. - * ``lora``: frozen base + small trainable adapters + their grads/optimizer (~0.5 GB). - * ``full-finetune`` (default): weights + grads + Adam (m, v) = 4x weights. - """ - weights_gb = meta.weights_gb - if mode == "inference": - return weights_gb * 1.3 - if mode == "lora": - return weights_gb * 1.3 + 0.5 - return weights_gb * 4.0 - - -def _vram_for_transformer( - meta: ModelMeta, - mode: str, - mixed_precision: bool, - *, - batch_size: int = 0, - seq_len: int = _DEFAULT_SEQ_LEN, -) -> float: - """Total VRAM in GB: weights + grads + optimizer state + activations x batch. - - Activation accounting differs by mode — training keeps per-layer outputs for - backward; inference only needs one or two layers in flight. - """ - base = _weights_vram_for_transformer(meta, mode) - if batch_size <= 0: - return base - per_sample = _activations_gb_per_sample( - meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference" - ) - return base + per_sample * batch_size - - -def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float: - """RAM in GB. Loose upper bound.""" - return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / _BYTES_PER_GB - - -def _floor_to_power_of_two(n: int) -> int: - """Largest power of two ≤ n; returns 0 when n < 1.""" - if n < 1: - return 0 - power = 1 - while power * 2 <= n: - power *= 2 - return power - - -def _n_layers(meta: ModelMeta | None) -> int: - """Layer count from the model's ``config.json``; falls back to BERT-base when absent.""" - if meta is not None and meta.n_layers is not None: - return meta.n_layers - return _DEFAULT_LAYERS - - -def _activations_gb_per_sample( - meta: ModelMeta | None, - seq_len: int, - *, - mixed_precision: bool, - is_training: bool, -) -> float: - """Heuristic activation memory per sample. - - Training: ``seq_len x hidden x layers x const`` — per-layer outputs are kept - for backward. 
- Inference: ``seq_len x hidden x const`` — only one or two layers' outputs in - flight at once. - Mixed precision halves activation bytes. - """ - hidden = _embedder_dim(meta) - # Training keeps every layer's outputs for backward -> scales x n_layers. - # The 16-byte/token/layer coefficient bundles fp32 activation + ~4x backward overhead. - # Inference only holds ~1-2 layers' outputs in flight at once. - bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16 if is_training else seq_len * hidden * 8 - if mixed_precision: - bytes_per_sample //= 2 - return bytes_per_sample / _BYTES_PER_GB - - -def _max_fitting_batch_size( - *, - weight_vram_gb: float, - vram_budget_gb: float, - per_sample_gb: float, -) -> int: - """Largest batch that keeps total VRAM under the AMPLE/TIGHT threshold. - - Returns 0 when even the weights blow the budget. Result is rounded down to - the nearest power of two. - """ - if per_sample_gb <= 0: - return 0 - target_vram = vram_budget_gb * _TIGHT_RATIO - available_for_activations = target_vram - weight_vram_gb - if available_for_activations <= 0: - return 0 - return _floor_to_power_of_two(int(available_for_activations / per_sample_gb)) - - -def _embedder_dim(meta: ModelMeta | None) -> int: - """Hidden size from the model's ``config.json``; falls back to BERT-base when absent.""" - if meta is not None and meta.hidden_size is not None: - return meta.hidden_size - return _DEFAULT_HIDDEN - - -def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None: - if not seen_models: - return None - return max(seen_models.values(), key=lambda m: m.total_params) - - -def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float: - """Float64 design matrix dominates; coefficients and L-BFGS history are small.""" - data_bytes = 8.0 * stats.n_samples * embedder_dim - coef_bytes = 8.0 * max(1, stats.n_classes) * embedder_dim - lbfgs_bytes = 10.0 * 8.0 * embedder_dim - return (data_bytes + coef_bytes + lbfgs_bytes) / _BYTES_PER_GB - - 
-def _time_for_linear( - *, - n_trials: int, - n_samples: int, - embedder_dim: int, - max_iter: int, - cv_multiplier: int, - class_multiplier: int, -) -> float: - seconds = ( - n_trials - * _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER - * n_samples - * embedder_dim - * max_iter - * cv_multiplier - * class_multiplier - ) - return seconds / 3600.0 - - -def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int, depth: int) -> float: - data_bytes = 4.0 * stats.n_samples * n_features - histograms_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS - trees_bytes = iterations * (2**depth) * _CATBOOST_BYTES_PER_TREE_NODE - return float((data_bytes + histograms_bytes + trees_bytes) / _BYTES_PER_GB) - - -def _time_for_catboost( - *, - n_trials: int, - n_samples: int, - n_features: int, - iterations: int, - depth: int, - class_multiplier: int, - on_gpu: bool, -) -> float: - coeff = _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER - if on_gpu: - coeff /= _CATBOOST_GPU_SPEEDUP - seconds = n_trials * iterations * coeff * n_samples * n_features * depth * class_multiplier - return seconds / 3600.0 - - -def _time_for_transformer( - *, - n_trials: int, - epochs: int, - batch_size: int, - n_samples: int, -) -> float: - """Transformer training time in hours, assuming a flat 1 second per step. - - The advisor has no real wall-time calibration across hardware tiers / model - sizes, so the report uses ``time_hours`` as a step-count proxy rather than - pretending to estimate seconds. Users should treat the number as ordering / - ballpark information, not a budget. 
- """ - steps = max(1, (n_samples // max(1, batch_size))) * epochs - return (n_trials * steps) / 3600.0 - - -def _classify_severity(estimate: float, budget: float) -> Severity: - if estimate <= 0: - return Severity.AMPLE - if budget <= 0: - return Severity.TIGHT - ratio = estimate / budget - if ratio >= 1: - return Severity.OVER - if ratio >= _TIGHT_RATIO: - return Severity.TIGHT - return Severity.AMPLE - - -@dataclass -class _ModuleEstimate: - """Per-module cost contribution + the dict that gets rendered in the report.""" - - driver: dict[str, Any] - vram_gb: float - ram_gb: float - time_hours: float - model_weights_gb: float = 0.0 - - -def _refit_factor(*, refit_after: bool, n_trials: int) -> float: - """Wall-time multiplier for ``refit_after=True`` (amortized 1/n_trials extra).""" - return 1 + 1.0 / max(1, n_trials) if refit_after else 1.0 - - -def _split_entries( - search_space: list[dict[str, Any]], -) -> tuple[list[tuple[int, str, dict[str, Any]]], list[tuple[int, str, dict[str, Any]]]]: - """Partition search-space entries into (transformer-bearing, classic).""" - transformer: list[tuple[int, str, dict[str, Any]]] = [] - classic: list[tuple[int, str, dict[str, Any]]] = [] - for node_idx, node_type, entry in _walk_modules_indexed(search_space): - bucket = classic if entry.get("module_name") in {"linear", "catboost"} else transformer - bucket.append((node_idx, node_type, entry)) - return transformer, classic - - -def _estimate_transformer_model( - *, - meta: ModelMeta, - entry: dict[str, Any], - node_type: str, - module: str, - name: str, - stats: DatasetStats, - hardware: HardwareProfile, - n_trials: int, - refit_after: bool, -) -> _ModuleEstimate: - """One row of cost for a transformer module + a specific model checkpoint.""" - mixed_precision = entry.get("dtype") in {"fp16", "bf16"} - mode = _TRANSFORMER_TRAINING_MODE.get(module, "inference") - batch_size = _max_int(entry.get("batch_size"), 32) - epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == 
"inference" else 10) - seq_len = _max_int(entry.get("max_length"), _DEFAULT_SEQ_LEN) - - vram = _vram_for_transformer(meta, mode, mixed_precision, batch_size=batch_size, seq_len=seq_len) - ram = _ram_for_module(meta, stats) - - driver_max_batch: int | None = None - if hardware.vram_gb > 0: - driver_max_batch = _max_fitting_batch_size( - weight_vram_gb=_weights_vram_for_transformer(meta, mode), - vram_budget_gb=hardware.vram_gb, - per_sample_gb=_activations_gb_per_sample( - meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference" - ), - ) - - time_h = _time_for_transformer( - n_trials=n_trials, - epochs=epochs, - batch_size=batch_size, - n_samples=stats.n_samples, - ) - if mode != "inference": - time_h *= _refit_factor(refit_after=refit_after, n_trials=n_trials) - - return _ModuleEstimate( - driver={ - "node_type": node_type, - "module": module, - "model": name, - "mode": mode, - "vram_gb": round(vram, 2), - "ram_gb": round(ram, 2), - "time_hours": round(time_h, 2), - "batch_size": batch_size, - "max_batch_size": driver_max_batch, - "confidence": meta.confidence, - }, - vram_gb=vram, - ram_gb=ram, - time_hours=time_h, - model_weights_gb=meta.weights_gb, - ) - - -def _estimate_classic_entry( - *, - entry: dict[str, Any], - node_type: str, - embedder_meta: ModelMeta | None, - embedder_dim: int, - stats: DatasetStats, - hardware: HardwareProfile, - n_trials: int, - refit_after: bool, -) -> _ModuleEstimate | None: - """Cost row for a linear or catboost scorer (returns ``None`` for any other module).""" - module = entry.get("module_name", "?") - refit = _refit_factor(refit_after=refit_after, n_trials=n_trials) - # Both multinomial (multiclass) and one-vs-rest (multilabel) LR scale linearly in n_classes. 
- class_multiplier = max(1, stats.n_classes) - - if module == "linear": - cv_multiplier = 1 if stats.multilabel else _LOGREG_CV_MULTIPLIER - ram = _ram_for_linear(stats=stats, embedder_dim=embedder_dim) - time_h = ( - _time_for_linear( - n_trials=n_trials, - n_samples=stats.n_samples, - embedder_dim=embedder_dim, - max_iter=_max_int(entry.get("max_iter"), 100), - cv_multiplier=cv_multiplier, - class_multiplier=class_multiplier, - ) - * refit - ) - vram = 0.0 - mode = "linear-cv" if cv_multiplier > 1 else "linear" - elif module == "catboost": - on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda" - # CatBoost MultiClass loss grows per-class trees only above binary; binary uses - # Logloss with one tree per iteration. - cb_class_mult = class_multiplier if stats.n_classes > _MULTICLASS_THRESHOLD or stats.multilabel else 1 - iterations = _max_int(entry.get("iterations"), 1000) - depth = _max_int(entry.get("depth"), 6) - ram_total = _ram_for_catboost(stats=stats, n_features=embedder_dim, iterations=iterations, depth=depth) - time_h = ( - _time_for_catboost( - n_trials=n_trials, - n_samples=stats.n_samples, - n_features=embedder_dim, - iterations=iterations, - depth=depth, - class_multiplier=cb_class_mult, - on_gpu=on_gpu, - ) - * refit - ) - vram, ram = (ram_total, 0.0) if on_gpu else (0.0, ram_total) - mode = "catboost-gpu" if on_gpu else "catboost" - else: - return None - - return _ModuleEstimate( - driver={ - "node_type": node_type, - "module": module, - "model": embedder_meta.name if embedder_meta else "(no embedder)", - "mode": mode, - "vram_gb": round(vram, 2), - "ram_gb": round(ram, 2), - "time_hours": round(time_h, 2), - "batch_size": None, - "max_batch_size": None, - "confidence": embedder_meta.confidence if embedder_meta else "heuristic", - }, - vram_gb=vram, - ram_gb=ram, - time_hours=time_h, - ) - - -def _aggregate_disk( - estimate: ResourceEstimate, - seen_models: dict[str, ModelMeta], - node_max_weights: dict[int, float], - *, - 
dump_modules: bool, - n_trials: int, -) -> None: - """Fold per-model download/cached sizes into ``estimate`` and apply dump-modules accounting.""" - for meta in seen_models.values(): - if meta.cached_locally: - estimate.disk_cached_gb += meta.disk_gb - else: - estimate.disk_download_gb += meta.disk_gb - if dump_modules: - # Each trial selects one variant per node, so per-trial dumped weights - # are bounded by the heaviest module in each node, summed across nodes. - estimate.disk_dump_gb = sum(node_max_weights.values()) * n_trials - - -def _emit_resource_findings( - report: PreflightReport, - estimate: ResourceEstimate, - hardware: HardwareProfile, - *, - n_jobs: int, -) -> None: - """Translate aggregated estimates into VRAM/RAM/disk/time findings on the report.""" - parallel_gpu = n_jobs > 1 and hardware.accelerator in {"cuda", "mps"} - effective_vram = estimate.vram_gb * n_jobs if parallel_gpu else estimate.vram_gb - # MPS shares one unified pool: parallel workers each allocate weights+activations - # in RAM, so peak RAM also scales with n_jobs on Apple Silicon. 
- effective_ram = estimate.ram_gb * n_jobs if n_jobs > 1 and hardware.accelerator == "mps" else estimate.ram_gb - - if hardware.accelerator == "cpu" and effective_vram > 0: - report.add( - "resource", - Severity.TIGHT, - f"No GPU detected; transformer modules will be very slow (worst case ~{estimate.time_hours:.1f} h).", - metric="vram", - ) - else: - msg = f"VRAM ~{effective_vram:.1f} GB" - if n_jobs > 1: - msg += f" (= per-trial {estimate.vram_gb:.1f} GB × {n_jobs} parallel trials)" - msg += f" vs available {hardware.vram_gb:.1f} GB" - report.add("resource", _classify_severity(effective_vram, hardware.vram_gb), msg, metric="vram") - - report.add( - "resource", - _classify_severity(effective_ram, hardware.ram_gb), - f"RAM ~{effective_ram:.1f} GB vs available {hardware.ram_gb:.1f} GB", - metric="ram", - ) - - disk_total = estimate.disk_download_gb + estimate.disk_dump_gb - disk_msg = f"Disk ~{estimate.disk_download_gb:.1f} GB to download" - if estimate.disk_cached_gb > 0: - disk_msg += f", {estimate.disk_cached_gb:.1f} GB already cached" - if estimate.disk_dump_gb > 0: - disk_msg += f", +{estimate.disk_dump_gb:.1f} GB during training (dump_modules=True)" - disk_msg += f" vs {hardware.free_disk_gb:.0f} GB free" - report.add("resource", _classify_severity(disk_total, hardware.free_disk_gb), disk_msg, metric="disk") - - if estimate.time_hours > 0: - report.add( - "resource", - Severity.AMPLE, - f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)", - metric="time", - ) - - -def _resource_phase( - config: dict[str, Any], - stats: DatasetStats, - hardware: HardwareProfile, - report: PreflightReport, - *, - refit_after: bool = False, -) -> None: - cfg = _validated_config(config) - n_trials = cfg.hpo_config.n_trials - n_jobs = cfg.hpo_config.n_jobs - dump_modules = cfg.logging_config.dump_modules - - seen_models: dict[str, ModelMeta] = {} - global_embedder = _embedder_model_name(cfg.embedder_config) - if global_embedder: - seen_models[global_embedder] = 
resolve_model(global_embedder) - - transformer_entries, classic_entries = _split_entries(cfg.search_space) - - # First pass: transformer modules (also populates seen_models for the classic pass). - module_estimates: list[_ModuleEstimate] = [] - node_max_weights: dict[int, float] = {} - for node_idx, node_type, entry in transformer_entries: - module = entry.get("module_name", "?") - model_names = _extract_model_names(entry) - if not model_names and global_embedder and module in {"knn", "mlknn"}: - model_names = [global_embedder] - for name in model_names: - meta = seen_models.setdefault(name, resolve_model(name)) - me = _estimate_transformer_model( - meta=meta, - entry=entry, - node_type=node_type, - module=module, - name=name, - stats=stats, - hardware=hardware, - n_trials=n_trials, - refit_after=refit_after, - ) - module_estimates.append(me) - # Track heaviest weight per node so dump_modules is bounded by one - # selected variant per node x n_trials, not the sum of all candidates. - node_max_weights[node_idx] = max(node_max_weights.get(node_idx, 0.0), me.model_weights_gb) - - # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint. 
- embedder_meta = _largest_embedder(seen_models) - embedder_dim = _embedder_dim(embedder_meta) - for _, node_type, entry in classic_entries: - classic_estimate = _estimate_classic_entry( - entry=entry, - node_type=node_type, - embedder_meta=embedder_meta, - embedder_dim=embedder_dim, - stats=stats, - hardware=hardware, - n_trials=n_trials, - refit_after=refit_after, - ) - if classic_estimate is not None: - module_estimates.append(classic_estimate) - - estimate = ResourceEstimate(parallel_factor=n_jobs) - for me in module_estimates: - estimate.vram_gb = max(estimate.vram_gb, me.vram_gb) - estimate.ram_gb = max(estimate.ram_gb, me.ram_gb) - estimate.time_hours += me.time_hours - estimate.drivers.append(me.driver) - - _aggregate_disk(estimate, seen_models, node_max_weights, dump_modules=dump_modules, n_trials=n_trials) - - # Flip low_confidence if any model fell back to the heuristic path (Hub - # unreachable, repo missing safetensors metadata, local-path checkpoint). - heuristic_models = [m.name for m in seen_models.values() if m.confidence == "heuristic"] - if heuristic_models: - report.low_confidence = True - report.notes.append( - f"Heuristic fallback used for {len(heuristic_models)} model(s) — sizes are BERT-base " - f"defaults: {', '.join(heuristic_models[:3])}{'...' 
if len(heuristic_models) > 3 else ''}", # noqa: PLR2004 - ) - - report.resource = estimate - _emit_resource_findings(report, estimate, hardware, n_jobs=n_jobs) - - -def _config_phase( - config: dict[str, Any], - hardware: HardwareProfile, - report: PreflightReport, -) -> None: - hpo = config.get("hpo_config") or {} - n_jobs = int(hpo.get("n_jobs", 1)) - - if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}: - report.add( - "config", - Severity.TIGHT, - f"hpo_config.n_jobs={n_jobs} on a single GPU multiplies VRAM demand by {n_jobs}×.", - ) - - uses_catboost_gpu = False - for _, entry in _walk_modules(config.get("search_space") or []): - if entry.get("module_name") == "catboost" and entry.get("task_type") == "GPU": - uses_catboost_gpu = True - break - if uses_catboost_gpu and hardware.accelerator != "cuda": - report.add( - "config", - Severity.TIGHT, - "CatBoost task_type=GPU configured but no CUDA detected — will fall back to CPU.", - ) - - -def _data_phase( - config: dict[str, Any], - stats: DatasetStats, - report: PreflightReport, -) -> None: - # token-length truncation (heuristic — we use stats.p95_tokens vs configured max_length) - p95 = stats.p95_tokens or int(stats.avg_tokens * 2.5) - for _, entry in _walk_modules(config.get("search_space") or []): - max_len_value = entry.get("max_length") - if max_len_value is None: - continue - max_len = _max_int(max_len_value, 512) - if p95 > max_len: - severity = Severity.OVER if p95 > max_len * 1.5 else Severity.TIGHT - module_name = entry.get("module_name", "?") - report.add( - "data", - severity, - f"Train tokens p95~{p95} exceeds {module_name}.max_length={max_len}; expect silent truncation.", - ) - - # rare class x linear-CV (LogisticRegressionCV cv=3 needs >=3 samples/class; - # multilabel path uses one-vs-rest without CV so the failure can't occur there) - has_linear = any(e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or [])) - if has_linear and stats.rare_classes and not 
stats.multilabel: - report.add( - "data", - Severity.OVER, - (f"LogisticRegressionCV (cv=3) will fail: classes {stats.rare_classes[:5]} have <3 samples."), - ) - - # partial descriptions x description scorer - description_modules = {"description_bi", "description_cross", "description_llm"} - has_description = any( - e.get("module_name") in description_modules for _, e in _walk_modules(config.get("search_space") or []) - ) - if has_description and stats.has_descriptions is False: - report.add( - "data", - Severity.OVER, - "description scorer present but intent descriptions are missing — fill them in or drop the scorer.", - ) - - -def run_preflight( - config: dict[str, Any], - stats: DatasetStats, - hardware: HardwareProfile, - *, - preset_name: str | None = None, - refit_after: bool = False, -) -> PreflightReport: - """Run all three phases and return one report. - - Args: - config: parsed preset / OptimizationConfig dict (top-level keys: - ``search_space``, ``hpo_config``, optional ``embedder_config``, - optional ``logging_config.dump_modules``). - stats: dataset statistics (real or placeholder). - hardware: detected hardware profile. - preset_name: optional friendly name for the report header. - refit_after: matches the ``Pipeline.fit(refit_after=...)`` argument. - When True, time estimates include the extra refit-on-full-data pass. - - Returns: - PreflightReport with findings across resource/data/config phases. 
- """ - report = PreflightReport( - preset_name=preset_name, - hardware={ - "accelerator": hardware.accelerator, - "device_name": hardware.device_name, - "vram_gb": round(hardware.vram_gb, 2), - "ram_gb": round(hardware.ram_gb, 2), - "free_disk_gb": round(hardware.free_disk_gb, 2), - "device_class": hardware.device_class, - }, - dataset={ - "n_samples": stats.n_samples, - "n_classes": stats.n_classes, - "avg_tokens": stats.avg_tokens, - "p95_tokens": stats.p95_tokens, - "multilabel": stats.multilabel, - "source": stats.source, - }, - ) - report.notes.extend(hardware.notes) - - _resource_phase(config, stats, hardware, report, refit_after=refit_after) - _data_phase(config, stats, report) - _config_phase(config, hardware, report) - - return report diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py index a5b6126a5..677e6045c 100644 --- a/src/autointent/_advisor/_hub.py +++ b/src/autointent/_advisor/_hub.py @@ -94,7 +94,7 @@ def _hub_metadata(model_name: str) -> ModelMeta | None: return None # Bytes-per-element for safetensors dtype strings. Used to convert the per-dtype # parameter counts (info.safetensors.parameters) into a weighted average - # bytes-per-param for mixed-precision repos. + # bytes-per-param when a checkpoint stores tensors in multiple dtypes. 
_dtype_bytes: dict[str, int] = { "F64": 8, "F32": 4, diff --git a/src/autointent/_advisor/_workflows.py b/src/autointent/_advisor/workflows.py similarity index 92% rename from src/autointent/_advisor/_workflows.py rename to src/autointent/_advisor/workflows.py index 0bd7ee5bf..2331e225a 100644 --- a/src/autointent/_advisor/_workflows.py +++ b/src/autointent/_advisor/workflows.py @@ -20,9 +20,9 @@ from autointent.custom_types import SearchSpacePreset from autointent.utils import load_preset -from ._estimates import run_preflight from ._hardware import detect_hardware from ._report import DatasetStats, RecommendationResult, Severity +from .runner import run_preflight if TYPE_CHECKING: from collections.abc import Iterable @@ -196,10 +196,11 @@ def recommend( ``RecommendationResult`` with the chosen preset name and full results list. Note: - Among feasible presets we pick the one with the largest estimated - ``time_hours`` (ties broken alphabetically). Higher-quality presets cost - more wall-time, so the slowest feasible preset is also the heaviest - preset that still fits the hardware — i.e. "use what you have". + Among feasible presets we pick the heaviest one that still fits the + hardware budget — "use what you have" semantics. This is a *cost* + ranking, not a quality ranking: a heavier preset is not strictly better + and may overfit on small datasets where a classic-* preset would win on + accuracy. Override ``presets=`` if you want a different ranking. """ hardware = detect_hardware(vram_budget_gb=budget_vram_gb) stats = stats or DatasetStats.placeholder() @@ -221,11 +222,9 @@ def recommend( ) results.append((preset, report)) - # Rank by Literal position (lower index = higher quality); presets the user - # passed via the ``presets`` override but not in BUNDLED_PRESETS sort last. 
- quality_rank = {name: i for i, name in enumerate(BUNDLED_PRESETS)} + cost_rank = {name: i for i, name in enumerate(BUNDLED_PRESETS)} feasible = [(name, r) for name, r in results if r.is_feasible] - feasible.sort(key=lambda pair: (quality_rank.get(pair[0], len(BUNDLED_PRESETS)), pair[0])) + feasible.sort(key=lambda pair: (cost_rank.get(pair[0], len(BUNDLED_PRESETS)), pair[0])) chosen = feasible[0][0] if feasible else None return RecommendationResult(chosen=chosen, results=results) diff --git a/src/autointent/custom_types/_types.py b/src/autointent/custom_types/_types.py index a54da368d..59e6b87a3 100644 --- a/src/autointent/custom_types/_types.py +++ b/src/autointent/custom_types/_types.py @@ -128,10 +128,14 @@ class Split: "zero-shot-encoders", "classic-light", ] -"""Bundled search-space presets, listed in descending quality order. - -The order is consumed by ``autointent._advisor.recommend`` to pick the -highest-quality feasible preset (lower index = higher quality).""" +"""Bundled search-space presets, listed in descending resource-cost order. + +Heavier presets explore more / larger models and take longer to run. The order +is a cost ranking, **not** a quality ranking: a heavier preset is not strictly +better — e.g. ``transformers-heavy`` will overfit on tiny datasets where a +classic-* preset wins on accuracy. 
``autointent._advisor.recommend`` uses this +ordering to pick the heaviest preset that still fits the hardware budget, +which is a reasonable default but not always the right choice for the data.""" class Document(BaseModel): diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py index 3fa293b7e..3d4e4e526 100644 --- a/tests/advisor/test_estimates_internals.py +++ b/tests/advisor/test_estimates_internals.py @@ -138,18 +138,18 @@ def meta(self) -> ModelMeta: ) def test_full_finetune_is_larger_than_lora_is_larger_than_inference(self, meta: ModelMeta) -> None: - inference = _vram_for_transformer(meta, "inference", mixed_precision=False) - lora = _vram_for_transformer(meta, "lora", mixed_precision=False) - full = _vram_for_transformer(meta, "full-finetune", mixed_precision=False) + inference = _vram_for_transformer(meta, "inference") + lora = _vram_for_transformer(meta, "lora") + full = _vram_for_transformer(meta, "full-finetune") assert inference < lora < full def test_inference_activations_are_smaller_than_training(self, meta: ModelMeta) -> None: """Inference doesn't store per-layer outputs for backward — activation memory should be many times smaller than training at the same batch_size.""" - train_total = _vram_for_transformer(meta, "full-finetune", False, batch_size=64, seq_len=128) - train_weights = _vram_for_transformer(meta, "full-finetune", False, batch_size=0) - inf_total = _vram_for_transformer(meta, "inference", False, batch_size=64, seq_len=128) - inf_weights = _vram_for_transformer(meta, "inference", False, batch_size=0) + train_total = _vram_for_transformer(meta, "full-finetune", batch_size=64, seq_len=128) + train_weights = _vram_for_transformer(meta, "full-finetune", batch_size=0) + inf_total = _vram_for_transformer(meta, "inference", batch_size=64, seq_len=128) + inf_weights = _vram_for_transformer(meta, "inference", batch_size=0) train_acts = train_total - train_weights inf_acts = inf_total - inf_weights 
assert inf_acts > 0 @@ -157,21 +157,6 @@ def test_inference_activations_are_smaller_than_training(self, meta: ModelMeta) # 12-layer model: training activations should be at least ~5x inference. assert train_acts / inf_acts > 5 - def test_amp_does_not_reduce_weight_side_vram(self, meta: ModelMeta) -> None: - """Weight-side AMP accounting: fp16 weights+grads (W) + fp32 master copy (W) - + fp32 Adam moments (2W) = 4W, identical to pure fp32. AMP's savings live - in activations, not the optimizer.""" - full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False, batch_size=0) - full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=0) - assert full_amp == pytest.approx(full_fp32) - - def test_amp_does_reduce_activation_side_vram(self, meta: ModelMeta) -> None: - """When a batch is configured, AMP halves activation bytes — total VRAM - with batch should be strictly smaller under AMP than fp32.""" - fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False, batch_size=64, seq_len=128) - amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=64, seq_len=128) - assert amp < fp32 - def test_ram_scales_with_dataset_size() -> None: meta = ModelMeta( @@ -497,13 +482,13 @@ def test_driver_records_current_and_max_batch(self) -> None: report = run_preflight( self._bert_cfg("microsoft/deberta-v3-large", batch_size=64), DatasetStats.placeholder(), - _profile(vram_gb=6.5), + _profile(vram_gb=7.5), ) drivers = [d for d in report.resource.drivers if d["module"] == "bert"] assert drivers d = drivers[0] assert d["batch_size"] == 64 - # vram_gb=6.5 against ~5 GB weights x 0.9 tight ratio -> little activation room, max < 64. + # vram_gb=7.5 against ~5.9 GB weights x 0.9 tight ratio -> little activation room, max < 64. assert d["max_batch_size"] is not None assert 0 < d["max_batch_size"] < 64