From b6be787cc860feed1c1b552228b5eedb0f6645bc Mon Sep 17 00:00:00 2001
From: voorhs <ilya_alekseev_2016@list.ru>
Date: Sat, 23 May 2026 19:31:59 +0300
Subject: [PATCH 01/16] add spec

---
 compute-feasibility-advisor-proposal.md | 201 ++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 compute-feasibility-advisor-proposal.md

diff --git a/compute-feasibility-advisor-proposal.md b/compute-feasibility-advisor-proposal.md
new file mode 100644
index 000000000..2560d1279
--- /dev/null
+++ b/compute-feasibility-advisor-proposal.md
@@ -0,0 +1,201 @@
+# Compute Feasibility Advisor for AutoIntent
+
+- **Date:** 2026-05-23
+- **Status:** Proposal (pre-implementation)
+- **Audience:** AutoIntent maintainers / contributor picking up the task
+
+## Problem
+
+AutoIntent's main strength is letting a user kick off a full search-space optimization with one call:
+
+```python
+pipeline = Pipeline.from_preset("transformers-heavy")
+pipeline.fit(dataset)
+```
+
+The cost of that convenience is that users — especially those running on a laptop, a single consumer GPU, or a free cloud instance — cannot tell ahead of time whether their hardware can carry the configuration they have just selected.
+
+Concrete failure cases we see today:
+
+- `transformers-heavy` fine-tunes `microsoft/deberta-v3-large` for up to 30 epochs across 40 HPO trials. That needs ~12–18 GB VRAM (full fine-tune, fp32) and many hours of wall time on a single GPU. A user with an 8 GB card finds out by OOM, often several minutes into a run.
+- Swapping `intfloat/multilingual-e5-large-instruct` (2 GB) for `sentence-transformers/all-MiniLM-L6-v2` (90 MB) changes the resource bill by an order of magnitude — but nothing surfaces this difference up front.
+- Disk is a silent failure mode: a search space referencing several large checkpoints can pull >10 GB into the HF cache before any training starts.
+
+The target audience for this feature is users with limited resources who pick a preset, hit `fit()`, and want to know within a second whether they should change something.
+
+## Proposed solution: pre-flight resource advisor
+
+Add a **pre-flight advisor** that, given a parsed search space and a dataset, estimates worst-case disk, RAM, VRAM, and wall-time requirements from public Hugging Face Hub metadata and a small set of formulas, then prints a clear summary with red/yellow/green warnings. By default it is **report-only and never blocks the run**; an opt-in **reduce-to-fit** mode additionally prunes the search space to fit detected hardware.
+
+### Scope
+
+The advisor analyses only the **local, model-bearing** modules whose footprint can be derived from HF Hub metadata. Everything else is either trivial or out of band.
+
+
+| Module category                                                                  | In scope? | Reason                                             |
+| -------------------------------------------------------------------------------- | --------- | -------------------------------------------------- |
+| `SentenceTransformerEmbeddingConfig`                                             | yes       | local transformer, dominant cost on small machines |
+| `VllmEmbeddingConfig`                                                            | yes       | local transformer with extra engine overhead       |
+| `HFModelConfig`-based scorers (`bert`, `lora`, `ptuning`, `dnnc`, cross-encoder) | yes       | the actual heavyweights                            |
+| GCN scorer when configured with a transformer backbone                           | yes       | inherits the backbone cost                         |
+| `OpenaiEmbeddingConfig`                                                          | no        | no local resources to estimate                     |
+| `HashingVectorizerEmbeddingConfig`                                               | no        | trivial cost                                       |
+| `knn`, `mlknn`, `linear`, `sklearn`, `catboost`, `description`                   | no        | negligible next to a fine-tune                     |
+| `decision` and `regex` nodes                                                     | no        | negligible                                         |
+
+
+Rationale: the user's real risk is the heavy transformer-backed modules. A cheap module cannot be the reason a run fails for resource reasons; we don't owe an estimate for it.
+
+### Inputs
+
+- The parsed `OptimizationConfig` (search space, HPO config, embedder/transformer configs).
+- The training `Dataset` (for `dataset_size` and an approximate token-length distribution).
+- Detected local hardware:
+  - Total / available RAM via `psutil`.
+  - Free disk on the AutoIntent / HF cache directory via `shutil.disk_usage`.
+  - Accelerator detection, in priority order:
+    - **CUDA:** per-GPU VRAM and device name via `torch.cuda`.
+    - **MPS (Apple Silicon):** detected via `torch.backends.mps.is_available()`. Apple chips use unified memory, so there is no separate VRAM pool — the "VRAM budget" is a fraction of total system RAM. Default budget = 70 % of total RAM (matching the macOS `PYTORCH_MPS_HIGH_WATERMARK_RATIO` default) with the remainder reserved for the OS and other apps. The fraction is exposed as a knob.
+    - **CPU only:** when neither is available.
+
+### Output
+
+A structured estimate plus a human-readable summary printed to the logger. Example:
+
+```
+Compute feasibility check
+─────────────────────────
+Available : 8 GB VRAM (NVIDIA RTX 3060), 32 GB RAM, 120 GB free disk
+Estimated worst-case requirements for this search space:
+  Disk    : 5.2 GB     (3 unique checkpoints)
+  RAM     : ~4 GB
+  VRAM    : ~14 GB     ⚠  exceeds available
+  Time    : ~6 h       (single-GPU, fp32, rough)
+
+Drivers of cost:
+  scoring.bert   microsoft/deberta-v3-large   full fine-tune × 40 trials × 30 epochs  →  ~14 GB VRAM, ~5 h
+  embedder       intfloat/multilingual-e5-large-instruct                              →  ~2.2 GB VRAM
+
+Suggestions:
+  • Enable mixed precision (fp16/bf16) on the bert scorer
+  • Reduce batch_size from 64 to 16 or 32
+  • Try preset `transformers-light` or `classic-medium`
+
+These numbers are heuristic upper bounds, not measurements.
+```
+
+Numbers are reported with honest precision (one significant figure for time, two for memory) and an explicit "estimate, not measurement" disclaimer.
+
+### Algorithm (proposal, allowed to adjust)
+
+1. **Collect candidates.** Walk the search space; collect every unique `(module_type, model_name, mode)` triple, where `mode ∈ {inference, lora, full-finetune}`. Also collect HPO knobs that drive cost: `n_trials`, `epochs`, `batch_size`, `max_length`, `dtype` (fp16/bf16/fp32).
+2. **Resolve checkpoints.** For each unique `model_name`, query HF Hub for safetensors metadata to read parameter count and weight dtype. Fall back to file-size aggregation if safetensors metadata is missing. Fall back to a "unknown — heuristic only" tag with low-confidence labelling if HF Hub is offline or the repo is private.
+3. **Apply formulas.**
+  - **Disk** = sum over unique checkpoints of total file size, plus a small fixed overhead per checkpoint for tokenizers and config.
+  - **RAM** = max over modules of `params × dtype_bytes + dataset_tokens × 4 bytes`, treated as a loose upper bound for tokenized buffers.
+  - **VRAM per module:**
+    - Inference embedder: `params × dtype_bytes × ~1.3` (small constant for activations).
+    - Full fine-tune (`bert`, GCN backbone, soft-prompt `ptuning`): `params × dtype_bytes × (1 + 1 + 2)` for weights + grads + Adam state, halved when fp16/bf16 mixed precision is configured.
+    - LoRA: inference VRAM + a small adapter constant.
+    - Reranker (cross-encoder, `dnnc`): inference VRAM × small factor for the reranking pass.
+  - **Time per module** = `n_trials × epochs × (dataset_size / batch_size) × per_step_seconds(params, max_length, device_class)`, where `per_step_seconds` is a small static lookup table keyed on coarse device class (`cpu`, `low-gpu`, `mid-gpu`, `high-gpu`, `apple-silicon`) auto-detected from `torch.cuda.get_device_name` or `platform`/`torch.backends.mps`. Total time = sum across modules. MPS time numbers are coarser than CUDA's (one tier for now); we accept that.
+4. **Compare to detected hardware.** Per-dimension status is green / yellow / red against a configurable headroom (defaults: **red** if estimate > 100 % of available, **yellow** if > 70 %). On MPS, "VRAM" and "RAM" estimates draw from the same physical pool; we compare *the larger of the two* against the unified-memory budget rather than each independently.
+5. **Render summary.** Log at INFO. If any dimension is red, emit at WARNING so it shows in non-logging contexts.
+
+### Failure modes
+
+- **HF Hub offline or private repo:** fall back to "unknown model — name-pattern heuristic only", explicit low-confidence label, never raise.
+- **No accelerator (no CUDA and no MPS):** report VRAM as N/A and mark GPU-only modules as "requires GPU" without estimating a (misleading) CPU wall time.
+- **MPS configured but a module is incompatible:** vLLM in particular does not run on MPS. Flag the module as "unsupported on MPS" rather than estimating; do not raise.
+- **MPS with CPU fallback ops:** some PyTorch ops fall back to CPU on MPS, inflating system-RAM usage and wall time beyond the heuristic. Note this in the disclaimer; we don't try to model it.
+- **vLLM configured but not installed:** still estimate (the VRAM accounting is similar), note that the engine itself has additional overhead not captured.
+- **Estimate wildly wrong vs. reality:** always-on disclaimer in the printed summary that these are heuristic upper bounds.
+
+### Reduce-to-fit mode
+
+The feasibility check has two modes sharing the same estimation pipeline:
+
+- **Report mode (default).** Print the summary, return the structured estimate, let the run proceed regardless of severity.
+- **Reduce-to-fit mode (opt-in).** Additionally prune the search space to fit detected hardware before the run starts. Same estimates, same comparisons — just one extra step that produces a reduced search space.
+
+Using the same per-module estimates, the pruner applies three least-destructive steps in order:
+
+1. **Filter discrete-choice hyperparameters.** For lists of cost-driving values (model name, batch size, training epochs), keep only entries whose worst-case estimate fits.
+2. **Cap continuous ranges.** For `{low, high}` ranges of cost-driving parameters, lower the upper bound to the largest fitting value. Ranges of non-cost parameters (learning rate, decision thresholds) are not touched.
+3. **Drop module variants.** If a module entry has any required hyperparameter with no satisfiable value left, drop that module entry from its node's search space.
+
+Guard rails:
+
+- If pruning would leave any node's search space empty, the pruner **raises**. We don't silently produce a non-runnable pipeline, and we don't quietly fall back to report-only — failing loudly is the right contract for a mode whose whole purpose is to make the run feasible. The error message points the user toward a lighter preset.
+- Time is not used as a filter — only memory and disk are. Time is still reported.
+- Headroom thresholds are intentionally generous to avoid over-pruning and are configurable.
+
+Alongside the standard estimate, the caller receives a structured description of what was filtered, capped, and dropped, plus the resulting search space and its recomputed (now green) estimate.
+
+**Drawbacks worth surfacing.**
+
+- **Silent narrowing of intent.** A search space deliberately written to include heavy/light variants for comparison gets halved. The mode is opt-in for this reason.
+- **Over-pruning when our formulas overestimate.** A 30 %-high estimate on a borderline configuration throws away a run that would have succeeded. Generous headroom defaults mitigate; the knob is exposed.
+- **Hard failure when nothing fits.** Raising is intentional — silent degradation to report-only would defeat the mode's purpose — but it is a sharper edge than report mode has.
+- **Pre-trial only.** The rewrite happens before any HPO trial starts. This is fine because the search space is treated as immutable across a study, but worth calling out so nobody tries to make this dynamic later.
+
+## Alternatives considered and rejected
+
+### B. Smoke-test calibration
+
+Run each unique module for one mini-batch / one step before the real fit, measure peak RAM and VRAM with `psutil`, `tracemalloc`, and `torch.cuda.max_memory_allocated`, time the step, and extrapolate to the full search space.
+
+Rejected because:
+
+- It **downloads weights just to estimate** — the disk-headroom check we wanted to provide is defeated by the act of performing it.
+- It can **OOM while predicting OOM**, exactly on the constrained hardware that is the target audience.
+- It adds **seconds to minutes** of wall time before `fit()` does anything, surprising users.
+- It needs per-module "tiny run" hooks; not every scorer has a clean "stop after one step" path.
+- For OpenAI- or vLLM-served embedders, a smoke test costs real money or starts the engine.
+- Still not accurate due to CUDA and CPU cache, memory heating and so on.
+
+### C. Curated benchmark table
+
+Ship a JSON in the package with measured VRAM and per-step time for the bundled-preset checkpoints, broken out by hardware class (cpu / mid-gpu / high-gpu) and mode (inference / lora / full-finetune). Fall back to heuristics for unknown checkpoints.
+
+Rejected because:
+
+- **Maintenance burden:** every new model added to a preset would need entries across the hardware × precision × mode matrix.
+- Numbers **go stale** when `transformers` updates change defaults (attention impl, dtype, gradient checkpointing).
+- It still needs the chosen-solution heuristics as a long-tail fallback — so it adds work on top of Option A without replacing it.
+- **Confident-but-wrong is worse than honest-but-fuzzy.** A table that says "4 GB on 4090" when the user OOMs at 4.5 GB damages trust more than a clearly-labelled range would.
+
+### D. Layered (A by default, opt-in B, embedded table from C, local actuals cache)
+
+Combine all three: ship A as the fast path, allow `calibrate=True` to trigger B for heavy modules only, embed a small table from C for the bundled-preset checkpoints, and write actuals from every real run to a local cache that feeds back into future estimates.
+
+Rejected because:
+
+- **Implementation surface multiplies:** two estimation code paths to keep consistent, a cache schema with versioning and eviction, two failure modes to document.
+- **Discoverability:** users may not learn about `calibrate=True` and the realized value compresses back to roughly Option A anyway.
+- The team's bandwidth doesn't justify the marginal accuracy gain over A for the target audience.
+
+## Comparison
+
+
+| Dimension                        | A (chosen)                     | B (smoke-test)         | C (benchmark table)                | D (layered)                           |
+| -------------------------------- | ------------------------------ | ---------------------- | ---------------------------------- | ------------------------------------- |
+| Wall time at pre-flight          | < 1 s                          | seconds–minutes        | < 1 s                              | < 1 s default, s–min when calibrating |
+| Accuracy on common checkpoints   | medium                         | high                   | high                               | high                                  |
+| Accuracy on custom checkpoints   | medium                         | high                   | medium (fallback)                  | medium–high                           |
+| Time-estimate quality            | low–medium                     | high                   | high                               | high                                  |
+| Disk pre-download required       | no                             | yes                    | no                                 | only when calibrating                 |
+| Risk of OOM during the check     | none                           | real                   | none                               | only when calibrating                 |
+| Network usage                    | 1 cached call per unique model | none beyond normal fit | none                               | combination                           |
+| Implementation effort            | small                          | large                  | medium + ongoing benchmark refresh | large + cache infra                   |
+| Ongoing maintenance              | low (formulas only)            | low                    | high                               | high                                  |
+| Friendly to offline / air-gapped | with fallback                  | yes                    | yes                                | partial                               |
+
+
+The chosen solution accepts a real accuracy gap on time and a moderate accuracy gap on VRAM in exchange for the only profile that fits the target audience's constraints: zero added wall time, zero added downloads, zero added failure modes, and a small one-time implementation cost.
+
+## Out of scope (possible follow-ups)
+
+- Live resource observability during `fit()` (peak RAM / VRAM per trial, abort on overrun).
+- A learned calibration cache from real runs to refine estimates over time.
+

From dceb9854e0dd51171dd42291086266cfb3c269f4 Mon Sep 17 00:00:00 2001
From: voorhs <ilya_alekseev_2016@list.ru>
Date: Fri, 5 Jun 2026 11:05:27 +0300
Subject: [PATCH 02/16] upd tech spec

---
 compute-feasibility-advisor-proposal.md | 37 +++++++++++++++++++------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/compute-feasibility-advisor-proposal.md b/compute-feasibility-advisor-proposal.md
index 2560d1279..9e7833e3d 100644
--- a/compute-feasibility-advisor-proposal.md
+++ b/compute-feasibility-advisor-proposal.md
@@ -3,6 +3,7 @@
 - **Date:** 2026-05-23
 - **Status:** Proposal (pre-implementation)
 - **Audience:** AutoIntent maintainers / contributor picking up the task
+- **Scope of this document:** technical specification — *what* the advisor estimates and the formulas it uses. Architectural and system-design choices (where the advisor lives in the codebase, how it integrates with the optimizer, the public API surface, file/module layout) are deliberately left to the implementer.
 
 ## Problem
 
@@ -38,13 +39,15 @@ The advisor analyses only the **local, model-bearing** modules whose footprint c
 | `VllmEmbeddingConfig`                                                            | yes       | local transformer with extra engine overhead       |
 | `HFModelConfig`-based scorers (`bert`, `lora`, `ptuning`, `dnnc`, cross-encoder) | yes       | the actual heavyweights                            |
 | GCN scorer when configured with a transformer backbone                           | yes       | inherits the backbone cost                         |
+| `LinearScorer` (sklearn `LogisticRegression` / `LogisticRegressionCV`)           | yes       | dominant cost on presets with no transformer fine-tune; the CV path multiplies a single fit by ~30 |
+| `CatBoostScorer`                                                                 | yes       | dominant cost on presets with no transformer fine-tune; high default `iterations` |
 | `OpenaiEmbeddingConfig`                                                          | no        | no local resources to estimate                     |
 | `HashingVectorizerEmbeddingConfig`                                               | no        | trivial cost                                       |
-| `knn`, `mlknn`, `linear`, `sklearn`, `catboost`, `description`                   | no        | negligible next to a fine-tune                     |
+| `knn`, `mlknn`, generic `sklearn` classifiers via `SklearnScorer`, `description` | no        | bounded so far below any in-scope module that they cannot plausibly be the bottleneck |
 | `decision` and `regex` nodes                                                     | no        | negligible                                         |
 
 
-Rationale: the user's real risk is the heavy transformer-backed modules. A cheap module cannot be the reason a run fails for resource reasons; we don't owe an estimate for it.
+Rationale: the user's real risk is whichever module is the actual bottleneck. On heavy presets that is a transformer fine-tune; on light presets it shifts to `linear` (CV-multiplied) or `catboost` (1000 default iterations × dataset shape). Modules left out of scope are ones whose cost is bounded so far below any in-scope module that they cannot plausibly be the reason a run fails.
 
 ### Inputs
 
@@ -88,17 +91,33 @@ Numbers are reported with honest precision (one significant figure for time, two
 
 ### Algorithm (proposal, allowed to adjust)
 
-1. **Collect candidates.** Walk the search space; collect every unique `(module_type, model_name, mode)` triple, where `mode ∈ {inference, lora, full-finetune}`. Also collect HPO knobs that drive cost: `n_trials`, `epochs`, `batch_size`, `max_length`, `dtype` (fp16/bf16/fp32).
-2. **Resolve checkpoints.** For each unique `model_name`, query HF Hub for safetensors metadata to read parameter count and weight dtype. Fall back to file-size aggregation if safetensors metadata is missing. Fall back to a "unknown — heuristic only" tag with low-confidence labelling if HF Hub is offline or the repo is private.
-3. **Apply formulas.**
-  - **Disk** = sum over unique checkpoints of total file size, plus a small fixed overhead per checkpoint for tokenizers and config.
-  - **RAM** = max over modules of `params × dtype_bytes + dataset_tokens × 4 bytes`, treated as a loose upper bound for tokenized buffers.
+1. **Collect candidates.** Walk the search space; collect every unique in-scope module. For transformer-bearing modules the identity is `(module_type, model_name, mode)` with `mode ∈ {inference, lora, full-finetune}`. For `linear` and `catboost` the identity is `(module_type, embedder_name, task_kind)` with `task_kind ∈ {multiclass, multilabel}` — the routing through `LogisticRegressionCV` vs `MultiOutputClassifier`, and CatBoost's per-class trees, both depend on it. Also collect the HPO knobs that drive cost: `n_trials` plus per-module knobs — transformer (`epochs`, `batch_size`, `max_length`, `dtype` ∈ {fp16, bf16, fp32}), `linear` (`cv`, `max_iter`), `catboost` (`iterations`, `depth`, `task_type`, `features_type`).
+2. **Resolve checkpoints.** For each unique `model_name`, query HF Hub for safetensors metadata to read parameter count and weight dtype. Fall back to file-size aggregation if safetensors metadata is missing. Fall back to a "unknown — heuristic only" tag with low-confidence labelling if HF Hub is offline or the repo is private. `LinearScorer` and `CatBoostScorer` have no checkpoint of their own; they reuse the embedder resolved by this step in their formulas (their cost is parameterised by `embedder_dim`, not parameter count).
+3. **Apply formulas.** All values are honest upper bounds; convergence and early stopping often terminate well below them.
+  - **Disk** = sum over unique downloadable checkpoints of total file size, plus a small fixed overhead per checkpoint for tokenizers and config. `LinearScorer` and `CatBoostScorer` contribute zero (they consume embedder output that is already accounted for upstream).
+  - **RAM per module:**
+    - Transformer modules (any mode): `params × dtype_bytes + dataset_tokens × 4 bytes`, treated as a loose upper bound for tokenized buffers.
+    - `LinearScorer`: `8 × n_samples × embedder_dim` (float64 data matrix — the dominant term) `+ 8 × n_classes × embedder_dim` (coefficients) `+ ~10 × 8 × embedder_dim` (L-BFGS history).
+    - `CatBoostScorer`: `4 × n_samples × n_features` (data, float32 internally) `+ 4 × n_features × n_bins` (histograms; default `n_bins = 254`) `+ iterations × 2^depth × ~32 bytes` (tree storage). For `features_type ∈ {embedding, both}`, `n_features = embedder_dim`. For `features_type = text`, `n_features` is the BoW vocab discovered at fit; bound with a coarse default (e.g. 50 000) and tag the estimate low-confidence.
+    - For `linear` and `catboost`, `embedder_dim` is taken from the largest embedder in the same node group — same worst-case stance as the rest of the estimate.
   - **VRAM per module:**
     - Inference embedder: `params × dtype_bytes × ~1.3` (small constant for activations).
     - Full fine-tune (`bert`, GCN backbone, soft-prompt `ptuning`): `params × dtype_bytes × (1 + 1 + 2)` for weights + grads + Adam state, halved when fp16/bf16 mixed precision is configured.
     - LoRA: inference VRAM + a small adapter constant.
     - Reranker (cross-encoder, `dnnc`): inference VRAM × small factor for the reranking pass.
-  - **Time per module** = `n_trials × epochs × (dataset_size / batch_size) × per_step_seconds(params, max_length, device_class)`, where `per_step_seconds` is a small static lookup table keyed on coarse device class (`cpu`, `low-gpu`, `mid-gpu`, `high-gpu`, `apple-silicon`) auto-detected from `torch.cuda.get_device_name` or `platform`/`torch.backends.mps`. Total time = sum across modules. MPS time numbers are coarser than CUDA's (one tier for now); we accept that.
+    - `LinearScorer`: N/A (sklearn is CPU-only).
+    - `CatBoostScorer`: 0 by default; if `task_type="GPU"` is configured, the RAM formula above lives on device instead.
+  - **Time per module:**
+    - Transformer modules: `n_trials × epochs × (dataset_size / batch_size) × per_step_seconds(params, max_length, device_class)`, where `per_step_seconds` is a small static lookup keyed on coarse device class (`cpu`, `low-gpu`, `mid-gpu`, `high-gpu`, `apple-silicon`) auto-detected from `torch.cuda.get_device_name` or `platform`/`torch.backends.mps`.
+    - `LinearScorer`: `n_trials × C_cpu × n_samples × embedder_dim × max_iter × cv_multiplier × class_multiplier`, where:
+      - `C_cpu ≈ 1e-8 s` per `(sample × feature × iteration)` on a single modern CPU core.
+      - `cv_multiplier = Cs × cv + 1 ≈ 31` for the multiclass path (`LogisticRegressionCV` with default `Cs = 10`, repo default `cv = 3`, plus one final refit). `cv_multiplier = 1` for the multilabel path (no inner CV).
+      - `class_multiplier = n_classes` for the multilabel path (`MultiOutputClassifier` fits one binary LogReg per class); `class_multiplier = 1` otherwise.
+    - `CatBoostScorer`: `n_trials × iterations × C_device × n_samples × n_features × depth × class_multiplier`, where:
+      - `C_device ≈ 1e-9 s` on CPU, ~5–20× faster on GPU. Resolve `C_device` via the same `device_class` lookup as the transformer time formula.
+      - `class_multiplier = n_classes` for both the multiclass `MultiClass` loss (per-class trees per iteration) and the multilabel routing (one CatBoost per class).
+      - Early stopping is not modelled; `iterations` is treated as the upper bound.
+  - Total time = sum across modules. MPS time numbers are coarser than CUDA's (one tier for now); we accept that.
 4. **Compare to detected hardware.** Per-dimension status is green / yellow / red against a configurable headroom (defaults: **red** if estimate > 100 % of available, **yellow** if > 70 %). On MPS, "VRAM" and "RAM" estimates draw from the same physical pool; we compare *the larger of the two* against the unified-memory budget rather than each independently.
 5. **Render summary.** Log at INFO. If any dimension is red, emit at WARNING so it shows in non-logging contexts.
 
@@ -120,7 +139,7 @@ The feasibility check has two modes sharing the same estimation pipeline:
 
 Using the same per-module estimates, the pruner applies three least-destructive steps in order:
 
-1. **Filter discrete-choice hyperparameters.** For lists of cost-driving values (model name, batch size, training epochs), keep only entries whose worst-case estimate fits.
+1. **Filter discrete-choice hyperparameters.** For lists of cost-driving values (model name, batch size, training epochs, CatBoost `iterations` / `depth`, sklearn `cv`), keep only entries whose worst-case estimate fits.
 2. **Cap continuous ranges.** For `{low, high}` ranges of cost-driving parameters, lower the upper bound to the largest fitting value. Ranges of non-cost parameters (learning rate, decision thresholds) are not touched.
 3. **Drop module variants.** If a module entry has any required hyperparameter with no satisfiable value left, drop that module entry from its node's search space.
 

From 94b4e121a7cad8527ceab66984d2e23086799775 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Wed, 10 Jun 2026 02:16:01 +0300
Subject: [PATCH 03/16] add feasibility advisor: CLI script, package, tests;
 expand proposal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- proposal: introduce 3-phase framing (resource/data/config), add
  resource-phase refinements (warm cache, n_jobs × VRAM, refit_after,
  Hub reachability, CatBoost GPU sanity), data-quality phase (token
  truncation, split readiness, partial descriptions, embedder dim),
  config sanity phase, updated example output, CLI surface, out-of-
  scope deferrals
- _advisor package: hardware detection (CUDA/MPS/CPU with broken-CUDA
  fallback), HF Hub metadata + warm-cache probe + offline heuristics,
  three-phase run_preflight returning structured PreflightReport,
  text + JSON renderers
- autointent-advisor CLI: inspect <preset|config> and recommend
  subcommands; placeholder dataset stats when no --dataset given
- 88 offline tests covering hardware fallbacks, every bundled preset,
  severity routing, report serialization, name-pattern heuristics,
  AMP invariant, dump_modules / refit_after, CLI flows

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 compute-feasibility-advisor-proposal.md   |  71 +++-
 pyproject.toml                            |   1 +
 src/autointent/_advisor/__init__.py       |  23 ++
 src/autointent/_advisor/_cli.py           | 243 ++++++++++++++
 src/autointent/_advisor/_estimates.py     | 382 ++++++++++++++++++++++
 src/autointent/_advisor/_hardware.py      | 160 +++++++++
 src/autointent/_advisor/_hub.py           | 183 +++++++++++
 src/autointent/_advisor/_render.py        | 104 ++++++
 src/autointent/_advisor/_report.py        | 113 +++++++
 tests/advisor/__init__.py                 |   0
 tests/advisor/test_estimates_and_cli.py   | 198 +++++++++++
 tests/advisor/test_estimates_internals.py | 319 ++++++++++++++++++
 tests/advisor/test_hardware_detection.py  |  72 ++++
 tests/advisor/test_hub_heuristics.py      |  81 +++++
 tests/advisor/test_render.py              | 151 +++++++++
 tests/advisor/test_report.py              |  85 +++++
 16 files changed, 2181 insertions(+), 5 deletions(-)
 create mode 100644 src/autointent/_advisor/__init__.py
 create mode 100644 src/autointent/_advisor/_cli.py
 create mode 100644 src/autointent/_advisor/_estimates.py
 create mode 100644 src/autointent/_advisor/_hardware.py
 create mode 100644 src/autointent/_advisor/_hub.py
 create mode 100644 src/autointent/_advisor/_render.py
 create mode 100644 src/autointent/_advisor/_report.py
 create mode 100644 tests/advisor/__init__.py
 create mode 100644 tests/advisor/test_estimates_and_cli.py
 create mode 100644 tests/advisor/test_estimates_internals.py
 create mode 100644 tests/advisor/test_hardware_detection.py
 create mode 100644 tests/advisor/test_hub_heuristics.py
 create mode 100644 tests/advisor/test_render.py
 create mode 100644 tests/advisor/test_report.py

diff --git a/compute-feasibility-advisor-proposal.md b/compute-feasibility-advisor-proposal.md
index 9e7833e3d..7ebf70bd9 100644
--- a/compute-feasibility-advisor-proposal.md
+++ b/compute-feasibility-advisor-proposal.md
@@ -49,6 +49,16 @@ The advisor analyses only the **local, model-bearing** modules whose footprint c
 
 Rationale: the user's real risk is whichever module is the actual bottleneck. On heavy presets that is a transformer fine-tune; on light presets it shifts to `linear` (CV-multiplied) or `catboost` (1000 default iterations × dataset shape). Modules left out of scope are ones whose cost is bounded so far below any in-scope module that they cannot plausibly be the reason a run fails.
 
+### Phases
+
+The advisor is one entry point, but internally splits work into three phases that share a single `PreflightReport` object. The split is internal organization — all three run at the same hook point (after `validate_modules`, before `_fit(context)`) and the user sees one summary. Separating them keeps each phase's inputs, formulas, and failure modes scoped:
+
+- **Resource phase.** Disk / RAM / VRAM / wall-time estimates and comparisons against detected hardware. Most of the formulas in this document live here. This is the only phase consumed by the reduce-to-fit pruner.
+- **Data quality phase.** Findings derived from the dataset jointly with the active search space — token-length truncation, split readiness (auto-invokes the existing `check_split_readiness` utility rather than re-implementing it), partial intent descriptions paired with the `description` scorer, embedder/scorer dimension consistency. Reports red/yellow lines but never prunes the search space; the user fixes the dataset or the config.
+- **Configuration sanity phase.** Joint checks across dataset + search-space + hardware that don't slot cleanly into the other two — e.g., `hpo_config.n_jobs > 1` × per-trial VRAM contention, CatBoost `task_type="GPU"` with no CUDA. Pydantic schema validation already runs upstream on `OptimizationConfig`; this phase only adds checks that need joint inspection.
+
+The advisor consumes `validate_modules`'s *post-filter* view of `self.nodes` — it does not duplicate that mutating filter.
+
 ### Inputs
 
 - The parsed `OptimizationConfig` (search space, HPO config, embedder/transformer configs).
@@ -68,12 +78,19 @@ A structured estimate plus a human-readable summary printed to the logger. Examp
 ```
 Compute feasibility check
 ─────────────────────────
-Available : 8 GB VRAM (NVIDIA RTX 3060), 32 GB RAM, 120 GB free disk
-Estimated worst-case requirements for this search space:
-  Disk    : 5.2 GB     (3 unique checkpoints)
+Resource:
+  Available : 8 GB VRAM (NVIDIA RTX 3060), 32 GB RAM, 120 GB free disk
+  Disk    : 5.2 GB to download, 1.1 GB already cached  (3 unique checkpoints)
   RAM     : ~4 GB
-  VRAM    : ~14 GB     ⚠  exceeds available
-  Time    : ~6 h       (single-GPU, fp32, rough)
+  VRAM    : ~14 GB × 2 parallel trials (n_jobs=2) ⚠  exceeds available
+  Time    : ~6 h  (+~12 min for refit_after)            (single-GPU, fp32, rough)
+
+Data:
+  Train tokens p95 : 612 (exceeds bert.max_length=512) ⚠  ~7% truncated
+  Split readiness  : 2 classes have <3 samples — LogisticRegressionCV cv=3 will fail ✗
+
+Config:
+  CatBoost task_type=GPU but no CUDA detected — will fall back to CPU ⚠
 
 Drivers of cost:
   scoring.bert   microsoft/deberta-v3-large   full fine-tune × 40 trials × 30 epochs  →  ~14 GB VRAM, ~5 h
@@ -82,6 +99,7 @@ Drivers of cost:
 Suggestions:
   • Enable mixed precision (fp16/bf16) on the bert scorer
   • Reduce batch_size from 64 to 16 or 32
+  • Set hpo_config.n_jobs=1 — parallel trials are doubling VRAM demand
   • Try preset `transformers-light` or `classic-medium`
 
 These numbers are heuristic upper bounds, not measurements.
@@ -121,6 +139,34 @@ Numbers are reported with honest precision (one significant figure for time, two
 4. **Compare to detected hardware.** Per-dimension status is green / yellow / red against a configurable headroom (defaults: **red** if estimate > 100 % of available, **yellow** if > 70 %). On MPS, "VRAM" and "RAM" estimates draw from the same physical pool; we compare *the larger of the two* against the unified-memory budget rather than each independently.
 5. **Render summary.** Log at INFO. If any dimension is red, emit at WARNING so it shows in non-logging contexts.
 
+#### Resource-phase refinements
+
+These adjust the formulas above for situations that look fine in single-trial isolation but blow up in practice:
+
+- **Cold-vs-warm HF cache (Tier 1).** Before reporting disk, probe each unique `model_name` against the local HF cache via `huggingface_hub.try_to_load_from_cache` / `scan_cache_dir`, keyed off `HF_HOME`. Split the disk line into `to_download` vs `already_cached`. Treat a repo as cached only if the weight shard (`model.safetensors` or equivalent) is present — not just config/tokenizer files. Without this, a repeated run on the same machine alarms the user about gigabytes they already have.
+- **Concurrent-trial × per-trial VRAM (Tier 1).** Multiply the per-trial VRAM estimate by `hpo_config.n_jobs` when `n_jobs > 1` and the active accelerator is GPU. Same for the `dump_modules=True` path on disk: each trial writes module weights to the dump dir, so multiply per-module dump-disk by `n_trials`. vLLM is process-isolated and its contention model differs; note this in the disclaimer.
+- **`refit_after=True` time delta (Tier 2).** When `Pipeline.fit(refit_after=True)`, add one full-data training pass per node to the time estimate. Small term but easy to forget; users running close to their time budget care about it.
+- **HF Hub reachability probe (Tier 2).** One up-front `HfApi().whoami()` (or unauthenticated `HEAD` to `huggingface.co`) at the start of the phase. On failure, consistently downgrade *all* model entries to the "unknown — heuristic only" path instead of timing out per-model 10× on a 10-model search space.
+- **CatBoost `task_type="GPU"` sanity (Tier 2).** When CatBoost is in the search space with `task_type="GPU"` but `torch.cuda.is_available()` is false, tag yellow — CatBoost silently falls back to CPU and the user otherwise sees CPU speeds with no warning.
+
+### Data quality phase
+
+The resource phase predicts whether the run *fits*. The data quality phase predicts whether the run *produces a meaningful result*. Both are caught at the same hook point because both have the same failure mode from the user's perspective: hours of compute followed by a cryptic error or a silently degraded model.
+
+- **Token-length truncation (Tier 1).** Sample ~1000 utterances from the train split, tokenize against each unique transformer's tokenizer, compute `p95_tokens` and `% truncated` against the module's `max_length`. Yellow when >1% truncated; red when >10%. Reuse the tokenizer the resource phase already loaded for parameter-count resolution — don't double-fetch. The existing pipeline silently truncates (sentence-transformers and the HF Trainer both default to `truncation=True`); there is no warning anywhere today.
+- **Auto-invoke `check_split_readiness` (Tier 1).** Call the existing utility at `context/data_handler/_readiness_util.py:44–109` with the active `data_config` and surface its `SplitReadinessResult` — it already returns `underpopulated_classes`, `ready`, and a `reason` string, but is not called anywhere from `Pipeline.fit()` today. When `LinearScorer` with CV is in the search space and any class has `n < cv`, name the module by name in the red line ("`LogisticRegressionCV` cv=3 will fail: classes [X, Y] have <3 samples") rather than emitting a generic split-readiness message.
+- **Partial intent descriptions × `description` scorer (Tier 1).** The dataset constructor already warns once at import when *some* but not all intents have descriptions (`_dataset/_dataset.py:199–207`). The advisor escalates this to red when the `description` scorer is also present in the active search space — otherwise the run will produce NaN embeddings for the missing intents. Action message: "fill in N missing descriptions", not "drop the scorer".
+- **Embedder ↔ scorer dimension consistency (Tier 2).** For `LinearScorer` / `CatBoostScorer` with `features_type="both"`, verify the embedder reachable from the same node group exposes a stable, expected dimension. Cross-node walk; surface as yellow when the resolved dimension cannot be confirmed pre-flight.
+
+### Configuration sanity phase
+
+Pydantic schema validation on `OptimizationConfig` runs upstream at config-load time; this phase only adds checks that require *joint* inspection of dataset + search-space + hardware. With Tier 1 + Tier 2 in scope today, this phase holds two items:
+
+- The `n_jobs × VRAM` callout, surfaced jointly with the resource phase (single line in the rendered output).
+- The CatBoost `task_type="GPU"` without CUDA check, same.
+
+Both could live entirely in the resource phase; they get their own phase because future additions — joint scorer↔decision shape checks, OOS-support mismatches detected up front rather than at module instantiation, embedder-dimension mismatches — slot here naturally. Keep the phase scaffold even if it is currently thin.
+
 ### Failure modes
 
 - **HF Hub offline or private repo:** fall back to "unknown model — name-pattern heuristic only", explicit low-confidence label, never raise.
@@ -137,6 +183,8 @@ The feasibility check has two modes sharing the same estimation pipeline:
 - **Report mode (default).** Print the summary, return the structured estimate, let the run proceed regardless of severity.
 - **Reduce-to-fit mode (opt-in).** Additionally prune the search space to fit detected hardware before the run starts. Same estimates, same comparisons — just one extra step that produces a reduced search space.
 
+Reduce-to-fit consumes only the **resource phase** output. Data-quality and config-sanity findings are reported but never trigger pruning — they require user action (fix the dataset, change a config flag), not search-space narrowing.
+
 Using the same per-module estimates, the pruner applies three least-destructive steps in order:
 
 1. **Filter discrete-choice hyperparameters.** For lists of cost-driving values (model name, batch size, training epochs, CatBoost `iterations` / `depth`, sklearn `cv`), keep only entries whose worst-case estimate fits.
@@ -158,6 +206,15 @@ Alongside the standard estimate, the caller receives a structured description of
 - **Hard failure when nothing fits.** Raising is intentional — silent degradation to report-only would defeat the mode's purpose — but it is a sharper edge than report mode has.
 - **Pre-trial only.** The rewrite happens before any HPO trial starts. This is fine because the search space is treated as immutable across a study, but worth calling out so nobody tries to make this dynamic later.
 
+### CLI surface
+
+The advisor is also exposed as a console script (`autointent-advisor`) so users can answer "what will this cost?" and "what should I run?" without writing Python. Two subcommands:
+
+- **`autointent-advisor inspect <preset-name | path/to/config.yaml>`.** Resolves the preset (or a user-supplied `OptimizationConfig`), detects local hardware, runs the same three-phase advisor that `Pipeline.fit()` runs, and prints the same report. Accepts `--dataset` for a real dataset, or `--n-samples / --n-classes / --avg-tokens` placeholders when the dataset is not yet built — so the script is useful before any training data exists. `--json` emits the structured `PreflightReport` for scripting.
+- **`autointent-advisor recommend [--n-samples ... | --dataset ...] [--budget-time 12h] [--budget-vram-gb 8]`.** Detects local hardware (with manual overrides applied), iterates over the bundled presets in `_presets/`, and tags each as `feasible` / `feasible-with-reduce` / `infeasible`. Ranks feasible presets by quality tier (`heavy > medium > light`) then estimated wall-time; picks the top one as the recommendation. For the heaviest infeasible preset, surfaces the single most-impactful knob change that would make it fit (e.g., "`transformers-heavy` would fit if `batch_size` ≤ 16 and `dtype=fp16`"), reusing the reduce-to-fit pruner's per-knob delta info.
+
+**Constraints (both subcommands).** No model downloads — only HF Hub metadata endpoints (`HfApi().model_info`); never `from_pretrained`. Offline-safe — on Hub unreachability, fall back to the same "heuristic only" path and mark the report low-confidence; do not raise. Hardware-detection failures (broken CUDA install where `torch.cuda.mem_get_info()` raises) fall back to CPU detection and tag the report rather than crashing.
+
 ## Alternatives considered and rejected
 
 ### B. Smoke-test calibration
@@ -217,4 +274,8 @@ The chosen solution accepts a real accuracy gap on time and a moderate accuracy
 
 - Live resource observability during `fit()` (peak RAM / VRAM per trial, abort on overrun).
 - A learned calibration cache from real runs to refine estimates over time.
+- **Determinism / `cudnn.deterministic` check.** Belongs in seed-setting code (`set_seed` utility, `Pipeline.__init__`), not in a feasibility advisor — reproducibility is not a hardware-budget question.
+- **OpenAI / Generator token-cost ($) estimation.** Real value, but pricing tables age badly, the `StructuredOutputCache` hit rate is unknowable upfront, and the API-paying audience overlaps poorly with this advisor's stated audience (resource-constrained local users). Push to a separate `cost_estimator` tool.
+- **Predictive CO₂ / emissions.** `_callbacks/emissions_tracker.py` already does this retrospectively, accurately. A predictive version multiplies our (loose) time estimate by a regional kWh/CO₂ factor — two sources of imprecision compounded. The retrospective number is the trustworthy one.
+- **vLLM startup compile time.** Minutes of overhead before any work, but vLLM is unsupported on MPS, isn't the dominant cost on CUDA once running, and modelling it needs a startup-time lookup table. Note once in the disclaimer; do not model.
 
diff --git a/pyproject.toml b/pyproject.toml
index 202a8cb3f..b47993faf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -141,6 +141,7 @@ Documentation = "https://deeppavlov.github.io/AutoIntent/"
 [project.scripts]
 "basic-aug" = "autointent.generation.utterances.basic.cli:main"
 "evolution-aug" = "autointent.generation.utterances.evolution.cli:main"
+"autointent-advisor" = "autointent._advisor._cli:main"
 
 [build-system]
 requires = ["uv_build>=0.8.7,<0.9.0"]
diff --git a/src/autointent/_advisor/__init__.py b/src/autointent/_advisor/__init__.py
new file mode 100644
index 000000000..5f29b028e
--- /dev/null
+++ b/src/autointent/_advisor/__init__.py
@@ -0,0 +1,23 @@
+"""Pre-flight compute feasibility advisor.
+
+Exposes a small surface used by both ``Pipeline.fit()`` (future integration) and
+the ``autointent-advisor`` CLI script. See ``compute-feasibility-advisor-proposal.md``
+at the repo root for the design document.
+"""
+
+from __future__ import annotations
+
+from ._hardware import HardwareProfile, detect_hardware
+from ._report import DatasetStats, Finding, PreflightReport, ResourceEstimate, Severity
+from ._estimates import run_preflight
+
+__all__ = [
+    "DatasetStats",
+    "Finding",
+    "HardwareProfile",
+    "PreflightReport",
+    "ResourceEstimate",
+    "Severity",
+    "detect_hardware",
+    "run_preflight",
+]
diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py
new file mode 100644
index 000000000..4e7eae000
--- /dev/null
+++ b/src/autointent/_advisor/_cli.py
@@ -0,0 +1,243 @@
+"""Console-script entry point for the pre-flight advisor.
+
+Two subcommands:
+
+* ``inspect`` — show what a given preset / config will cost on this machine.
+* ``recommend`` — pick the best-fitting bundled preset for this machine.
+
+Both subcommands accept either a real ``--dataset`` (path to load with
+``Dataset.from_*`` constructors) or ``--n-samples / --n-classes / --avg-tokens``
+placeholders so the script is useful before the user has built a dataset.
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from ._estimates import run_preflight
+from ._hardware import detect_hardware
+from ._render import render_json, render_recommendation, render_text
+from ._report import DatasetStats, PreflightReport
+
+logger = logging.getLogger("autointent.advisor")
+
+BUNDLED_PRESETS = [
+    "transformers-heavy",
+    "transformers-light",
+    "transformers-no-hpo",
+    "nn-heavy",
+    "nn-medium",
+    "classic-heavy",
+    "classic-medium",
+    "classic-light",
+    "zero-shot-encoders",
+    "zero-shot-llm",
+]
+
+# rough quality tiering used by `recommend`
+_QUALITY_TIER = {
+    "transformers-heavy": 5,
+    "nn-heavy": 4,
+    "transformers-light": 4,
+    "nn-medium": 3,
+    "classic-heavy": 3,
+    "transformers-no-hpo": 3,
+    "classic-medium": 2,
+    "classic-light": 1,
+    "zero-shot-encoders": 2,
+    "zero-shot-llm": 4,
+}
+
+
+def _load_config(target: str) -> tuple[dict[str, Any], str]:
+    """Return (config_dict, friendly_name) for either a preset or a path."""
+    path = Path(target)
+    if path.is_file():
+        with path.open(encoding="utf-8") as f:
+            return yaml.safe_load(f), path.stem
+    # treat as a bundled preset name
+    from autointent.utils import load_preset
+
+    return load_preset(target), target  # type: ignore[arg-type]
+
+
+def _stats_from_args(args: argparse.Namespace) -> DatasetStats:
+    if args.dataset:
+        return _stats_from_dataset(args.dataset, multilabel=args.task == "multilabel")
+    return DatasetStats.placeholder(
+        n_samples=args.n_samples,
+        n_classes=args.n_classes,
+        avg_tokens=args.avg_tokens,
+        multilabel=args.task == "multilabel",
+    )
+
+
+def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats:
+    """Best-effort: load a dataset from disk via the existing Dataset constructor."""
+    try:
+        from autointent import Dataset
+    except ImportError:
+        logger.warning("autointent.Dataset unavailable; falling back to placeholders.")
+        return DatasetStats.placeholder(multilabel=multilabel)
+
+    try:
+        ds = Dataset.from_json(path) if path.endswith(".json") else Dataset.from_hub(path)
+    except Exception as e:  # noqa: BLE001
+        logger.warning("Failed to load dataset %s: %s", path, e)
+        return DatasetStats.placeholder(multilabel=multilabel)
+
+    train = ds.get("train") or next(iter(ds.values()), None)
+    if train is None:
+        return DatasetStats.placeholder(multilabel=multilabel)
+
+    utt_col = getattr(ds, "utterance_feature", "utterance")
+    sample = train[:1000] if len(train) > 1000 else train[:]
+    lengths = [len(str(s).split()) for s in sample.get(utt_col, [])]
+    avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32
+    p95 = sorted(lengths)[int(len(lengths) * 0.95)] if lengths else avg_tokens * 2
+
+    return DatasetStats(
+        n_samples=len(train),
+        n_classes=getattr(ds, "n_classes", 0) or 0,
+        avg_tokens=avg_tokens,
+        p95_tokens=p95,
+        multilabel=getattr(ds, "multilabel", multilabel),
+        has_descriptions=getattr(ds, "has_descriptions", None),
+        source=f"dataset:{path}",
+    )
+
+
+def _add_common_dataset_args(p: argparse.ArgumentParser) -> None:
+    p.add_argument("--dataset", help="Path or hub id of a dataset; overrides placeholders.")
+    p.add_argument("--n-samples", type=int, default=1_000, help="Placeholder training set size.")
+    p.add_argument("--n-classes", type=int, default=10, help="Placeholder class count.")
+    p.add_argument("--avg-tokens", type=int, default=32, help="Placeholder average token length.")
+    p.add_argument(
+        "--task",
+        choices=("multiclass", "multilabel"),
+        default="multiclass",
+        help="Placeholder task type when --dataset isn't given.",
+    )
+
+
+def cmd_inspect(args: argparse.Namespace) -> int:
+    config, name = _load_config(args.target)
+    hardware = detect_hardware(
+        vram_budget_gb=args.budget_vram_gb,
+    )
+    stats = _stats_from_args(args)
+    report = run_preflight(config, stats, hardware, preset_name=name)
+    if args.json:
+        sys.stdout.write(render_json(report))
+        sys.stdout.write("\n")
+    else:
+        sys.stdout.write(render_text(report))
+        sys.stdout.write("\n")
+    return 0 if report.is_feasible else 1
+
+
+def cmd_recommend(args: argparse.Namespace) -> int:
+    hardware = detect_hardware(vram_budget_gb=args.budget_vram_gb)
+    stats = _stats_from_args(args)
+
+    results: list[tuple[str, PreflightReport]] = []
+    from autointent.utils import load_preset
+
+    for preset in BUNDLED_PRESETS:
+        try:
+            cfg = load_preset(preset)  # type: ignore[arg-type]
+        except Exception as e:  # noqa: BLE001
+            logger.debug("Skipping preset %s: %s", preset, e)
+            continue
+        report = run_preflight(cfg, stats, hardware, preset_name=preset)
+        if args.budget_time_h is not None and report.resource.time_hours > args.budget_time_h:
+            report.add(
+                "resource",
+                report.worst_severity if report.worst_severity.value == "red" else report.worst_severity,  # noqa: PLW0125 - explicit
+                f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {args.budget_time_h} h.",
+            )
+        results.append((preset, report))
+
+    feasible = [(name, r) for name, r in results if r.is_feasible]
+    feasible.sort(
+        key=lambda pair: (-_QUALITY_TIER.get(pair[0], 0), pair[1].resource.time_hours, pair[0])
+    )
+    chosen = feasible[0][0] if feasible else None
+
+    if args.json:
+        import json
+
+        out = {
+            "chosen": chosen,
+            "results": [
+                {"preset": name, "report": r.to_dict()} for name, r in results
+            ],
+        }
+        sys.stdout.write(json.dumps(out, indent=2, default=str))
+        sys.stdout.write("\n")
+    else:
+        sys.stdout.write(render_recommendation(results, chosen))
+        sys.stdout.write("\n")
+        if chosen:
+            sys.stdout.write("\n")
+            sys.stdout.write(render_text(dict(results)[chosen]))
+            sys.stdout.write("\n")
+    return 0 if chosen else 1
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="autointent-advisor",
+        description="Pre-flight feasibility advisor for AutoIntent search-space optimization.",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.")
+
+    sub = parser.add_subparsers(dest="cmd", required=True)
+
+    p_inspect = sub.add_parser(
+        "inspect",
+        help="Inspect a preset or OptimizationConfig and print a feasibility report.",
+    )
+    p_inspect.add_argument("target", help="Preset name (e.g. transformers-light) or path to a YAML config.")
+    p_inspect.add_argument("--json", action="store_true", help="Emit a structured JSON report.")
+    p_inspect.add_argument(
+        "--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget."
+    )
+    _add_common_dataset_args(p_inspect)
+    p_inspect.set_defaults(func=cmd_inspect)
+
+    p_rec = sub.add_parser(
+        "recommend",
+        help="Detect hardware and recommend the best-fitting bundled preset.",
+    )
+    p_rec.add_argument("--json", action="store_true", help="Emit a structured JSON report.")
+    p_rec.add_argument(
+        "--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget."
+    )
+    p_rec.add_argument(
+        "--budget-time-h", type=float, default=None, help="Optional wall-time ceiling in hours."
+    )
+    _add_common_dataset_args(p_rec)
+    p_rec.set_defaults(func=cmd_recommend)
+
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.WARNING,
+        format="%(levelname)s %(name)s: %(message)s",
+    )
+    return args.func(args)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py
new file mode 100644
index 000000000..f60f940a6
--- /dev/null
+++ b/src/autointent/_advisor/_estimates.py
@@ -0,0 +1,382 @@
+"""Resource-phase estimation: walk the search space and aggregate cost.
+
+Implements an honest worst-case for the modules the proposal lists as
+in-scope. Formulas are intentionally coarse — the advisor's contract is
+"heuristic upper bound, not measurement". Time and VRAM are the noisiest;
+treat them as ballparks, not budgets.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Iterable
+
+from ._hardware import HardwareProfile
+from ._hub import ModelMeta, hub_reachable, resolve_model
+from ._report import DatasetStats, PreflightReport, ResourceEstimate, Severity
+
+logger = logging.getLogger(__name__)
+
+# yellow / red thresholds as fraction of available budget
+_YELLOW = 0.7
+_RED = 1.0
+
+# rough per-step seconds, keyed on device class. Scaled by params_millions / 100.
+_PER_STEP_BASELINE_S = {
+    "cpu": 0.5,
+    "low-gpu": 0.04,
+    "mid-gpu": 0.02,
+    "high-gpu": 0.01,
+    "apple-silicon": 0.08,
+}
+
+TRANSFORMER_SCORER_MODULES = {"bert", "lora", "ptuning", "dnnc"}
+
+
+def _extract_model_names(module_entry: dict[str, Any]) -> list[str]:
+    """Pull model name(s) from a search-space module entry."""
+    candidates: list[str] = []
+    cfg = module_entry.get("classification_model_config")
+    if isinstance(cfg, list):
+        for c in cfg:
+            if isinstance(c, dict) and c.get("model_name"):
+                candidates.append(c["model_name"])
+    elif isinstance(cfg, dict) and cfg.get("model_name"):
+        candidates.append(cfg["model_name"])
+    embedder_cfg = module_entry.get("embedder_config")
+    if isinstance(embedder_cfg, list):
+        for c in embedder_cfg:
+            if isinstance(c, dict) and c.get("model_name"):
+                candidates.append(c["model_name"])
+    elif isinstance(embedder_cfg, dict) and embedder_cfg.get("model_name"):
+        candidates.append(embedder_cfg["model_name"])
+    return candidates
+
+
+def _max_int(value: Any, default: int) -> int:
+    if value is None:
+        return default
+    if isinstance(value, list) and value:
+        return max(int(x) for x in value)
+    if isinstance(value, dict):
+        return int(value.get("high", default))
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return default
+
+
+def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dict[str, Any]]]:
+    """Yield (node_type, module_entry) pairs."""
+    for node in search_space or []:
+        node_type = node.get("node_type", "?")
+        for entry in node.get("search_space", []) or []:
+            yield node_type, entry
+
+
+def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) -> float:
+    """VRAM in GB for one trial of a transformer-based module.
+
+    Conservative AMP accounting (the proposal flags the prior naive halving
+    as too generous; keep optimizer state at fp32 even in AMP).
+    """
+    weights_gb = meta.weights_gb
+    if mode == "inference":
+        return weights_gb * 1.3
+    if mode == "lora":
+        return weights_gb * 1.3 + 0.5
+    if mode == "reranker":
+        return weights_gb * 1.5
+    # full fine-tune (bert, ptuning, gcn-with-backbone)
+    if mixed_precision:
+        # fp16 weights+grads + fp32 master+adam moments
+        return (weights_gb * 0.5) * 2 + weights_gb * 1 + weights_gb * 2
+    return weights_gb * (1 + 1 + 2)
+
+
+def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float:
+    """RAM in GB. Loose upper bound."""
+    return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / (1024**3)
+
+
+def _time_for_transformer(
+    *,
+    meta: ModelMeta,
+    n_trials: int,
+    epochs: int,
+    batch_size: int,
+    n_samples: int,
+    device_class: str,
+) -> float:
+    per_step = _PER_STEP_BASELINE_S[device_class] * (meta.params_millions / 100.0)
+    steps = max(1, (n_samples // max(1, batch_size))) * epochs
+    return (n_trials * steps * per_step) / 3600.0
+
+
+def _classify_severity(estimate: float, budget: float) -> Severity:
+    if budget <= 0:
+        return Severity.YELLOW
+    ratio = estimate / budget
+    if ratio >= _RED:
+        return Severity.RED
+    if ratio >= _YELLOW:
+        return Severity.YELLOW
+    return Severity.GREEN
+
+
+def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
+    config: dict[str, Any],
+    stats: DatasetStats,
+    hardware: HardwareProfile,
+    report: PreflightReport,
+) -> None:
+    hpo = config.get("hpo_config") or {}
+    n_trials = int(hpo.get("n_trials", 1))
+    n_jobs = int(hpo.get("n_jobs", 1))
+    refit_after = bool(config.get("refit_after", False))
+    dump_modules = bool(config.get("dump_modules", False))
+
+    if not hub_reachable():
+        report.low_confidence = True
+        report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.")
+
+    seen_models: dict[str, ModelMeta] = {}
+    estimate = ResourceEstimate(parallel_factor=max(1, n_jobs))
+
+    embedder_cfg = config.get("embedder_config") or {}
+    global_embedder = embedder_cfg.get("model_name") if isinstance(embedder_cfg, dict) else None
+    if global_embedder:
+        seen_models[global_embedder] = resolve_model(global_embedder)
+
+    for node_type, entry in _walk_modules(config.get("search_space") or []):
+        module = entry.get("module_name", "?")
+        model_names = _extract_model_names(entry)
+        if not model_names and global_embedder and module in {"linear", "catboost", "knn", "mlknn"}:
+            model_names = [global_embedder]
+
+        for name in model_names:
+            meta = seen_models.setdefault(name, resolve_model(name))
+
+            mixed_precision = entry.get("dtype") in {"fp16", "bf16"}
+            if module == "bert":
+                mode = "full-finetune"
+            elif module == "lora":
+                mode = "lora"
+            elif module == "dnnc":
+                mode = "reranker"
+            elif module == "ptuning":
+                mode = "full-finetune"
+            else:
+                mode = "inference"
+
+            batch_size = _max_int(entry.get("batch_size"), 32)
+            epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10)
+
+            vram = _vram_for_transformer(meta, mode, mixed_precision)
+            ram = _ram_for_module(meta, stats)
+
+            time_h = 0.0
+            if mode != "inference":
+                time_h = _time_for_transformer(
+                    meta=meta,
+                    n_trials=n_trials,
+                    epochs=epochs,
+                    batch_size=batch_size,
+                    n_samples=stats.n_samples,
+                    device_class=hardware.device_class,
+                )
+            if refit_after and mode != "inference":
+                time_h *= 1 + 1.0 / max(1, n_trials)
+
+            estimate.vram_gb = max(estimate.vram_gb, vram)
+            estimate.ram_gb = max(estimate.ram_gb, ram)
+            estimate.time_hours += time_h
+            estimate.drivers.append(
+                {
+                    "node_type": node_type,
+                    "module": module,
+                    "model": name,
+                    "mode": mode,
+                    "vram_gb": round(vram, 2),
+                    "ram_gb": round(ram, 2),
+                    "time_hours": round(time_h, 2),
+                    "confidence": meta.confidence,
+                }
+            )
+
+    for meta in seen_models.values():
+        if meta.cached_locally:
+            estimate.disk_cached_gb += meta.disk_gb
+        else:
+            estimate.disk_download_gb += meta.disk_gb
+
+    if dump_modules:
+        weights_total = sum(m.weights_gb for m in seen_models.values())
+        estimate.disk_dump_gb = weights_total * n_trials
+
+    if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}:
+        effective_vram = estimate.vram_gb * n_jobs
+    else:
+        effective_vram = estimate.vram_gb
+
+    report.resource = estimate
+
+    # render findings
+    vram_sev = _classify_severity(effective_vram, hardware.vram_gb)
+    if hardware.accelerator == "cpu" and effective_vram > 0:
+        report.add(
+            "resource",
+            Severity.YELLOW,
+            f"No GPU detected; transformer modules will be very slow (worst case ~{estimate.time_hours:.1f} h).",
+            metric="vram",
+        )
+    else:
+        msg = f"VRAM ~{effective_vram:.1f} GB"
+        if n_jobs > 1:
+            msg += f" (= per-trial {estimate.vram_gb:.1f} GB × {n_jobs} parallel trials)"
+        msg += f" vs available {hardware.vram_gb:.1f} GB"
+        report.add("resource", vram_sev, msg, metric="vram")
+
+    ram_sev = _classify_severity(estimate.ram_gb, hardware.ram_gb)
+    report.add(
+        "resource",
+        ram_sev,
+        f"RAM ~{estimate.ram_gb:.1f} GB vs available {hardware.ram_gb:.1f} GB",
+        metric="ram",
+    )
+
+    disk_total = estimate.disk_download_gb + estimate.disk_dump_gb
+    disk_sev = _classify_severity(disk_total, hardware.free_disk_gb)
+    disk_msg = f"Disk ~{estimate.disk_download_gb:.1f} GB to download"
+    if estimate.disk_cached_gb > 0:
+        disk_msg += f", {estimate.disk_cached_gb:.1f} GB already cached"
+    if estimate.disk_dump_gb > 0:
+        disk_msg += f", +{estimate.disk_dump_gb:.1f} GB during training (dump_modules=True)"
+    disk_msg += f" vs {hardware.free_disk_gb:.0f} GB free"
+    report.add("resource", disk_sev, disk_msg, metric="disk")
+
+    if estimate.time_hours > 0:
+        time_msg = f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)"
+        report.add("resource", Severity.GREEN, time_msg, metric="time")
+
+
+def _config_phase(
+    config: dict[str, Any],
+    hardware: HardwareProfile,
+    report: PreflightReport,
+) -> None:
+    hpo = config.get("hpo_config") or {}
+    n_jobs = int(hpo.get("n_jobs", 1))
+
+    if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}:
+        report.add(
+            "config",
+            Severity.YELLOW,
+            f"hpo_config.n_jobs={n_jobs} on a single GPU multiplies VRAM demand by {n_jobs}×.",
+        )
+
+    uses_catboost_gpu = False
+    for _, entry in _walk_modules(config.get("search_space") or []):
+        if entry.get("module_name") == "catboost" and entry.get("task_type") == "GPU":
+            uses_catboost_gpu = True
+            break
+    if uses_catboost_gpu and hardware.accelerator != "cuda":
+        report.add(
+            "config",
+            Severity.YELLOW,
+            "CatBoost task_type=GPU configured but no CUDA detected — will fall back to CPU.",
+        )
+
+
+def _data_phase(
+    config: dict[str, Any],
+    stats: DatasetStats,
+    report: PreflightReport,
+) -> None:
+    # token-length truncation (heuristic — we use stats.p95_tokens vs configured max_length)
+    p95 = stats.p95_tokens or int(stats.avg_tokens * 2.5)
+    for _, entry in _walk_modules(config.get("search_space") or []):
+        max_len_value = entry.get("max_length")
+        if max_len_value is None:
+            continue
+        max_len = _max_int(max_len_value, 512)
+        if p95 > max_len:
+            severity = Severity.RED if p95 > max_len * 1.5 else Severity.YELLOW
+            report.add(
+                "data",
+                severity,
+                f"Train tokens p95~{p95} exceeds {entry.get('module_name', '?')}.max_length={max_len}; expect silent truncation.",
+            )
+
+    # rare class × linear-CV
+    has_linear = any(
+        e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or [])
+    )
+    if has_linear and stats.rare_classes:
+        report.add(
+            "data",
+            Severity.RED,
+            (
+                "LogisticRegressionCV (cv=3) will fail: classes "
+                f"{stats.rare_classes[:5]} have <3 samples."
+            ),
+        )
+
+    # partial descriptions × description scorer
+    has_description = any(
+        e.get("module_name") == "description"
+        for _, e in _walk_modules(config.get("search_space") or [])
+    )
+    if has_description and stats.has_descriptions is False:
+        report.add(
+            "data",
+            Severity.RED,
+            "description scorer present but intent descriptions are missing — fill them in or drop the scorer.",
+        )
+
+
+def run_preflight(
+    config: dict[str, Any],
+    stats: DatasetStats,
+    hardware: HardwareProfile,
+    *,
+    preset_name: str | None = None,
+) -> PreflightReport:
+    """Run all three phases and return one report.
+
+    Args:
+        config: parsed preset / OptimizationConfig dict (top-level keys:
+            ``search_space``, ``hpo_config``, optional ``embedder_config``).
+        stats: dataset statistics (real or placeholder).
+        hardware: detected hardware profile.
+        preset_name: optional friendly name for the report header.
+
+    Returns:
+        PreflightReport with findings across resource/data/config phases.
+    """
+    report = PreflightReport(
+        preset_name=preset_name,
+        hardware={
+            "accelerator": hardware.accelerator,
+            "device_name": hardware.device_name,
+            "vram_gb": round(hardware.vram_gb, 2),
+            "ram_gb": round(hardware.ram_gb, 2),
+            "free_disk_gb": round(hardware.free_disk_gb, 2),
+            "device_class": hardware.device_class,
+        },
+        dataset={
+            "n_samples": stats.n_samples,
+            "n_classes": stats.n_classes,
+            "avg_tokens": stats.avg_tokens,
+            "p95_tokens": stats.p95_tokens,
+            "multilabel": stats.multilabel,
+            "source": stats.source,
+        },
+    )
+    report.notes.extend(hardware.notes)
+
+    _resource_phase(config, stats, hardware, report)
+    _data_phase(config, stats, report)
+    _config_phase(config, hardware, report)
+
+    return report
diff --git a/src/autointent/_advisor/_hardware.py b/src/autointent/_advisor/_hardware.py
new file mode 100644
index 000000000..2bda6120f
--- /dev/null
+++ b/src/autointent/_advisor/_hardware.py
@@ -0,0 +1,160 @@
+"""Local hardware detection.
+
+Probes CPU / RAM / disk and the highest-priority accelerator available
+(CUDA → MPS → CPU). All probes are wrapped to fall back safely on a
+broken install (e.g. CUDA driver mismatch) rather than crash the advisor.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import platform
+import shutil
+from dataclasses import dataclass, field
+from typing import Literal
+
+logger = logging.getLogger(__name__)
+
+Accelerator = Literal["cuda", "mps", "cpu"]
+
+# matches macOS PYTORCH_MPS_HIGH_WATERMARK_RATIO default
+MPS_DEFAULT_BUDGET_RATIO = 0.7
+
+
+@dataclass
+class HardwareProfile:
+    accelerator: Accelerator
+    device_name: str
+    vram_gb: float
+    ram_gb: float
+    free_disk_gb: float
+    cpu_count: int
+    notes: list[str] = field(default_factory=list)
+
+    @property
+    def device_class(self) -> str:
+        if self.accelerator == "cpu":
+            return "cpu"
+        if self.accelerator == "mps":
+            return "apple-silicon"
+        if self.vram_gb >= 24:
+            return "high-gpu"
+        if self.vram_gb >= 12:
+            return "mid-gpu"
+        return "low-gpu"
+
+
+def _detect_ram_gb() -> float:
+    try:
+        import psutil
+
+        return psutil.virtual_memory().total / (1024**3)
+    except ImportError:
+        logger.debug("psutil unavailable; RAM unknown")
+        return 0.0
+
+
+def _detect_free_disk_gb(path: str | None = None) -> float:
+    cache = path or os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
+    probe_path = cache if os.path.exists(cache) else os.path.expanduser("~")
+    try:
+        usage = shutil.disk_usage(probe_path)
+        return usage.free / (1024**3)
+    except OSError as e:
+        logger.debug("disk usage probe failed at %s: %s", probe_path, e)
+        return 0.0
+
+
+def _detect_cuda() -> tuple[float, str] | None:
+    try:
+        import torch
+
+        if not torch.cuda.is_available():
+            return None
+        idx = 0
+        try:
+            free, total = torch.cuda.mem_get_info(idx)
+            vram_gb = total / (1024**3)
+        except (RuntimeError, AttributeError) as e:
+            logger.debug("torch.cuda.mem_get_info failed: %s", e)
+            return None
+        name = torch.cuda.get_device_name(idx)
+        return vram_gb, name
+    except ImportError:
+        return None
+    except Exception as e:  # noqa: BLE001 - protect the advisor from torch quirks
+        logger.debug("CUDA detection raised: %s", e)
+        return None
+
+
+def _detect_mps(ram_gb: float, budget_ratio: float = MPS_DEFAULT_BUDGET_RATIO) -> tuple[float, str] | None:
+    try:
+        import torch
+
+        if not (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()):
+            return None
+        # apple silicon: unified memory; budget is fraction of total RAM
+        return ram_gb * budget_ratio, f"Apple Silicon ({platform.machine()})"
+    except ImportError:
+        return None
+    except Exception as e:  # noqa: BLE001
+        logger.debug("MPS detection raised: %s", e)
+        return None
+
+
+def detect_hardware(
+    *,
+    vram_budget_gb: float | None = None,
+    mps_budget_ratio: float = MPS_DEFAULT_BUDGET_RATIO,
+) -> HardwareProfile:
+    """Detect the local hardware, with optional manual overrides.
+
+    Args:
+        vram_budget_gb: when set, overrides the detected VRAM (use for
+            shared-GPU machines where part of the device is taken).
+        mps_budget_ratio: fraction of total RAM treated as the MPS
+            "VRAM" budget on Apple Silicon.
+
+    Returns:
+        HardwareProfile reflecting current machine state.
+    """
+    notes: list[str] = []
+    ram_gb = _detect_ram_gb()
+    free_disk_gb = _detect_free_disk_gb()
+    cpu_count = os.cpu_count() or 1
+
+    cuda = _detect_cuda()
+    if cuda is not None:
+        vram_gb, device_name = cuda
+        accel: Accelerator = "cuda"
+    else:
+        mps = _detect_mps(ram_gb, mps_budget_ratio)
+        if mps is not None:
+            vram_gb, device_name = mps
+            accel = "mps"
+            notes.append(
+                f"MPS unified memory: VRAM budget = {mps_budget_ratio:.0%} of RAM."
+            )
+        else:
+            vram_gb = 0.0
+            device_name = platform.processor() or "cpu"
+            accel = "cpu"
+
+    if vram_budget_gb is not None:
+        if vram_gb and vram_budget_gb > vram_gb:
+            notes.append(
+                f"Manual --budget-vram-gb={vram_budget_gb} exceeds detected {vram_gb:.1f} GB; using override."
+            )
+        notes.append(f"Using manual VRAM budget: {vram_budget_gb} GB.")
+        vram_gb = vram_budget_gb
+
+    return HardwareProfile(
+        accelerator=accel,
+        device_name=device_name,
+        vram_gb=vram_gb,
+        ram_gb=ram_gb,
+        free_disk_gb=free_disk_gb,
+        cpu_count=cpu_count,
+        notes=notes,
+    )
diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py
new file mode 100644
index 000000000..80ccb7133
--- /dev/null
+++ b/src/autointent/_advisor/_hub.py
@@ -0,0 +1,183 @@
+"""HF Hub metadata lookups + warm-cache probe.
+
+Memoized per-process. Offline-safe: every probe falls back to a
+heuristic value rather than raising. The advisor flips the report's
+``low_confidence`` flag when a fallback is taken.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Coarse heuristic estimates keyed on name fragments. Used only when HF Hub
+# is unreachable and we can't get safetensors metadata. Values in millions.
+_NAME_HEURISTICS = [
+    (re.compile(r"(?i)(deberta|roberta|bert).*(xxlarge|huge)"), 1_500),
+    (re.compile(r"(?i)(deberta|roberta|bert).*xlarge"), 750),
+    (re.compile(r"(?i)(deberta|roberta|bert).*large"), 350),
+    (re.compile(r"(?i)e5.*large"), 560),
+    (re.compile(r"(?i)e5.*small"), 33),
+    (re.compile(r"(?i)mpnet"), 110),
+    (re.compile(r"(?i)minilm"), 33),
+    (re.compile(r"(?i)distil"), 66),
+    (re.compile(r"(?i)small"), 60),
+    (re.compile(r"(?i)base"), 110),
+    (re.compile(r"(?i)large"), 350),
+]
+
+
+@dataclass
+class ModelMeta:
+    name: str
+    params_millions: float
+    weight_bytes_per_param: int
+    total_file_bytes: int
+    cached_locally: bool
+    confidence: str  # "hub" | "heuristic"
+
+    @property
+    def disk_gb(self) -> float:
+        return self.total_file_bytes / (1024**3)
+
+    @property
+    def weights_gb(self) -> float:
+        return (self.params_millions * 1_000_000 * self.weight_bytes_per_param) / (1024**3)
+
+
+@lru_cache(maxsize=1)
+def hub_reachable(timeout_s: float = 2.0) -> bool:
+    """Single up-front probe. Memoized per process."""
+    try:
+        from huggingface_hub import HfApi
+
+        HfApi().list_models(limit=1)
+    except ImportError:
+        logger.debug("huggingface_hub not installed; assuming offline")
+        return False
+    except Exception as e:  # noqa: BLE001
+        logger.debug("HF Hub probe failed: %s", e)
+        return False
+    else:
+        return True
+
+
+def _heuristic_params_millions(model_name: str) -> float:
+    for pattern, m in _NAME_HEURISTICS:
+        if pattern.search(model_name):
+            return float(m)
+    return 110.0  # generic BERT-base default
+
+
+def _is_warm_cached(model_name: str) -> bool:
+    """True when the weight shard is present in the local HF cache."""
+    try:
+        from huggingface_hub import scan_cache_dir, try_to_load_from_cache
+    except ImportError:
+        return False
+
+    weight_files = ["model.safetensors", "pytorch_model.bin", "model.safetensors.index.json"]
+    for fname in weight_files:
+        path = try_to_load_from_cache(model_name, fname)
+        if path is not None and path is not False:
+            return True
+
+    # sharded models won't match the single-file probe; fall back to a scan
+    try:
+        cache = scan_cache_dir()
+    except Exception as e:  # noqa: BLE001
+        logger.debug("scan_cache_dir failed: %s", e)
+        return False
+    return any(repo.repo_id == model_name for repo in cache.repos)
+
+
+def _hub_metadata(model_name: str) -> ModelMeta | None:
+    try:
+        from huggingface_hub import HfApi
+    except ImportError:
+        return None
+
+    try:
+        info = HfApi().model_info(model_name, files_metadata=True)
+    except Exception as e:  # noqa: BLE001
+        logger.debug("model_info(%s) failed: %s", model_name, e)
+        return None
+
+    params_millions = 0.0
+    weight_bytes_per_param = 4
+    safetensors = getattr(info, "safetensors", None)
+    if safetensors is not None:
+        params_total = getattr(safetensors, "total", None) or sum(
+            getattr(safetensors, "parameters", {}).values() or [0]
+        )
+        if params_total:
+            params_millions = params_total / 1_000_000
+            params_map: dict[str, Any] = getattr(safetensors, "parameters", {}) or {}
+            if any("F16" in k or "BF16" in k for k in params_map):
+                weight_bytes_per_param = 2
+
+    total_file_bytes = 0
+    for sibling in getattr(info, "siblings", []) or []:
+        size = getattr(sibling, "size", None)
+        if size:
+            total_file_bytes += int(size)
+
+    if params_millions == 0:
+        params_millions = _heuristic_params_millions(model_name)
+
+    if total_file_bytes == 0:
+        total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param)
+
+    return ModelMeta(
+        name=model_name,
+        params_millions=params_millions,
+        weight_bytes_per_param=weight_bytes_per_param,
+        total_file_bytes=total_file_bytes,
+        cached_locally=_is_warm_cached(model_name),
+        confidence="hub",
+    )
+
+
+def _heuristic_metadata(model_name: str) -> ModelMeta:
+    params_millions = _heuristic_params_millions(model_name)
+    weight_bytes_per_param = 4
+    total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param)
+    return ModelMeta(
+        name=model_name,
+        params_millions=params_millions,
+        weight_bytes_per_param=weight_bytes_per_param,
+        total_file_bytes=total_file_bytes,
+        cached_locally=_is_warm_cached(model_name),
+        confidence="heuristic",
+    )
+
+
+@lru_cache(maxsize=64)
+def resolve_model(model_name: str) -> ModelMeta:
+    """Resolve metadata for a single model name. Memoized per process.
+
+    Always returns a value — never raises — so the advisor can keep going
+    on offline machines or for unknown checkpoints.
+    """
+    if model_name.startswith("local:") or os.path.isabs(model_name):
+        return ModelMeta(
+            name=model_name,
+            params_millions=_heuristic_params_millions(model_name),
+            weight_bytes_per_param=4,
+            total_file_bytes=0,
+            cached_locally=True,
+            confidence="heuristic",
+        )
+
+    if hub_reachable():
+        meta = _hub_metadata(model_name)
+        if meta is not None:
+            return meta
+
+    return _heuristic_metadata(model_name)
diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py
new file mode 100644
index 000000000..52168aa75
--- /dev/null
+++ b/src/autointent/_advisor/_render.py
@@ -0,0 +1,104 @@
+"""Rendering for the pre-flight report.
+
+Text output is grouped by phase (Resource / Data / Config) plus a Drivers
+section and the always-on disclaimer. JSON output dumps the structured
+report straight through.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from ._report import PreflightReport
+
+_SEVERITY_TAG = {"green": "✓", "yellow": "⚠", "red": "✗"}
+_PHASE_ORDER = ("resource", "data", "config")
+_PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"}
+
+
+def render_text(report: "PreflightReport") -> str:
+    lines: list[str] = []
+    title = "Compute feasibility check"
+    if report.preset_name:
+        title += f" — {report.preset_name}"
+    lines.append(title)
+    lines.append("─" * len(title))
+
+    hw = report.hardware
+    lines.append(
+        f"Hardware: {hw.get('accelerator', '?')} ({hw.get('device_name', '?')}),"
+        f" {hw.get('vram_gb', 0):.1f} GB VRAM, {hw.get('ram_gb', 0):.0f} GB RAM,"
+        f" {hw.get('free_disk_gb', 0):.0f} GB free disk"
+    )
+    ds = report.dataset
+    lines.append(
+        f"Dataset: n_samples={ds.get('n_samples')}, n_classes={ds.get('n_classes')},"
+        f" avg_tokens={ds.get('avg_tokens')} ({ds.get('source')})"
+    )
+    lines.append("")
+
+    for phase in _PHASE_ORDER:
+        bucket = [f for f in report.findings if f.phase == phase]
+        if not bucket:
+            continue
+        lines.append(f"{_PHASE_LABEL[phase]}:")
+        for f in bucket:
+            tag = _SEVERITY_TAG.get(f.severity.value, "·")
+            lines.append(f"  {tag} {f.message}")
+        lines.append("")
+
+    if report.resource.drivers:
+        lines.append("Drivers of cost:")
+        for d in report.resource.drivers[:8]:
+            lines.append(
+                f"  {d['node_type']}.{d['module']:<10} {d['model']:<48}"
+                f"  {d['mode']:<14}  VRAM ~{d['vram_gb']} GB, time ~{d['time_hours']} h"
+                f"  [{d['confidence']}]"
+            )
+        if len(report.resource.drivers) > 8:
+            lines.append(f"  … and {len(report.resource.drivers) - 8} more")
+        lines.append("")
+
+    if report.notes:
+        lines.append("Notes:")
+        for note in report.notes:
+            lines.append(f"  • {note}")
+        lines.append("")
+
+    summary = f"Verdict: {'feasible' if report.is_feasible else 'INFEASIBLE'} "
+    summary += f"(worst severity: {report.worst_severity.value})"
+    if report.low_confidence:
+        summary += " — low-confidence (heuristic fallback in use)"
+    lines.append(summary)
+    lines.append("Note: estimates are heuristic upper bounds, not measurements.")
+    return "\n".join(lines)
+
+
+def render_json(report: "PreflightReport") -> str:
+    return json.dumps(report.to_dict(), indent=2, default=str)
+
+
+def render_recommendation(
+    results: list[tuple[str, "PreflightReport"]],
+    chosen: str | None,
+) -> str:
+    """Compact table for the ``recommend`` subcommand."""
+    lines = ["", "Recommendation:"]
+    if chosen:
+        lines.append(f"  → {chosen}")
+    else:
+        lines.append("  → none of the bundled presets fit your hardware as-is.")
+    lines.append("")
+    lines.append(f"{'Preset':<24} {'Status':<14} {'VRAM':<10} {'Time':<10} {'Worst':<8}")
+    lines.append("-" * 68)
+    for name, report in results:
+        verdict = "feasible" if report.is_feasible else "infeasible"
+        lines.append(
+            f"{name:<24} {verdict:<14} "
+            f"{report.resource.vram_gb:>4.1f} GB   "
+            f"{report.resource.time_hours:>4.1f} h    "
+            f"{report.worst_severity.value:<8}"
+        )
+    return "\n".join(lines)
diff --git a/src/autointent/_advisor/_report.py b/src/autointent/_advisor/_report.py
new file mode 100644
index 000000000..0250482a5
--- /dev/null
+++ b/src/autointent/_advisor/_report.py
@@ -0,0 +1,113 @@
+"""Dataclasses for the pre-flight advisor's structured report."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from enum import Enum
+from typing import Any, Literal
+
+
+class Severity(str, Enum):
+    GREEN = "green"
+    YELLOW = "yellow"
+    RED = "red"
+
+
+Phase = Literal["resource", "data", "config"]
+
+
+@dataclass(frozen=True)
+class Finding:
+    """A single advisor finding rendered as one line in the summary."""
+
+    phase: Phase
+    severity: Severity
+    message: str
+    metric: str | None = None
+
+
+@dataclass
+class ResourceEstimate:
+    """Aggregated resource numbers across the search space."""
+
+    disk_download_gb: float = 0.0
+    disk_cached_gb: float = 0.0
+    disk_dump_gb: float = 0.0
+    ram_gb: float = 0.0
+    vram_gb: float = 0.0
+    time_hours: float = 0.0
+    parallel_factor: int = 1
+    drivers: list[dict[str, Any]] = field(default_factory=list)
+
+    @property
+    def total_disk_gb(self) -> float:
+        return self.disk_download_gb + self.disk_dump_gb
+
+
+@dataclass
+class DatasetStats:
+    """Minimal stats the advisor needs about the user's dataset.
+
+    Built either from a real ``Dataset`` or from CLI placeholder flags.
+    """
+
+    n_samples: int
+    n_classes: int
+    avg_tokens: int
+    p95_tokens: int | None = None
+    multilabel: bool = False
+    has_descriptions: bool | None = None
+    rare_classes: list[str] = field(default_factory=list)
+    source: str = "placeholder"
+
+    @classmethod
+    def placeholder(
+        cls,
+        n_samples: int = 1_000,
+        n_classes: int = 10,
+        avg_tokens: int = 32,
+        multilabel: bool = False,
+    ) -> "DatasetStats":
+        return cls(
+            n_samples=n_samples,
+            n_classes=n_classes,
+            avg_tokens=avg_tokens,
+            p95_tokens=int(avg_tokens * 2.5),
+            multilabel=multilabel,
+        )
+
+
+@dataclass
+class PreflightReport:
+    """One report covering all three phases."""
+
+    findings: list[Finding] = field(default_factory=list)
+    resource: ResourceEstimate = field(default_factory=ResourceEstimate)
+    hardware: dict[str, Any] = field(default_factory=dict)
+    dataset: dict[str, Any] = field(default_factory=dict)
+    preset_name: str | None = None
+    low_confidence: bool = False
+    notes: list[str] = field(default_factory=list)
+
+    def add(self, phase: Phase, severity: Severity, message: str, metric: str | None = None) -> None:
+        self.findings.append(Finding(phase=phase, severity=severity, message=message, metric=metric))
+
+    @property
+    def worst_severity(self) -> Severity:
+        order = {Severity.GREEN: 0, Severity.YELLOW: 1, Severity.RED: 2}
+        if not self.findings:
+            return Severity.GREEN
+        return max((f.severity for f in self.findings), key=lambda s: order[s])
+
+    @property
+    def is_feasible(self) -> bool:
+        return self.worst_severity != Severity.RED
+
+    def to_dict(self) -> dict[str, Any]:
+        d = asdict(self)
+        d["findings"] = [
+            {**asdict(f), "severity": f.severity.value} for f in self.findings
+        ]
+        d["worst_severity"] = self.worst_severity.value
+        d["is_feasible"] = self.is_feasible
+        return d
diff --git a/tests/advisor/__init__.py b/tests/advisor/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py
new file mode 100644
index 000000000..18c2615a6
--- /dev/null
+++ b/tests/advisor/test_estimates_and_cli.py
@@ -0,0 +1,198 @@
+"""End-to-end smoke tests for the advisor.
+
+These run offline — HF Hub probes are monkeypatched to fail so the
+advisor falls back to its name-pattern heuristics. Verifies that:
+
+* every bundled preset can be inspected without raising;
+* the recommend subcommand picks something on a generous budget and
+  nothing on a hostile one;
+* ``--json`` emits parseable JSON.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+
+import pytest
+
+from autointent._advisor import DatasetStats, HardwareProfile, run_preflight
+from autointent._advisor._cli import BUNDLED_PRESETS, main
+from autointent.utils import load_preset
+
+
+@pytest.fixture(autouse=True)
+def _force_offline(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Pin the HF Hub probe to "offline" so tests don't hit the network."""
+    from autointent._advisor import _estimates, _hub
+
+    _hub.hub_reachable.cache_clear()
+    _hub.resolve_model.cache_clear()
+    offline = lambda *_a, **_kw: False  # noqa: E731
+    monkeypatch.setattr(_hub, "hub_reachable", offline)
+    monkeypatch.setattr(_estimates, "hub_reachable", offline)
+
+
+def _profile(vram_gb: float = 16.0) -> HardwareProfile:
+    return HardwareProfile(
+        accelerator="cuda" if vram_gb > 0 else "cpu",
+        device_name="test-gpu" if vram_gb > 0 else "test-cpu",
+        vram_gb=vram_gb,
+        ram_gb=32.0,
+        free_disk_gb=200.0,
+        cpu_count=8,
+    )
+
+
+@pytest.mark.parametrize("preset", BUNDLED_PRESETS)
+def test_every_preset_inspects_without_raising(preset: str) -> None:
+    cfg = load_preset(preset)  # type: ignore[arg-type]
+    stats = DatasetStats.placeholder(n_samples=500, n_classes=10, avg_tokens=24)
+    report = run_preflight(cfg, stats, _profile(vram_gb=16.0), preset_name=preset)
+    assert report.preset_name == preset
+    assert report.low_confidence is True  # we forced offline
+    # always at least one resource-phase finding
+    assert any(f.phase == "resource" for f in report.findings)
+
+
+def test_heavy_preset_is_infeasible_on_2gb_budget() -> None:
+    cfg = load_preset("transformers-heavy")  # type: ignore[arg-type]
+    stats = DatasetStats.placeholder(n_samples=5000, n_classes=20, avg_tokens=40)
+    report = run_preflight(cfg, stats, _profile(vram_gb=2.0), preset_name="transformers-heavy")
+    assert not report.is_feasible, "deberta-v3-large should not fit in 2 GB"
+
+
+def test_light_preset_is_feasible_on_8gb_budget() -> None:
+    cfg = load_preset("transformers-light")  # type: ignore[arg-type]
+    stats = DatasetStats.placeholder(n_samples=1000, n_classes=10, avg_tokens=24)
+    report = run_preflight(cfg, stats, _profile(vram_gb=8.0), preset_name="transformers-light")
+    assert report.is_feasible
+
+
+def test_n_jobs_doubles_vram_findings() -> None:
+    cfg = load_preset("transformers-light")  # type: ignore[arg-type]
+    cfg = {**cfg, "hpo_config": {**(cfg.get("hpo_config") or {}), "n_jobs": 4}}
+    stats = DatasetStats.placeholder()
+    report = run_preflight(cfg, stats, _profile(vram_gb=4.0))
+    assert any("parallel trials" in f.message for f in report.findings)
+    assert any(f.phase == "config" and "n_jobs" in f.message for f in report.findings)
+
+
+def test_cli_inspect_json_is_parseable(capsys: pytest.CaptureFixture[str]) -> None:
+    rc = main(
+        [
+            "inspect",
+            "transformers-light",
+            "--n-samples",
+            "500",
+            "--n-classes",
+            "5",
+            "--avg-tokens",
+            "20",
+            "--json",
+            "--budget-vram-gb",
+            "16",
+        ]
+    )
+    captured = capsys.readouterr()
+    payload = json.loads(captured.out)
+    assert payload["preset_name"] == "transformers-light"
+    assert "findings" in payload
+    assert payload["worst_severity"] in {"green", "yellow", "red"}
+    # rc is 0 on feasible, 1 otherwise
+    assert rc in (0, 1)
+
+
+def test_cli_inspect_text_runs(capsys: pytest.CaptureFixture[str]) -> None:
+    main(
+        [
+            "inspect",
+            "transformers-light",
+            "--n-samples",
+            "200",
+            "--n-classes",
+            "5",
+            "--avg-tokens",
+            "15",
+            "--budget-vram-gb",
+            "16",
+        ]
+    )
+    out = capsys.readouterr().out
+    assert "Compute feasibility check" in out
+    assert "Verdict:" in out
+
+
+def test_cli_recommend_picks_a_preset_on_generous_hardware(
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    rc = main(
+        [
+            "recommend",
+            "--n-samples",
+            "1000",
+            "--n-classes",
+            "10",
+            "--avg-tokens",
+            "20",
+            "--budget-vram-gb",
+            "24",
+        ]
+    )
+    out = capsys.readouterr().out
+    assert "Recommendation:" in out
+    assert rc == 0
+
+
+def test_partial_descriptions_with_description_scorer_flags_red() -> None:
+    cfg = {
+        "search_space": [
+            {
+                "node_type": "scoring",
+                "search_space": [
+                    {"module_name": "description"},
+                ],
+            }
+        ],
+    }
+    stats = DatasetStats(
+        n_samples=500,
+        n_classes=10,
+        avg_tokens=24,
+        has_descriptions=False,
+    )
+    report = run_preflight(cfg, stats, _profile(vram_gb=16.0))
+    assert any(
+        f.phase == "data" and "description" in f.message.lower() for f in report.findings
+    )
+
+
+def test_long_dataset_triggers_truncation_warning() -> None:
+    cfg = {
+        "search_space": [
+            {
+                "node_type": "scoring",
+                "search_space": [
+                    {
+                        "module_name": "bert",
+                        "classification_model_config": [
+                            {"model_name": "microsoft/deberta-v3-small"}
+                        ],
+                        "max_length": [128],
+                    }
+                ],
+            }
+        ],
+    }
+    stats = DatasetStats(
+        n_samples=500,
+        n_classes=10,
+        avg_tokens=80,
+        p95_tokens=512,  # well over 128
+    )
+    report = run_preflight(cfg, stats, _profile(vram_gb=16.0))
+    assert any("truncation" in f.message.lower() for f in report.findings)
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__, "-v"]))
diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py
new file mode 100644
index 000000000..0317ff27b
--- /dev/null
+++ b/tests/advisor/test_estimates_internals.py
@@ -0,0 +1,319 @@
+"""Targeted tests for `_estimates` helpers + edge cases of `run_preflight`."""
+
+from __future__ import annotations
+
+import pytest
+
+from autointent._advisor import _estimates, _hub
+from autointent._advisor._estimates import (
+    _classify_severity,
+    _extract_model_names,
+    _max_int,
+    _ram_for_module,
+    _vram_for_transformer,
+    run_preflight,
+)
+from autointent._advisor._hardware import HardwareProfile
+from autointent._advisor._hub import ModelMeta
+from autointent._advisor._report import DatasetStats, Severity
+
+
+@pytest.fixture(autouse=True)
+def _offline(monkeypatch: pytest.MonkeyPatch) -> None:
+    _hub.hub_reachable.cache_clear()
+    _hub.resolve_model.cache_clear()
+    offline = lambda *_a, **_kw: False  # noqa: E731
+    monkeypatch.setattr(_hub, "hub_reachable", offline)
+    monkeypatch.setattr(_estimates, "hub_reachable", offline)
+    monkeypatch.setattr(_hub, "_is_warm_cached", lambda _name: False)
+
+
+def _profile(vram_gb: float = 16.0, accelerator: str = "cuda") -> HardwareProfile:
+    return HardwareProfile(
+        accelerator=accelerator,  # type: ignore[arg-type]
+        device_name=f"test-{accelerator}",
+        vram_gb=vram_gb,
+        ram_gb=32.0,
+        free_disk_gb=200.0,
+        cpu_count=8,
+    )
+
+
+class TestMaxInt:
+    def test_none_returns_default(self) -> None:
+        assert _max_int(None, 7) == 7
+
+    def test_list_picks_max(self) -> None:
+        assert _max_int([1, 5, 3], 0) == 5
+
+    def test_range_dict_uses_high(self) -> None:
+        assert _max_int({"low": 1, "high": 9}, 0) == 9
+
+    def test_scalar_int_passes_through(self) -> None:
+        assert _max_int(42, 0) == 42
+
+    def test_garbage_returns_default(self) -> None:
+        assert _max_int("not-a-number", 11) == 11
+
+
+class TestExtractModelNames:
+    def test_classification_model_config_as_list(self) -> None:
+        entry = {"classification_model_config": [{"model_name": "foo/bar"}]}
+        assert _extract_model_names(entry) == ["foo/bar"]
+
+    def test_classification_model_config_as_dict(self) -> None:
+        entry = {"classification_model_config": {"model_name": "foo/bar"}}
+        assert _extract_model_names(entry) == ["foo/bar"]
+
+    def test_embedder_config_picked_up(self) -> None:
+        entry = {"embedder_config": [{"model_name": "e/b"}]}
+        assert _extract_model_names(entry) == ["e/b"]
+
+    def test_multiple_choices_all_returned(self) -> None:
+        entry = {
+            "classification_model_config": [
+                {"model_name": "a/x"},
+                {"model_name": "b/y"},
+            ]
+        }
+        assert _extract_model_names(entry) == ["a/x", "b/y"]
+
+    def test_empty_entry(self) -> None:
+        assert _extract_model_names({}) == []
+
+
+class TestClassifySeverity:
+    def test_below_yellow_is_green(self) -> None:
+        assert _classify_severity(estimate=1.0, budget=10.0) == Severity.GREEN
+
+    def test_above_yellow_threshold(self) -> None:
+        assert _classify_severity(estimate=8.0, budget=10.0) == Severity.YELLOW
+
+    def test_at_or_above_red_threshold(self) -> None:
+        assert _classify_severity(estimate=10.0, budget=10.0) == Severity.RED
+        assert _classify_severity(estimate=12.0, budget=10.0) == Severity.RED
+
+    def test_zero_budget_returns_yellow(self) -> None:
+        assert _classify_severity(estimate=1.0, budget=0.0) == Severity.YELLOW
+
+
+class TestVramForTransformer:
+    @pytest.fixture
+    def meta(self) -> ModelMeta:
+        return ModelMeta(
+            name="x",
+            params_millions=100.0,
+            weight_bytes_per_param=4,
+            total_file_bytes=0,
+            cached_locally=False,
+            confidence="hub",
+        )
+
+    def test_full_finetune_is_larger_than_lora_is_larger_than_inference(
+        self, meta: ModelMeta
+    ) -> None:
+        inference = _vram_for_transformer(meta, "inference", mixed_precision=False)
+        lora = _vram_for_transformer(meta, "lora", mixed_precision=False)
+        full = _vram_for_transformer(meta, "full-finetune", mixed_precision=False)
+        assert inference < lora < full
+
+    def test_amp_does_not_naively_halve(self, meta: ModelMeta) -> None:
+        """The proposal calls out that AMP doesn't halve total VRAM — fp32 master
+        weights and Adam moments don't shrink. Weight-side accounting comes out
+        equal to fp32; the only savings (activations) aren't modeled by us."""
+        full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False)
+        full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True)
+        assert full_amp / full_fp32 == pytest.approx(1.0)
+        assert full_amp / full_fp32 > 0.5  # explicit check vs the naive-halving formula
+
+    def test_reranker_uses_inference_class(self, meta: ModelMeta) -> None:
+        inference = _vram_for_transformer(meta, "inference", mixed_precision=False)
+        reranker = _vram_for_transformer(meta, "reranker", mixed_precision=False)
+        assert reranker > inference
+
+
+def test_ram_scales_with_dataset_size() -> None:
+    meta = ModelMeta(
+        name="x",
+        params_millions=100.0,
+        weight_bytes_per_param=4,
+        total_file_bytes=0,
+        cached_locally=False,
+        confidence="hub",
+    )
+    small = _ram_for_module(meta, DatasetStats.placeholder(n_samples=100))
+    big = _ram_for_module(meta, DatasetStats.placeholder(n_samples=10_000_000, avg_tokens=128))
+    assert big > small
+
+
+class TestRunPreflightFeatures:
+    def test_dump_modules_adds_disk_during_training(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "classification_model_config": [
+                                {"model_name": "microsoft/deberta-v3-small"}
+                            ],
+                            "num_train_epochs": [3],
+                            "batch_size": [16],
+                        }
+                    ],
+                }
+            ],
+            "hpo_config": {"n_trials": 5},
+            "dump_modules": True,
+        }
+        report = run_preflight(cfg, DatasetStats.placeholder(), _profile())
+        assert report.resource.disk_dump_gb > 0
+        assert any("during training" in f.message for f in report.findings)
+
+    def test_refit_after_increases_time(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "classification_model_config": [
+                                {"model_name": "microsoft/deberta-v3-small"}
+                            ],
+                            "num_train_epochs": [3],
+                            "batch_size": [16],
+                        }
+                    ],
+                }
+            ],
+            "hpo_config": {"n_trials": 10},
+        }
+        baseline = run_preflight(cfg, DatasetStats.placeholder(), _profile())
+        cfg_refit = {**cfg, "refit_after": True}
+        bumped = run_preflight(cfg_refit, DatasetStats.placeholder(), _profile())
+        assert bumped.resource.time_hours > baseline.resource.time_hours
+
+    def test_catboost_gpu_without_cuda_flags_config(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {"module_name": "catboost", "task_type": "GPU"},
+                    ],
+                }
+            ],
+        }
+        report = run_preflight(cfg, DatasetStats.placeholder(), _profile(accelerator="cpu"))
+        assert any(
+            f.phase == "config" and "CatBoost" in f.message for f in report.findings
+        )
+
+    def test_catboost_gpu_with_cuda_is_silent(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {"module_name": "catboost", "task_type": "GPU"},
+                    ],
+                }
+            ],
+        }
+        report = run_preflight(cfg, DatasetStats.placeholder(), _profile(accelerator="cuda"))
+        assert not any(
+            f.phase == "config" and "CatBoost" in f.message for f in report.findings
+        )
+
+    def test_offline_flips_low_confidence(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "classification_model_config": [{"model_name": "any/model"}],
+                        }
+                    ],
+                }
+            ]
+        }
+        report = run_preflight(cfg, DatasetStats.placeholder(), _profile())
+        assert report.low_confidence is True
+        assert any("HF Hub unreachable" in n for n in report.notes)
+
+    def test_rare_classes_with_linear_scorer_flag_red(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {"module_name": "linear"},
+                    ],
+                }
+            ]
+        }
+        stats = DatasetStats(
+            n_samples=20,
+            n_classes=5,
+            avg_tokens=10,
+            rare_classes=["intent_a", "intent_b"],
+        )
+        report = run_preflight(cfg, stats, _profile())
+        assert any(
+            f.phase == "data" and "LogisticRegressionCV" in f.message and f.severity == Severity.RED
+            for f in report.findings
+        )
+
+    def test_truncation_red_when_p95_dominates_max_length(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "max_length": [128],
+                            "classification_model_config": [
+                                {"model_name": "some/model"}
+                            ],
+                        }
+                    ],
+                }
+            ]
+        }
+        stats = DatasetStats(n_samples=500, n_classes=5, avg_tokens=50, p95_tokens=400)
+        report = run_preflight(cfg, stats, _profile())
+        red = [f for f in report.findings if f.phase == "data" and f.severity == Severity.RED]
+        assert red, "p95=400 > 1.5 * max_length=128 should be red"
+
+    def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "max_length": [128],
+                            "classification_model_config": [
+                                {"model_name": "some/model"}
+                            ],
+                        }
+                    ],
+                }
+            ]
+        }
+        stats = DatasetStats(n_samples=500, n_classes=5, avg_tokens=50, p95_tokens=140)
+        report = run_preflight(cfg, stats, _profile())
+        yellows = [
+            f
+            for f in report.findings
+            if f.phase == "data"
+            and f.severity == Severity.YELLOW
+            and "truncation" in f.message.lower()
+        ]
+        assert yellows
diff --git a/tests/advisor/test_hardware_detection.py b/tests/advisor/test_hardware_detection.py
new file mode 100644
index 000000000..d8131fb19
--- /dev/null
+++ b/tests/advisor/test_hardware_detection.py
@@ -0,0 +1,72 @@
+"""Hardware detection has to be safe on every machine — broken CUDA, no GPU,
+no psutil. Verify the fallbacks work without raising.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import pytest
+
+from autointent._advisor._hardware import detect_hardware
+
+
+def test_cpu_fallback_when_no_accelerator() -> None:
+    with (
+        patch("autointent._advisor._hardware._detect_cuda", return_value=None),
+        patch("autointent._advisor._hardware._detect_mps", return_value=None),
+    ):
+        hw = detect_hardware()
+    assert hw.accelerator == "cpu"
+    assert hw.vram_gb == 0.0
+    assert hw.device_class == "cpu"
+
+
+def test_cuda_branch_classifies_low_gpu() -> None:
+    with (
+        patch(
+            "autointent._advisor._hardware._detect_cuda",
+            return_value=(8.0, "NVIDIA RTX 3060"),
+        ),
+    ):
+        hw = detect_hardware()
+    assert hw.accelerator == "cuda"
+    assert hw.vram_gb == pytest.approx(8.0)
+    assert hw.device_class == "low-gpu"
+
+
+def test_mps_budget_uses_ram_fraction() -> None:
+    with (
+        patch("autointent._advisor._hardware._detect_cuda", return_value=None),
+        patch("autointent._advisor._hardware._detect_ram_gb", return_value=32.0),
+        patch(
+            "autointent._advisor._hardware._detect_mps",
+            side_effect=lambda ram, ratio: (ram * ratio, "Apple Silicon (arm64)"),
+        ),
+    ):
+        hw = detect_hardware()
+    assert hw.accelerator == "mps"
+    assert hw.vram_gb == pytest.approx(32.0 * 0.7)
+    assert any("MPS unified memory" in n for n in hw.notes)
+
+
+def test_vram_budget_override_applies() -> None:
+    with (
+        patch(
+            "autointent._advisor._hardware._detect_cuda",
+            return_value=(24.0, "NVIDIA RTX 4090"),
+        ),
+    ):
+        hw = detect_hardware(vram_budget_gb=8.0)
+    assert hw.vram_gb == pytest.approx(8.0)
+    assert any("manual VRAM budget" in n for n in hw.notes)
+
+
+def test_broken_cuda_returns_none_does_not_crash() -> None:
+    # _detect_cuda swallows torch quirks already; verify the wrapper holds.
+    with (
+        patch("autointent._advisor._hardware._detect_cuda", return_value=None),
+        patch("autointent._advisor._hardware._detect_mps", return_value=None),
+    ):
+        hw = detect_hardware()
+    assert hw.accelerator == "cpu"
diff --git a/tests/advisor/test_hub_heuristics.py b/tests/advisor/test_hub_heuristics.py
new file mode 100644
index 000000000..54a03431d
--- /dev/null
+++ b/tests/advisor/test_hub_heuristics.py
@@ -0,0 +1,81 @@
+"""Tests for the offline name-pattern heuristics in `_hub`.
+
+The advisor must produce a sensible estimate even when HF Hub is
+unreachable, so these tests pin the public `hub_reachable` to False and
+exercise the heuristic path directly.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from autointent._advisor import _hub
+
+
+@pytest.fixture(autouse=True)
+def _offline(monkeypatch: pytest.MonkeyPatch) -> None:
+    _hub.hub_reachable.cache_clear()
+    _hub.resolve_model.cache_clear()
+    monkeypatch.setattr(_hub, "hub_reachable", lambda *_a, **_kw: False)
+    monkeypatch.setattr(_hub, "_is_warm_cached", lambda _name: False)
+
+
+@pytest.mark.parametrize(
+    ("name", "expected_min_m", "expected_max_m"),
+    [
+        ("microsoft/deberta-v3-large", 200, 500),
+        ("microsoft/deberta-v3-small", 30, 200),
+        ("sentence-transformers/all-MiniLM-L6-v2", 20, 80),
+        ("intfloat/multilingual-e5-large-instruct", 300, 700),
+        ("intfloat/e5-small", 20, 80),
+        ("distilbert-base-uncased", 40, 150),
+        ("bert-base-uncased", 70, 200),
+    ],
+)
+def test_name_heuristic_picks_reasonable_bucket(
+    name: str, expected_min_m: int, expected_max_m: int
+) -> None:
+    meta = _hub.resolve_model(name)
+    assert meta.confidence == "heuristic"
+    assert expected_min_m <= meta.params_millions <= expected_max_m, (
+        f"{name} got {meta.params_millions}M; expected [{expected_min_m}, {expected_max_m}]"
+    )
+
+
+def test_unknown_name_falls_back_to_bert_base() -> None:
+    meta = _hub.resolve_model("totally-made-up/no-such-model")
+    assert meta.confidence == "heuristic"
+    assert meta.params_millions == pytest.approx(110.0)
+
+
+def test_weights_gb_matches_params_times_bytes() -> None:
+    meta = _hub.resolve_model("microsoft/deberta-v3-large")
+    expected_gb = meta.params_millions * 1_000_000 * meta.weight_bytes_per_param / (1024**3)
+    assert meta.weights_gb == pytest.approx(expected_gb)
+
+
+def test_local_path_returns_zero_disk() -> None:
+    meta = _hub.resolve_model("/tmp/local/path/to/model")
+    assert meta.total_file_bytes == 0
+    assert meta.cached_locally is True
+
+
+def test_disk_gb_falls_back_to_param_size_when_siblings_unknown() -> None:
+    meta = _hub.resolve_model("intfloat/multilingual-e5-large-instruct")
+    assert meta.disk_gb > 0
+    assert meta.disk_gb == pytest.approx(meta.weights_gb, rel=0.01)
+
+
+def test_resolve_is_memoized() -> None:
+    a = _hub.resolve_model("microsoft/deberta-v3-large")
+    b = _hub.resolve_model("microsoft/deberta-v3-large")
+    assert a is b
+
+
+def test_metadata_fallback_uses_heuristic_when_hub_unreachable() -> None:
+    """End-to-end: resolve_model must return a usable ModelMeta even when
+    the live Hub is unreachable (autouse fixture forces offline)."""
+    meta = _hub.resolve_model("microsoft/deberta-v3-large")
+    assert meta.confidence == "heuristic"
+    assert meta.params_millions > 0
+    assert meta.disk_gb > 0
diff --git a/tests/advisor/test_render.py b/tests/advisor/test_render.py
new file mode 100644
index 000000000..e82d7573b
--- /dev/null
+++ b/tests/advisor/test_render.py
@@ -0,0 +1,151 @@
+"""Output rendering: text formatting and JSON serialization."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from autointent._advisor._render import render_json, render_recommendation, render_text
+from autointent._advisor._report import (
+    DatasetStats,
+    PreflightReport,
+    ResourceEstimate,
+    Severity,
+)
+
+
+def _populated_report() -> PreflightReport:
+    r = PreflightReport(
+        preset_name="example",
+        hardware={
+            "accelerator": "cuda",
+            "device_name": "RTX 3060",
+            "vram_gb": 8.0,
+            "ram_gb": 32.0,
+            "free_disk_gb": 100.0,
+            "device_class": "low-gpu",
+        },
+        dataset={"n_samples": 500, "n_classes": 10, "avg_tokens": 30, "source": "placeholder"},
+        resource=ResourceEstimate(
+            disk_download_gb=2.5,
+            disk_cached_gb=0.5,
+            ram_gb=1.0,
+            vram_gb=4.0,
+            time_hours=1.2,
+            drivers=[
+                {
+                    "node_type": "scoring",
+                    "module": "bert",
+                    "model": "x/y",
+                    "mode": "full-finetune",
+                    "vram_gb": 4.0,
+                    "ram_gb": 1.0,
+                    "time_hours": 1.2,
+                    "confidence": "hub",
+                }
+            ],
+        ),
+        notes=["MPS unified memory note"],
+    )
+    r.add("resource", Severity.YELLOW, "VRAM ~6 GB vs available 8 GB")
+    r.add("data", Severity.RED, "rare classes blocked")
+    return r
+
+
+class TestRenderText:
+    def test_contains_phase_blocks(self) -> None:
+        out = render_text(_populated_report())
+        assert "Resource:" in out
+        assert "Data:" in out
+        # Config phase has no findings → block omitted
+        assert "Config:" not in out
+
+    def test_includes_drivers_block(self) -> None:
+        out = render_text(_populated_report())
+        assert "Drivers of cost:" in out
+        assert "x/y" in out
+
+    def test_verdict_reflects_worst_severity(self) -> None:
+        out = render_text(_populated_report())
+        assert "Verdict: INFEASIBLE" in out
+        assert "worst severity: red" in out
+
+    def test_disclaimer_always_present(self) -> None:
+        out = render_text(_populated_report())
+        assert "heuristic upper bounds" in out
+
+    def test_low_confidence_tag_when_offline(self) -> None:
+        r = _populated_report()
+        r.low_confidence = True
+        out = render_text(r)
+        assert "low-confidence" in out
+
+    def test_preset_name_in_title(self) -> None:
+        out = render_text(_populated_report())
+        assert "Compute feasibility check — example" in out
+
+    def test_empty_report_still_renders(self) -> None:
+        out = render_text(PreflightReport())
+        assert "Compute feasibility check" in out
+        assert "Verdict: feasible" in out
+
+
+class TestRenderJson:
+    def test_is_valid_json(self) -> None:
+        json.loads(render_json(_populated_report()))
+
+    def test_findings_have_string_severity(self) -> None:
+        d = json.loads(render_json(_populated_report()))
+        for f in d["findings"]:
+            assert f["severity"] in {"green", "yellow", "red"}
+
+    def test_worst_severity_and_feasibility_serialized(self) -> None:
+        d = json.loads(render_json(_populated_report()))
+        assert d["worst_severity"] == "red"
+        assert d["is_feasible"] is False
+
+    def test_empty_report_serializes(self) -> None:
+        d = json.loads(render_json(PreflightReport()))
+        assert d["worst_severity"] == "green"
+        assert d["is_feasible"] is True
+
+
+class TestRenderRecommendation:
+    def _two_reports(self) -> list[tuple[str, PreflightReport]]:
+        a = PreflightReport(preset_name="a", resource=ResourceEstimate(vram_gb=2.0, time_hours=0.5))
+        a.add("resource", Severity.GREEN, "ok")
+        b = PreflightReport(preset_name="b", resource=ResourceEstimate(vram_gb=8.0, time_hours=4.0))
+        b.add("resource", Severity.RED, "too big")
+        return [("a", a), ("b", b)]
+
+    def test_lists_chosen_preset_when_present(self) -> None:
+        out = render_recommendation(self._two_reports(), chosen="a")
+        assert "→ a" in out
+
+    def test_handles_no_chosen(self) -> None:
+        out = render_recommendation(self._two_reports(), chosen=None)
+        assert "none of the bundled presets" in out
+
+    def test_includes_all_presets_in_table(self) -> None:
+        out = render_recommendation(self._two_reports(), chosen="a")
+        assert "a " in out  # preset name
+        assert "b " in out
+
+    def test_shows_status_per_preset(self) -> None:
+        out = render_recommendation(self._two_reports(), chosen="a")
+        assert "feasible" in out
+        assert "infeasible" in out
+
+
+def test_dataset_stats_in_text_block() -> None:
+    stats = DatasetStats.placeholder(n_samples=777, n_classes=4)
+    r = PreflightReport(dataset={
+        "n_samples": stats.n_samples,
+        "n_classes": stats.n_classes,
+        "avg_tokens": stats.avg_tokens,
+        "source": stats.source,
+    })
+    out = render_text(r)
+    assert "777" in out
+    assert "n_classes=4" in out
diff --git a/tests/advisor/test_report.py b/tests/advisor/test_report.py
new file mode 100644
index 000000000..52f2e675e
--- /dev/null
+++ b/tests/advisor/test_report.py
@@ -0,0 +1,85 @@
+"""Unit tests for the report dataclasses."""
+
+from __future__ import annotations
+
+import pytest
+
+from autointent._advisor._report import (
+    DatasetStats,
+    Finding,
+    PreflightReport,
+    ResourceEstimate,
+    Severity,
+)
+
+
+class TestSeverityOrdering:
+    def test_worst_severity_on_empty_report_is_green(self) -> None:
+        assert PreflightReport().worst_severity == Severity.GREEN
+
+    def test_red_beats_yellow_beats_green(self) -> None:
+        r = PreflightReport()
+        r.add("resource", Severity.GREEN, "ok")
+        r.add("data", Severity.YELLOW, "warn")
+        assert r.worst_severity == Severity.YELLOW
+        r.add("config", Severity.RED, "fail")
+        assert r.worst_severity == Severity.RED
+
+    def test_is_feasible_flips_on_any_red(self) -> None:
+        r = PreflightReport()
+        r.add("resource", Severity.YELLOW, "warn")
+        assert r.is_feasible is True
+        r.add("data", Severity.RED, "fail")
+        assert r.is_feasible is False
+
+
+class TestDatasetStatsPlaceholder:
+    def test_defaults_populate_p95_above_avg(self) -> None:
+        stats = DatasetStats.placeholder()
+        assert stats.n_samples == 1_000
+        assert stats.p95_tokens is not None
+        assert stats.p95_tokens > stats.avg_tokens
+        assert stats.source == "placeholder"
+
+    def test_overrides_propagate(self) -> None:
+        stats = DatasetStats.placeholder(n_samples=42, n_classes=3, avg_tokens=80, multilabel=True)
+        assert stats.n_samples == 42
+        assert stats.n_classes == 3
+        assert stats.avg_tokens == 80
+        assert stats.multilabel is True
+
+
+class TestResourceEstimate:
+    def test_total_disk_sums_download_and_dump(self) -> None:
+        e = ResourceEstimate(disk_download_gb=2.5, disk_dump_gb=4.0)
+        assert e.total_disk_gb == pytest.approx(6.5)
+
+    def test_total_disk_ignores_cached(self) -> None:
+        e = ResourceEstimate(disk_download_gb=1.0, disk_cached_gb=100.0, disk_dump_gb=0.5)
+        assert e.total_disk_gb == pytest.approx(1.5)
+
+
+class TestToDictSerialization:
+    def test_findings_round_trip_severity_as_string(self) -> None:
+        r = PreflightReport()
+        r.add("resource", Severity.RED, "boom")
+        d = r.to_dict()
+        assert d["worst_severity"] == "red"
+        assert d["is_feasible"] is False
+        assert d["findings"] == [
+            {"phase": "resource", "severity": "red", "message": "boom", "metric": None},
+        ]
+
+    def test_hardware_and_dataset_pass_through(self) -> None:
+        r = PreflightReport(
+            hardware={"accelerator": "cuda", "vram_gb": 8.0},
+            dataset={"n_samples": 100, "n_classes": 5},
+        )
+        d = r.to_dict()
+        assert d["hardware"]["accelerator"] == "cuda"
+        assert d["dataset"]["n_samples"] == 100
+
+    def test_finding_is_frozen(self) -> None:
+        f = Finding(phase="resource", severity=Severity.GREEN, message="ok")
+        with pytest.raises(Exception):  # noqa: PT011 - dataclass.FrozenInstanceError varies
+            f.message = "changed"  # type: ignore[misc]

From c8675b9a69cf5c1492380a1bdac2c8a3885281b2 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Mon, 15 Jun 2026 23:58:59 +0300
Subject: [PATCH 04/16] fix

---
 src/autointent/_advisor/__init__.py           |   2 +-
 src/autointent/_advisor/_cli.py               |  40 +--
 src/autointent/_advisor/_estimates.py         | 217 ++++++++++++++--
 src/autointent/_advisor/_hardware.py          |  59 ++---
 src/autointent/_advisor/_hub.py               |  20 +-
 src/autointent/_advisor/_render.py            |   6 +-
 src/autointent/_advisor/_report.py            |   6 +-
 tests/advisor/test_estimates_and_cli.py       |  43 +++-
 tests/advisor/test_estimates_internals.py     | 242 +++++++++++++++---
 tests/advisor/test_hub_heuristics.py          |   4 +-
 tests/advisor/test_render.py                  |  16 +-
 .../advanced/02_embedder_configuration.py     |   4 +-
 12 files changed, 495 insertions(+), 164 deletions(-)

diff --git a/src/autointent/_advisor/__init__.py b/src/autointent/_advisor/__init__.py
index 5f29b028e..3ff898816 100644
--- a/src/autointent/_advisor/__init__.py
+++ b/src/autointent/_advisor/__init__.py
@@ -7,9 +7,9 @@
 
 from __future__ import annotations
 
+from ._estimates import run_preflight
 from ._hardware import HardwareProfile, detect_hardware
 from ._report import DatasetStats, Finding, PreflightReport, ResourceEstimate, Severity
-from ._estimates import run_preflight
 
 __all__ = [
     "DatasetStats",
diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py
index 4e7eae000..b3f43aab5 100644
--- a/src/autointent/_advisor/_cli.py
+++ b/src/autointent/_advisor/_cli.py
@@ -20,10 +20,13 @@
 
 import yaml
 
+from autointent import Dataset
+from autointent.utils import load_preset
+
 from ._estimates import run_preflight
 from ._hardware import detect_hardware
 from ._render import render_json, render_recommendation, render_text
-from ._report import DatasetStats, PreflightReport
+from ._report import DatasetStats, PreflightReport, Severity
 
 logger = logging.getLogger("autointent.advisor")
 
@@ -62,8 +65,6 @@ def _load_config(target: str) -> tuple[dict[str, Any], str]:
         with path.open(encoding="utf-8") as f:
             return yaml.safe_load(f), path.stem
     # treat as a bundled preset name
-    from autointent.utils import load_preset
-
     return load_preset(target), target  # type: ignore[arg-type]
 
 
@@ -80,15 +81,9 @@ def _stats_from_args(args: argparse.Namespace) -> DatasetStats:
 
 def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats:
     """Best-effort: load a dataset from disk via the existing Dataset constructor."""
-    try:
-        from autointent import Dataset
-    except ImportError:
-        logger.warning("autointent.Dataset unavailable; falling back to placeholders.")
-        return DatasetStats.placeholder(multilabel=multilabel)
-
     try:
         ds = Dataset.from_json(path) if path.endswith(".json") else Dataset.from_hub(path)
-    except Exception as e:  # noqa: BLE001
+    except (OSError, ValueError) as e:
         logger.warning("Failed to load dataset %s: %s", path, e)
         return DatasetStats.placeholder(multilabel=multilabel)
 
@@ -147,27 +142,24 @@ def cmd_recommend(args: argparse.Namespace) -> int:
     stats = _stats_from_args(args)
 
     results: list[tuple[str, PreflightReport]] = []
-    from autointent.utils import load_preset
 
     for preset in BUNDLED_PRESETS:
         try:
             cfg = load_preset(preset)  # type: ignore[arg-type]
-        except Exception as e:  # noqa: BLE001
+        except (OSError, ValueError, KeyError) as e:
             logger.debug("Skipping preset %s: %s", preset, e)
             continue
         report = run_preflight(cfg, stats, hardware, preset_name=preset)
         if args.budget_time_h is not None and report.resource.time_hours > args.budget_time_h:
             report.add(
                 "resource",
-                report.worst_severity if report.worst_severity.value == "red" else report.worst_severity,  # noqa: PLW0125 - explicit
+                Severity.RED,
                 f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {args.budget_time_h} h.",
             )
         results.append((preset, report))
 
     feasible = [(name, r) for name, r in results if r.is_feasible]
-    feasible.sort(
-        key=lambda pair: (-_QUALITY_TIER.get(pair[0], 0), pair[1].resource.time_hours, pair[0])
-    )
+    feasible.sort(key=lambda pair: (-_QUALITY_TIER.get(pair[0], 0), pair[1].resource.time_hours, pair[0]))
     chosen = feasible[0][0] if feasible else None
 
     if args.json:
@@ -175,9 +167,7 @@ def cmd_recommend(args: argparse.Namespace) -> int:
 
         out = {
             "chosen": chosen,
-            "results": [
-                {"preset": name, "report": r.to_dict()} for name, r in results
-            ],
+            "results": [{"preset": name, "report": r.to_dict()} for name, r in results],
         }
         sys.stdout.write(json.dumps(out, indent=2, default=str))
         sys.stdout.write("\n")
@@ -206,9 +196,7 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_inspect.add_argument("target", help="Preset name (e.g. transformers-light) or path to a YAML config.")
     p_inspect.add_argument("--json", action="store_true", help="Emit a structured JSON report.")
-    p_inspect.add_argument(
-        "--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget."
-    )
+    p_inspect.add_argument("--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget.")
     _add_common_dataset_args(p_inspect)
     p_inspect.set_defaults(func=cmd_inspect)
 
@@ -217,12 +205,8 @@ def build_parser() -> argparse.ArgumentParser:
         help="Detect hardware and recommend the best-fitting bundled preset.",
     )
     p_rec.add_argument("--json", action="store_true", help="Emit a structured JSON report.")
-    p_rec.add_argument(
-        "--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget."
-    )
-    p_rec.add_argument(
-        "--budget-time-h", type=float, default=None, help="Optional wall-time ceiling in hours."
-    )
+    p_rec.add_argument("--budget-vram-gb", type=float, default=None, help="Override detected VRAM budget.")
+    p_rec.add_argument("--budget-time-h", type=float, default=None, help="Optional wall-time ceiling in hours.")
     _add_common_dataset_args(p_rec)
     p_rec.set_defaults(func=cmd_recommend)
 
diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py
index f60f940a6..e8f619303 100644
--- a/src/autointent/_advisor/_estimates.py
+++ b/src/autointent/_advisor/_estimates.py
@@ -9,7 +9,8 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Iterable
+from collections.abc import Iterable
+from typing import Any
 
 from ._hardware import HardwareProfile
 from ._hub import ModelMeta, hub_reachable, resolve_model
@@ -32,6 +33,16 @@
 
 TRANSFORMER_SCORER_MODULES = {"bert", "lora", "ptuning", "dnnc"}
 
+# Coefficients for the linear / catboost time formulas (proposal §"Algorithm").
+_LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8
+_CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9
+_CATBOOST_GPU_SPEEDUP = 10.0
+# LogisticRegressionCV defaults: Cs=10, cv=3 → 31 inner fits + 1 final refit.
+_LOGREG_CV_MULTIPLIER = 31
+_CATBOOST_DEFAULT_BINS = 254
+# Bytes per histogram bucket / tree node — order-of-magnitude constants.
+_CATBOOST_BYTES_PER_TREE_NODE = 32
+
 
 def _extract_model_names(module_entry: dict[str, Any]) -> list[str]:
     """Pull model name(s) from a search-space module entry."""
@@ -74,11 +85,22 @@ def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dic
             yield node_type, entry
 
 
+def _walk_modules_indexed(
+    search_space: list[dict[str, Any]],
+) -> Iterable[tuple[int, str, dict[str, Any]]]:
+    """Yield (node_index, node_type, module_entry) — index lets us bound per-node max cost."""
+    for node_idx, node in enumerate(search_space or []):
+        node_type = node.get("node_type", "?")
+        for entry in node.get("search_space", []) or []:
+            yield node_idx, node_type, entry
+
+
 def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) -> float:
     """VRAM in GB for one trial of a transformer-based module.
 
-    Conservative AMP accounting (the proposal flags the prior naive halving
-    as too generous; keep optimizer state at fp32 even in AMP).
+    Full fine-tune fp32: weights + grads + Adam (m, v) = 4W.
+    Full fine-tune AMP: fp16 weights + fp16 grads + fp32 master copy + fp32 Adam = 3W.
+    (Activations are not modeled separately.)
     """
     weights_gb = meta.weights_gb
     if mode == "inference":
@@ -87,11 +109,9 @@ def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) ->
         return weights_gb * 1.3 + 0.5
     if mode == "reranker":
         return weights_gb * 1.5
-    # full fine-tune (bert, ptuning, gcn-with-backbone)
     if mixed_precision:
-        # fp16 weights+grads + fp32 master+adam moments
-        return (weights_gb * 0.5) * 2 + weights_gb * 1 + weights_gb * 2
-    return weights_gb * (1 + 1 + 2)
+        return weights_gb * 3.0
+    return weights_gb * 4.0
 
 
 def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float:
@@ -99,6 +119,82 @@ def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float:
     return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / (1024**3)
 
 
+def _embedder_dim(meta: ModelMeta | None) -> int:
+    """Coarse hidden-size guess from parameter count.
+
+    Concrete points: MiniLM (33M) ~384, BERT-base (110M) ~768, BERT-large (350M) ~1024.
+    """
+    if meta is None:
+        return 768
+    params = meta.params_millions
+    if params >= 300:
+        return 1024
+    if params >= 100:
+        return 768
+    if params >= 50:
+        return 512
+    return 384
+
+
+def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None:
+    if not seen_models:
+        return None
+    return max(seen_models.values(), key=lambda m: m.params_millions)
+
+
+def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float:
+    """Float64 design matrix dominates; coefficients and L-BFGS history are small."""
+    data_bytes = 8.0 * stats.n_samples * embedder_dim
+    coef_bytes = 8.0 * max(1, stats.n_classes) * embedder_dim
+    lbfgs_bytes = 10.0 * 8.0 * embedder_dim
+    return (data_bytes + coef_bytes + lbfgs_bytes) / (1024**3)
+
+
+def _time_for_linear(
+    *,
+    n_trials: int,
+    n_samples: int,
+    embedder_dim: int,
+    max_iter: int,
+    cv_multiplier: int,
+    class_multiplier: int,
+) -> float:
+    seconds = (
+        n_trials
+        * _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER
+        * n_samples
+        * embedder_dim
+        * max_iter
+        * cv_multiplier
+        * class_multiplier
+    )
+    return seconds / 3600.0
+
+
+def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int, depth: int) -> float:
+    data_bytes = 4.0 * stats.n_samples * n_features
+    histograms_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS
+    trees_bytes = iterations * (2**depth) * _CATBOOST_BYTES_PER_TREE_NODE
+    return (data_bytes + histograms_bytes + trees_bytes) / (1024**3)
+
+
+def _time_for_catboost(
+    *,
+    n_trials: int,
+    n_samples: int,
+    n_features: int,
+    iterations: int,
+    depth: int,
+    class_multiplier: int,
+    on_gpu: bool,
+) -> float:
+    coeff = _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER
+    if on_gpu:
+        coeff /= _CATBOOST_GPU_SPEEDUP
+    seconds = n_trials * iterations * coeff * n_samples * n_features * depth * class_multiplier
+    return seconds / 3600.0
+
+
 def _time_for_transformer(
     *,
     meta: ModelMeta,
@@ -148,10 +244,24 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
     if global_embedder:
         seen_models[global_embedder] = resolve_model(global_embedder)
 
-    for node_type, entry in _walk_modules(config.get("search_space") or []):
+    # First pass: walk transformer-bearing modules (collects seen_models for embedder_dim lookup).
+    transformer_entries: list[tuple[int, str, dict[str, Any]]] = []
+    classic_entries: list[tuple[int, str, dict[str, Any]]] = []
+    for node_idx, node_type, entry in _walk_modules_indexed(config.get("search_space") or []):
+        module = entry.get("module_name", "?")
+        if module in {"linear", "catboost"}:
+            classic_entries.append((node_idx, node_type, entry))
+        else:
+            transformer_entries.append((node_idx, node_type, entry))
+
+    # Track the heaviest module per node so dump_modules accounting is bounded by
+    # "one selected variant per node × n_trials", not "sum of every candidate".
+    node_max_weights: dict[int, float] = {}
+
+    for node_idx, node_type, entry in transformer_entries:
         module = entry.get("module_name", "?")
         model_names = _extract_model_names(entry)
-        if not model_names and global_embedder and module in {"linear", "catboost", "knn", "mlknn"}:
+        if not model_names and global_embedder and module in {"knn", "mlknn"}:
             model_names = [global_embedder]
 
         for name in model_names:
@@ -191,6 +301,7 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
             estimate.vram_gb = max(estimate.vram_gb, vram)
             estimate.ram_gb = max(estimate.ram_gb, ram)
             estimate.time_hours += time_h
+            node_max_weights[node_idx] = max(node_max_weights.get(node_idx, 0.0), meta.weights_gb)
             estimate.drivers.append(
                 {
                     "node_type": node_type,
@@ -204,6 +315,76 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
                 }
             )
 
+    # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint.
+    embedder_meta = _largest_embedder(seen_models)
+    embedder_dim = _embedder_dim(embedder_meta)
+    class_multiplier_classic = max(1, stats.n_classes) if stats.multilabel else 1
+    for _node_idx, node_type, entry in classic_entries:
+        module = entry.get("module_name", "?")
+        if module == "linear":
+            max_iter = _max_int(entry.get("max_iter"), 100)
+            cv_multiplier = 1 if stats.multilabel else _LOGREG_CV_MULTIPLIER
+            ram = _ram_for_linear(stats=stats, embedder_dim=embedder_dim)
+            time_h = _time_for_linear(
+                n_trials=n_trials,
+                n_samples=stats.n_samples,
+                embedder_dim=embedder_dim,
+                max_iter=max_iter,
+                cv_multiplier=cv_multiplier,
+                class_multiplier=class_multiplier_classic,
+            )
+            if refit_after:
+                time_h *= 1 + 1.0 / max(1, n_trials)
+            vram = 0.0
+            mode = "linear-cv" if cv_multiplier > 1 else "linear"
+            confidence = embedder_meta.confidence if embedder_meta else "heuristic"
+        elif module == "catboost":
+            iterations = _max_int(entry.get("iterations"), 1000)
+            depth = _max_int(entry.get("depth"), 6)
+            on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda"
+            # CatBoost's multiclass MultiClass loss already grows per-class trees.
+            cb_class_mult = max(1, stats.n_classes)
+            ram = _ram_for_catboost(
+                stats=stats,
+                n_features=embedder_dim,
+                iterations=iterations,
+                depth=depth,
+            )
+            time_h = _time_for_catboost(
+                n_trials=n_trials,
+                n_samples=stats.n_samples,
+                n_features=embedder_dim,
+                iterations=iterations,
+                depth=depth,
+                class_multiplier=cb_class_mult,
+                on_gpu=on_gpu,
+            )
+            if refit_after:
+                time_h *= 1 + 1.0 / max(1, n_trials)
+            vram = ram if on_gpu else 0.0
+            if on_gpu:
+                ram = 0.0
+            mode = "catboost-gpu" if on_gpu else "catboost"
+            confidence = embedder_meta.confidence if embedder_meta else "heuristic"
+        else:
+            continue
+
+        estimate.vram_gb = max(estimate.vram_gb, vram)
+        estimate.ram_gb = max(estimate.ram_gb, ram)
+        estimate.time_hours += time_h
+        estimate.drivers.append(
+            {
+                "node_type": node_type,
+                "module": module,
+                "model": embedder_meta.name if embedder_meta else "(no embedder)",
+                "mode": mode,
+                "vram_gb": round(vram, 2),
+                "ram_gb": round(ram, 2),
+                "time_hours": round(time_h, 2),
+                "confidence": confidence,
+            }
+        )
+
     for meta in seen_models.values():
         if meta.cached_locally:
             estimate.disk_cached_gb += meta.disk_gb
@@ -211,8 +392,10 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
             estimate.disk_download_gb += meta.disk_gb
 
     if dump_modules:
-        weights_total = sum(m.weights_gb for m in seen_models.values())
-        estimate.disk_dump_gb = weights_total * n_trials
+        # Each trial selects one variant per node, so per-trial dumped weights
+        # are bounded by the heaviest module in each node, summed across nodes.
+        per_trial_dump_gb = sum(node_max_weights.values())
+        estimate.disk_dump_gb = per_trial_dump_gb * n_trials
 
     if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}:
         effective_vram = estimate.vram_gb * n_jobs
@@ -309,23 +492,17 @@ def _data_phase(
             )
 
     # rare class × linear-CV
-    has_linear = any(
-        e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or [])
-    )
+    has_linear = any(e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or []))
     if has_linear and stats.rare_classes:
         report.add(
             "data",
             Severity.RED,
-            (
-                "LogisticRegressionCV (cv=3) will fail: classes "
-                f"{stats.rare_classes[:5]} have <3 samples."
-            ),
+            (f"LogisticRegressionCV (cv=3) will fail: classes {stats.rare_classes[:5]} have <3 samples."),
         )
 
     # partial descriptions × description scorer
     has_description = any(
-        e.get("module_name") == "description"
-        for _, e in _walk_modules(config.get("search_space") or [])
+        e.get("module_name") == "description" for _, e in _walk_modules(config.get("search_space") or [])
     )
     if has_description and stats.has_descriptions is False:
         report.add(
diff --git a/src/autointent/_advisor/_hardware.py b/src/autointent/_advisor/_hardware.py
index 2bda6120f..9c0cae049 100644
--- a/src/autointent/_advisor/_hardware.py
+++ b/src/autointent/_advisor/_hardware.py
@@ -14,6 +14,9 @@
 from dataclasses import dataclass, field
 from typing import Literal
 
+import psutil
+import torch
+
 logger = logging.getLogger(__name__)
 
 Accelerator = Literal["cuda", "mps", "cpu"]
@@ -46,13 +49,7 @@ def device_class(self) -> str:
 
 
 def _detect_ram_gb() -> float:
-    try:
-        import psutil
-
-        return psutil.virtual_memory().total / (1024**3)
-    except ImportError:
-        logger.debug("psutil unavailable; RAM unknown")
-        return 0.0
+    return psutil.virtual_memory().total / (1024**3)
 
 
 def _detect_free_disk_gb(path: str | None = None) -> float:
@@ -67,40 +64,24 @@ def _detect_free_disk_gb(path: str | None = None) -> float:
 
 
 def _detect_cuda() -> tuple[float, str] | None:
-    try:
-        import torch
-
-        if not torch.cuda.is_available():
-            return None
-        idx = 0
-        try:
-            free, total = torch.cuda.mem_get_info(idx)
-            vram_gb = total / (1024**3)
-        except (RuntimeError, AttributeError) as e:
-            logger.debug("torch.cuda.mem_get_info failed: %s", e)
-            return None
-        name = torch.cuda.get_device_name(idx)
-        return vram_gb, name
-    except ImportError:
+    if not torch.cuda.is_available():
         return None
-    except Exception as e:  # noqa: BLE001 - protect the advisor from torch quirks
-        logger.debug("CUDA detection raised: %s", e)
+    idx = 0
+    try:
+        _free, total = torch.cuda.mem_get_info(idx)
+        vram_gb = total / (1024**3)
+    except (RuntimeError, AttributeError) as e:
+        logger.debug("torch.cuda.mem_get_info failed: %s", e)
         return None
+    name = torch.cuda.get_device_name(idx)
+    return vram_gb, name
 
 
 def _detect_mps(ram_gb: float, budget_ratio: float = MPS_DEFAULT_BUDGET_RATIO) -> tuple[float, str] | None:
-    try:
-        import torch
-
-        if not (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()):
-            return None
-        # apple silicon: unified memory; budget is fraction of total RAM
-        return ram_gb * budget_ratio, f"Apple Silicon ({platform.machine()})"
-    except ImportError:
-        return None
-    except Exception as e:  # noqa: BLE001
-        logger.debug("MPS detection raised: %s", e)
+    if not (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()):
         return None
+    # apple silicon: unified memory; budget is fraction of total RAM
+    return ram_gb * budget_ratio, f"Apple Silicon ({platform.machine()})"
 
 
 def detect_hardware(
@@ -133,9 +114,7 @@ def detect_hardware(
         if mps is not None:
             vram_gb, device_name = mps
             accel = "mps"
-            notes.append(
-                f"MPS unified memory: VRAM budget = {mps_budget_ratio:.0%} of RAM."
-            )
+            notes.append(f"MPS unified memory: VRAM budget = {mps_budget_ratio:.0%} of RAM.")
         else:
             vram_gb = 0.0
             device_name = platform.processor() or "cpu"
@@ -143,9 +122,7 @@ def detect_hardware(
 
     if vram_budget_gb is not None:
         if vram_gb and vram_budget_gb > vram_gb:
-            notes.append(
-                f"Manual --budget-vram-gb={vram_budget_gb} exceeds detected {vram_gb:.1f} GB; using override."
-            )
+            notes.append(f"Manual --budget-vram-gb={vram_budget_gb} exceeds detected {vram_gb:.1f} GB; using override.")
         notes.append(f"Using manual VRAM budget: {vram_budget_gb} GB.")
         vram_gb = vram_budget_gb
 
diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py
index 80ccb7133..613ab6b40 100644
--- a/src/autointent/_advisor/_hub.py
+++ b/src/autointent/_advisor/_hub.py
@@ -14,6 +14,8 @@
 from functools import lru_cache
 from typing import Any
 
+from huggingface_hub import HfApi, scan_cache_dir, try_to_load_from_cache
+
 logger = logging.getLogger(__name__)
 
 # Coarse heuristic estimates keyed on name fragments. Used only when HF Hub
@@ -55,17 +57,11 @@ def weights_gb(self) -> float:
 def hub_reachable(timeout_s: float = 2.0) -> bool:
     """Single up-front probe. Memoized per process."""
     try:
-        from huggingface_hub import HfApi
-
         HfApi().list_models(limit=1)
-    except ImportError:
-        logger.debug("huggingface_hub not installed; assuming offline")
-        return False
     except Exception as e:  # noqa: BLE001
         logger.debug("HF Hub probe failed: %s", e)
         return False
-    else:
-        return True
+    return True
 
 
 def _heuristic_params_millions(model_name: str) -> float:
@@ -77,11 +73,6 @@ def _heuristic_params_millions(model_name: str) -> float:
 
 def _is_warm_cached(model_name: str) -> bool:
     """True when the weight shard is present in the local HF cache."""
-    try:
-        from huggingface_hub import scan_cache_dir, try_to_load_from_cache
-    except ImportError:
-        return False
-
     weight_files = ["model.safetensors", "pytorch_model.bin", "model.safetensors.index.json"]
     for fname in weight_files:
         path = try_to_load_from_cache(model_name, fname)
@@ -98,11 +89,6 @@ def _is_warm_cached(model_name: str) -> bool:
 
 
 def _hub_metadata(model_name: str) -> ModelMeta | None:
-    try:
-        from huggingface_hub import HfApi
-    except ImportError:
-        return None
-
     try:
         info = HfApi().model_info(model_name, files_metadata=True)
     except Exception as e:  # noqa: BLE001
diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py
index 52168aa75..fe0f32dd7 100644
--- a/src/autointent/_advisor/_render.py
+++ b/src/autointent/_advisor/_render.py
@@ -18,7 +18,7 @@
 _PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"}
 
 
-def render_text(report: "PreflightReport") -> str:
+def render_text(report: PreflightReport) -> str:
     lines: list[str] = []
     title = "Compute feasibility check"
     if report.preset_name:
@@ -76,12 +76,12 @@ def render_text(report: "PreflightReport") -> str:
     return "\n".join(lines)
 
 
-def render_json(report: "PreflightReport") -> str:
+def render_json(report: PreflightReport) -> str:
     return json.dumps(report.to_dict(), indent=2, default=str)
 
 
 def render_recommendation(
-    results: list[tuple[str, "PreflightReport"]],
+    results: list[tuple[str, PreflightReport]],
     chosen: str | None,
 ) -> str:
     """Compact table for the ``recommend`` subcommand."""
diff --git a/src/autointent/_advisor/_report.py b/src/autointent/_advisor/_report.py
index 0250482a5..6b930db95 100644
--- a/src/autointent/_advisor/_report.py
+++ b/src/autointent/_advisor/_report.py
@@ -67,7 +67,7 @@ def placeholder(
         n_classes: int = 10,
         avg_tokens: int = 32,
         multilabel: bool = False,
-    ) -> "DatasetStats":
+    ) -> DatasetStats:
         return cls(
             n_samples=n_samples,
             n_classes=n_classes,
@@ -105,9 +105,7 @@ def is_feasible(self) -> bool:
 
     def to_dict(self) -> dict[str, Any]:
         d = asdict(self)
-        d["findings"] = [
-            {**asdict(f), "severity": f.severity.value} for f in self.findings
-        ]
+        d["findings"] = [{**asdict(f), "severity": f.severity.value} for f in self.findings]
         d["worst_severity"] = self.worst_severity.value
         d["is_feasible"] = self.is_feasible
         return d
diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py
index 18c2615a6..00537226d 100644
--- a/tests/advisor/test_estimates_and_cli.py
+++ b/tests/advisor/test_estimates_and_cli.py
@@ -162,9 +162,7 @@ def test_partial_descriptions_with_description_scorer_flags_red() -> None:
         has_descriptions=False,
     )
     report = run_preflight(cfg, stats, _profile(vram_gb=16.0))
-    assert any(
-        f.phase == "data" and "description" in f.message.lower() for f in report.findings
-    )
+    assert any(f.phase == "data" and "description" in f.message.lower() for f in report.findings)
 
 
 def test_long_dataset_triggers_truncation_warning() -> None:
@@ -175,9 +173,7 @@ def test_long_dataset_triggers_truncation_warning() -> None:
                 "search_space": [
                     {
                         "module_name": "bert",
-                        "classification_model_config": [
-                            {"model_name": "microsoft/deberta-v3-small"}
-                        ],
+                        "classification_model_config": [{"model_name": "microsoft/deberta-v3-small"}],
                         "max_length": [128],
                     }
                 ],
@@ -194,5 +190,40 @@ def test_long_dataset_triggers_truncation_warning() -> None:
     assert any("truncation" in f.message.lower() for f in report.findings)
 
 
+def test_cli_recommend_budget_time_flags_red_for_overbudget_presets(
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """Tight time budget must flag every preset that exceeds it with RED severity.
+
+    Previously the budget path used a tautological severity expression and the
+    breach never escalated the finding — covers the regression."""
+    main(
+        [
+            "recommend",
+            "--n-samples",
+            "1000",
+            "--n-classes",
+            "10",
+            "--avg-tokens",
+            "20",
+            "--budget-vram-gb",
+            "48",
+            "--budget-time-h",
+            "0.0001",
+            "--json",
+        ]
+    )
+    payload = json.loads(capsys.readouterr().out)
+    flagged = [
+        r
+        for r in payload["results"]
+        if any(f["severity"] == "red" and "exceeds budget" in f["message"] for f in r["report"]["findings"])
+    ]
+    assert flagged, "budget-time-h breach should produce RED severity findings"
+    # Any preset above the budget must be marked infeasible.
+    for r in flagged:
+        assert r["report"]["is_feasible"] is False
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__, "-v"]))
diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py
index 0317ff27b..713db27f7 100644
--- a/tests/advisor/test_estimates_internals.py
+++ b/tests/advisor/test_estimates_internals.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+from typing import Any
+
 import pytest
 
 from autointent._advisor import _estimates, _hub
@@ -109,22 +111,19 @@ def meta(self) -> ModelMeta:
             confidence="hub",
         )
 
-    def test_full_finetune_is_larger_than_lora_is_larger_than_inference(
-        self, meta: ModelMeta
-    ) -> None:
+    def test_full_finetune_is_larger_than_lora_is_larger_than_inference(self, meta: ModelMeta) -> None:
         inference = _vram_for_transformer(meta, "inference", mixed_precision=False)
         lora = _vram_for_transformer(meta, "lora", mixed_precision=False)
         full = _vram_for_transformer(meta, "full-finetune", mixed_precision=False)
         assert inference < lora < full
 
-    def test_amp_does_not_naively_halve(self, meta: ModelMeta) -> None:
-        """The proposal calls out that AMP doesn't halve total VRAM — fp32 master
-        weights and Adam moments don't shrink. Weight-side accounting comes out
-        equal to fp32; the only savings (activations) aren't modeled by us."""
+    def test_amp_partially_reduces_full_finetune_vram(self, meta: ModelMeta) -> None:
+        """AMP saves on fp16 weights+grads (W down from 2W); Adam state stays
+        fp32 (2W). Total 3W vs fp32's 4W — real but not a full halving."""
         full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False)
         full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True)
-        assert full_amp / full_fp32 == pytest.approx(1.0)
-        assert full_amp / full_fp32 > 0.5  # explicit check vs the naive-halving formula
+        assert full_amp < full_fp32
+        assert full_amp / full_fp32 == pytest.approx(0.75)
 
     def test_reranker_uses_inference_class(self, meta: ModelMeta) -> None:
         inference = _vram_for_transformer(meta, "inference", mixed_precision=False)
@@ -155,9 +154,7 @@ def test_dump_modules_adds_disk_during_training(self) -> None:
                     "search_space": [
                         {
                             "module_name": "bert",
-                            "classification_model_config": [
-                                {"model_name": "microsoft/deberta-v3-small"}
-                            ],
+                            "classification_model_config": [{"model_name": "microsoft/deberta-v3-small"}],
                             "num_train_epochs": [3],
                             "batch_size": [16],
                         }
@@ -179,9 +176,7 @@ def test_refit_after_increases_time(self) -> None:
                     "search_space": [
                         {
                             "module_name": "bert",
-                            "classification_model_config": [
-                                {"model_name": "microsoft/deberta-v3-small"}
-                            ],
+                            "classification_model_config": [{"model_name": "microsoft/deberta-v3-small"}],
                             "num_train_epochs": [3],
                             "batch_size": [16],
                         }
@@ -207,9 +202,7 @@ def test_catboost_gpu_without_cuda_flags_config(self) -> None:
             ],
         }
         report = run_preflight(cfg, DatasetStats.placeholder(), _profile(accelerator="cpu"))
-        assert any(
-            f.phase == "config" and "CatBoost" in f.message for f in report.findings
-        )
+        assert any(f.phase == "config" and "CatBoost" in f.message for f in report.findings)
 
     def test_catboost_gpu_with_cuda_is_silent(self) -> None:
         cfg = {
@@ -223,9 +216,7 @@ def test_catboost_gpu_with_cuda_is_silent(self) -> None:
             ],
         }
         report = run_preflight(cfg, DatasetStats.placeholder(), _profile(accelerator="cuda"))
-        assert not any(
-            f.phase == "config" and "CatBoost" in f.message for f in report.findings
-        )
+        assert not any(f.phase == "config" and "CatBoost" in f.message for f in report.findings)
 
     def test_offline_flips_low_confidence(self) -> None:
         cfg = {
@@ -277,9 +268,7 @@ def test_truncation_red_when_p95_dominates_max_length(self) -> None:
                         {
                             "module_name": "bert",
                             "max_length": [128],
-                            "classification_model_config": [
-                                {"model_name": "some/model"}
-                            ],
+                            "classification_model_config": [{"model_name": "some/model"}],
                         }
                     ],
                 }
@@ -299,9 +288,7 @@ def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None:
                         {
                             "module_name": "bert",
                             "max_length": [128],
-                            "classification_model_config": [
-                                {"model_name": "some/model"}
-                            ],
+                            "classification_model_config": [{"model_name": "some/model"}],
                         }
                     ],
                 }
@@ -312,8 +299,203 @@ def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None:
         yellows = [
             f
             for f in report.findings
-            if f.phase == "data"
-            and f.severity == Severity.YELLOW
-            and "truncation" in f.message.lower()
+            if f.phase == "data" and f.severity == Severity.YELLOW and "truncation" in f.message.lower()
         ]
         assert yellows
+
+
+class TestLinearCatboostFormulas:
+    """Cost surfaces for the classic (sklearn / catboost) scorers."""
+
+    def _embedder_node(self) -> dict[str, Any]:
+        return {
+            "node_type": "embedder",
+            "search_space": [
+                {
+                    "module_name": "sentence_transformer",
+                    "embedder_config": [{"model_name": "sentence-transformers/all-MiniLM-L6-v2"}],
+                }
+            ],
+        }
+
+    def test_linear_contributes_ram_and_time(self) -> None:
+        cfg = {
+            "search_space": [
+                self._embedder_node(),
+                {
+                    "node_type": "scoring",
+                    "search_space": [{"module_name": "linear", "max_iter": [200]}],
+                },
+            ],
+            "hpo_config": {"n_trials": 5},
+        }
+        stats = DatasetStats.placeholder(n_samples=100_000, n_classes=10, avg_tokens=24)
+        report = run_preflight(cfg, stats, _profile())
+        linear_drivers = [d for d in report.resource.drivers if d["module"] == "linear"]
+        assert len(linear_drivers) == 1
+        assert report.resource.ram_gb > 0
+        assert report.resource.time_hours > 0
+        assert linear_drivers[0]["vram_gb"] == 0  # sklearn is CPU-only
+
+    def test_logreg_cv_multiplier_dominates_multiclass_time(self) -> None:
+        """Multiclass linear uses LogisticRegressionCV (Cs*cv+1 ≈ 31 inner fits);
+        multilabel uses one LogReg per class (cv_multiplier=1). At equal n_classes,
+        multiclass must be much slower than the per-class multilabel path."""
+        base = {
+            "search_space": [
+                self._embedder_node(),
+                {
+                    "node_type": "scoring",
+                    "search_space": [{"module_name": "linear", "max_iter": [1000]}],
+                },
+            ],
+            "hpo_config": {"n_trials": 1},
+        }
+        multiclass = run_preflight(
+            base,
+            DatasetStats.placeholder(n_samples=100_000, n_classes=10, multilabel=False),
+            _profile(),
+        )
+        multilabel = run_preflight(
+            base,
+            DatasetStats.placeholder(n_samples=100_000, n_classes=10, multilabel=True),
+            _profile(),
+        )
+        # multiclass: 31 inner fits x 1 model; multilabel: 1 fit x n_classes=10 models.
+        # 31 > 10 => multiclass is the slower path.
+        assert multiclass.resource.time_hours > multilabel.resource.time_hours
+
+    def test_catboost_contributes_ram_and_time_on_cpu(self) -> None:
+        cfg = {
+            "search_space": [
+                self._embedder_node(),
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "catboost",
+                            "iterations": [1000],
+                            "depth": [6],
+                        }
+                    ],
+                },
+            ],
+            "hpo_config": {"n_trials": 3},
+        }
+        stats = DatasetStats.placeholder(n_samples=100_000, n_classes=8, avg_tokens=24)
+        report = run_preflight(cfg, stats, _profile(accelerator="cpu"))
+        cb = next(d for d in report.resource.drivers if d["module"] == "catboost")
+        assert report.resource.ram_gb > 0
+        assert report.resource.time_hours > 0
+        assert cb["vram_gb"] == 0
+        assert cb["mode"] == "catboost"
+
+    def test_catboost_gpu_moves_cost_to_vram(self) -> None:
+        cfg = {
+            "search_space": [
+                self._embedder_node(),
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "catboost",
+                            "iterations": [1000],
+                            "depth": [6],
+                            "task_type": "GPU",
+                        }
+                    ],
+                },
+            ],
+            "hpo_config": {"n_trials": 2},
+        }
+        stats = DatasetStats.placeholder(n_samples=100_000, n_classes=8, avg_tokens=24)
+        report = run_preflight(cfg, stats, _profile(accelerator="cuda"))
+        cb = next(d for d in report.resource.drivers if d["module"] == "catboost")
+        assert report.resource.vram_gb > 0
+        assert cb["ram_gb"] == 0
+        assert cb["mode"] == "catboost-gpu"
+
+    def test_linear_scales_with_n_samples(self) -> None:
+        cfg = {
+            "search_space": [
+                self._embedder_node(),
+                {
+                    "node_type": "scoring",
+                    "search_space": [{"module_name": "linear"}],
+                },
+            ],
+        }
+        small = run_preflight(cfg, DatasetStats.placeholder(n_samples=500), _profile())
+        big = run_preflight(cfg, DatasetStats.placeholder(n_samples=500_000), _profile())
+        assert big.resource.time_hours > small.resource.time_hours
+        assert big.resource.ram_gb > small.resource.ram_gb
+
+
+class TestDumpModulesBounding:
+    """`dump_modules=True` writes one selected variant per node per trial — not
+    every candidate. The estimate must be bounded by sum-of-max-per-node x n_trials."""
+
+    def test_dump_disk_is_bounded_by_per_node_max_not_sum_of_all_variants(self) -> None:
+        # Two BERT candidates in the same node: only one is selected per trial.
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "classification_model_config": [
+                                {"model_name": "microsoft/deberta-v3-small"},
+                                {"model_name": "microsoft/deberta-v3-large"},
+                            ],
+                            "num_train_epochs": [3],
+                            "batch_size": [16],
+                        }
+                    ],
+                }
+            ],
+            "hpo_config": {"n_trials": 4},
+            "dump_modules": True,
+        }
+        report = run_preflight(cfg, DatasetStats.placeholder(), _profile())
+        # Per-node max ~ deberta-v3-large weights (~350M x 4 ~ 1.3 GB). Two-candidate
+        # sum would be roughly doubled. Verify we used the per-node-max bound.
+        small_meta = _hub.resolve_model("microsoft/deberta-v3-small")
+        large_meta = _hub.resolve_model("microsoft/deberta-v3-large")
+        expected = large_meta.weights_gb * 4
+        naive_sum = (small_meta.weights_gb + large_meta.weights_gb) * 4
+        assert report.resource.disk_dump_gb == pytest.approx(expected, rel=0.01)
+        assert report.resource.disk_dump_gb < naive_sum
+
+    def test_dump_disk_sums_across_nodes(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "embedder",
+                    "search_space": [
+                        {
+                            "module_name": "sentence_transformer",
+                            "embedder_config": [{"model_name": "sentence-transformers/all-MiniLM-L6-v2"}],
+                        }
+                    ],
+                },
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "classification_model_config": [{"model_name": "microsoft/deberta-v3-small"}],
+                            "num_train_epochs": [3],
+                            "batch_size": [16],
+                        }
+                    ],
+                },
+            ],
+            "hpo_config": {"n_trials": 2},
+            "dump_modules": True,
+        }
+        report = run_preflight(cfg, DatasetStats.placeholder(), _profile())
+        embedder = _hub.resolve_model("sentence-transformers/all-MiniLM-L6-v2")
+        bert = _hub.resolve_model("microsoft/deberta-v3-small")
+        expected = (embedder.weights_gb + bert.weights_gb) * 2
+        assert report.resource.disk_dump_gb == pytest.approx(expected, rel=0.01)
diff --git a/tests/advisor/test_hub_heuristics.py b/tests/advisor/test_hub_heuristics.py
index 54a03431d..b43b95522 100644
--- a/tests/advisor/test_hub_heuristics.py
+++ b/tests/advisor/test_hub_heuristics.py
@@ -32,9 +32,7 @@ def _offline(monkeypatch: pytest.MonkeyPatch) -> None:
         ("bert-base-uncased", 70, 200),
     ],
 )
-def test_name_heuristic_picks_reasonable_bucket(
-    name: str, expected_min_m: int, expected_max_m: int
-) -> None:
+def test_name_heuristic_picks_reasonable_bucket(name: str, expected_min_m: int, expected_max_m: int) -> None:
     meta = _hub.resolve_model(name)
     assert meta.confidence == "heuristic"
     assert expected_min_m <= meta.params_millions <= expected_max_m, (
diff --git a/tests/advisor/test_render.py b/tests/advisor/test_render.py
index e82d7573b..55a2b0ce3 100644
--- a/tests/advisor/test_render.py
+++ b/tests/advisor/test_render.py
@@ -4,8 +4,6 @@
 
 import json
 
-import pytest
-
 from autointent._advisor._render import render_json, render_recommendation, render_text
 from autointent._advisor._report import (
     DatasetStats,
@@ -140,12 +138,14 @@ def test_shows_status_per_preset(self) -> None:
 
 def test_dataset_stats_in_text_block() -> None:
     stats = DatasetStats.placeholder(n_samples=777, n_classes=4)
-    r = PreflightReport(dataset={
-        "n_samples": stats.n_samples,
-        "n_classes": stats.n_classes,
-        "avg_tokens": stats.avg_tokens,
-        "source": stats.source,
-    })
+    r = PreflightReport(
+        dataset={
+            "n_samples": stats.n_samples,
+            "n_classes": stats.n_classes,
+            "avg_tokens": stats.avg_tokens,
+            "source": stats.source,
+        }
+    )
     out = render_text(r)
     assert "777" in out
     assert "n_classes=4" in out
diff --git a/user_guides/advanced/02_embedder_configuration.py b/user_guides/advanced/02_embedder_configuration.py
index 32118fed2..43ce27278 100644
--- a/user_guides/advanced/02_embedder_configuration.py
+++ b/user_guides/advanced/02_embedder_configuration.py
@@ -261,9 +261,7 @@
 )
 
 # Example (does not run training here): construct an embedder and call train when you have data.
-_embedder_for_ft = Embedder(
-    SentenceTransformerEmbeddingConfig(model_name="sentence-transformers/all-MiniLM-L6-v2")
-)
+_embedder_for_ft = Embedder(SentenceTransformerEmbeddingConfig(model_name="sentence-transformers/all-MiniLM-L6-v2"))
 # _embedder_for_ft.train(utterances=[...], labels=[...], config=ft_cfg)
 
 # %%

From f927729e2c0d736023d7e7c88bbb1d84bcaf5a57 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 01:45:25 +0300
Subject: [PATCH 05/16] add more handling

---
 src/autointent/_advisor/_cli.py               |   2 +-
 src/autointent/_advisor/_estimates.py         | 171 +++++++++++++++---
 src/autointent/_advisor/_render.py            |  76 ++++++--
 src/autointent/_advisor/_report.py            |  17 +-
 .../_presets/transformers-heavy.yaml          |   7 +
 tests/advisor/test_estimates_and_cli.py       |   6 +-
 tests/advisor/test_estimates_internals.py     | 131 ++++++++++++--
 tests/advisor/test_render.py                  |  41 +++--
 tests/advisor/test_report.py                  |  26 +--
 9 files changed, 385 insertions(+), 92 deletions(-)

diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py
index b3f43aab5..d300ad6fa 100644
--- a/src/autointent/_advisor/_cli.py
+++ b/src/autointent/_advisor/_cli.py
@@ -153,7 +153,7 @@ def cmd_recommend(args: argparse.Namespace) -> int:
         if args.budget_time_h is not None and report.resource.time_hours > args.budget_time_h:
             report.add(
                 "resource",
-                Severity.RED,
+                Severity.OVER,
                 f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {args.budget_time_h} h.",
             )
         results.append((preset, report))
diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py
index e8f619303..06d0e4fe1 100644
--- a/src/autointent/_advisor/_estimates.py
+++ b/src/autointent/_advisor/_estimates.py
@@ -33,6 +33,10 @@
 
 TRANSFORMER_SCORER_MODULES = {"bert", "lora", "ptuning", "dnnc"}
 
+# Fallback max_length when the search-space entry doesn't pin it. Used both as
+# the default in _vram_for_transformer and in the entry-walk seq_len resolution.
+_DEFAULT_SEQ_LEN = 128
+
 # Coefficients for the linear / catboost time formulas (proposal §"Algorithm").
 _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8
 _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9
@@ -95,12 +99,12 @@ def _walk_modules_indexed(
             yield node_idx, node_type, entry
 
 
-def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) -> float:
-    """VRAM in GB for one trial of a transformer-based module.
+def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float:
+    """Weight-side VRAM in GB — weights + grads + Adam optimizer state. Excludes activations.
 
-    Full fine-tune fp32: weights + grads + Adam (m, v) = 4W.
-    Full fine-tune AMP: fp16 weights + fp16 grads + fp32 master copy + fp32 Adam = 3W.
-    (Activations are not modeled separately.)
+    Full fine-tune fp32: W + W + 2W (Adam m, v) = 4W.
+    Full fine-tune AMP: 0.5W (fp16 weights) + 0.5W (fp16 grads) + W (fp32 master) + 2W (fp32 Adam) = 4W.
+    AMP's savings live in activations, not the optimizer — the weight side is identical.
     """
     weights_gb = meta.weights_gb
     if mode == "inference":
@@ -109,16 +113,111 @@ def _vram_for_transformer(meta: ModelMeta, mode: str, mixed_precision: bool) ->
         return weights_gb * 1.3 + 0.5
     if mode == "reranker":
         return weights_gb * 1.5
-    if mixed_precision:
-        return weights_gb * 3.0
     return weights_gb * 4.0
 
 
+def _vram_for_transformer(
+    meta: ModelMeta,
+    mode: str,
+    mixed_precision: bool,
+    *,
+    batch_size: int = 0,
+    seq_len: int = _DEFAULT_SEQ_LEN,
+) -> float:
+    """Total VRAM in GB: weights + grads + optimizer state + activations × batch.
+
+    Activation accounting differs by mode — training keeps per-layer outputs for
+    backward; inference only needs one or two layers in flight.
+    """
+    base = _weights_vram_for_transformer(meta, mode)
+    if batch_size <= 0:
+        return base
+    per_sample = _activations_gb_per_sample(
+        meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference"
+    )
+    return base + per_sample * batch_size
+
+
 def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float:
     """RAM in GB. Loose upper bound."""
     return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / (1024**3)
 
 
+def _floor_to_power_of_two(n: int) -> int:
+    """Largest power of two ≤ n; returns 0 when n < 1."""
+    if n < 1:
+        return 0
+    power = 1
+    while power * 2 <= n:
+        power *= 2
+    return power
+
+
+def _n_layers(meta: ModelMeta | None) -> int:
+    """Coarse layer-count guess from parameter count.
+
+    MiniLM (33M) ~6, BERT-base (110M) ~12, BERT-large (350M) ~24.
+    """
+    if meta is None:
+        return 12
+    params = meta.params_millions
+    if params >= 300:
+        return 24
+    if params >= 100:
+        return 12
+    if params >= 50:
+        return 8
+    return 6
+
+
+def _activations_gb_per_sample(
+    meta: ModelMeta | None,
+    seq_len: int,
+    *,
+    mixed_precision: bool,
+    is_training: bool,
+) -> float:
+    """Heuristic activation memory per sample.
+
+    Training: ``seq_len × hidden × layers × const`` — per-layer outputs are kept
+    for backward.
+    Inference: ``seq_len × hidden × const`` — only one or two layers' outputs in
+    flight at once.
+    Mixed precision halves activation bytes.
+    """
+    hidden = _embedder_dim(meta)
+    if is_training:
+        # Training keeps every layer's outputs for backward → scales × n_layers.
+        # The 16-byte/token/layer coefficient bundles fp32 activation + ~4× backward overhead.
+        bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16
+    else:
+        # Inference only holds ~1-2 layers' outputs in flight at once.
+        bytes_per_sample = seq_len * hidden * 8
+    if mixed_precision:
+        bytes_per_sample //= 2
+    return bytes_per_sample / (1024**3)
+
+
+def _max_fitting_batch_size(
+    *,
+    weight_vram_gb: float,
+    vram_budget_gb: float,
+    per_sample_gb: float,
+) -> int:
+    """Largest batch that keeps total VRAM under the AMPLE/TIGHT threshold.
+
+    Returns 0 when even the weights blow the budget. Result is rounded down to
+    the nearest power of two.
+    """
+    if per_sample_gb <= 0:
+        return 0
+    target_vram = vram_budget_gb * _YELLOW
+    available_for_activations = target_vram - weight_vram_gb
+    if available_for_activations <= 0:
+        return 0
+    return _floor_to_power_of_two(int(available_for_activations / per_sample_gb))
+
+
 def _embedder_dim(meta: ModelMeta | None) -> int:
     """Coarse hidden-size guess from parameter count.
 
@@ -211,13 +310,13 @@ def _time_for_transformer(
 
 def _classify_severity(estimate: float, budget: float) -> Severity:
     if budget <= 0:
-        return Severity.YELLOW
+        return Severity.TIGHT
     ratio = estimate / budget
     if ratio >= _RED:
-        return Severity.RED
+        return Severity.OVER
     if ratio >= _YELLOW:
-        return Severity.YELLOW
-    return Severity.GREEN
+        return Severity.TIGHT
+    return Severity.AMPLE
 
 
 def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
@@ -281,20 +380,31 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
 
             batch_size = _max_int(entry.get("batch_size"), 32)
             epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10)
+            seq_len = _max_int(entry.get("max_length"), _DEFAULT_SEQ_LEN)
 
-            vram = _vram_for_transformer(meta, mode, mixed_precision)
+            vram = _vram_for_transformer(meta, mode, mixed_precision, batch_size=batch_size, seq_len=seq_len)
             ram = _ram_for_module(meta, stats)
 
-            time_h = 0.0
-            if mode != "inference":
-                time_h = _time_for_transformer(
-                    meta=meta,
-                    n_trials=n_trials,
-                    epochs=epochs,
-                    batch_size=batch_size,
-                    n_samples=stats.n_samples,
-                    device_class=hardware.device_class,
+            driver_max_batch: int | None = None
+            if hardware.vram_gb > 0:
+                weights_vram = _weights_vram_for_transformer(meta, mode)
+                per_sample_gb = _activations_gb_per_sample(
+                    meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference"
                 )
+                driver_max_batch = _max_fitting_batch_size(
+                    weight_vram_gb=weights_vram,
+                    vram_budget_gb=hardware.vram_gb,
+                    per_sample_gb=per_sample_gb,
+                )
+
+            time_h = _time_for_transformer(
+                meta=meta,
+                n_trials=n_trials,
+                epochs=epochs,
+                batch_size=batch_size,
+                n_samples=stats.n_samples,
+                device_class=hardware.device_class,
+            )
             if refit_after and mode != "inference":
                 time_h *= 1 + 1.0 / max(1, n_trials)
 
@@ -311,6 +421,8 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
                     "vram_gb": round(vram, 2),
                     "ram_gb": round(ram, 2),
                     "time_hours": round(time_h, 2),
+                    "batch_size": batch_size,
+                    "max_batch_size": driver_max_batch,
                     "confidence": meta.confidence,
                 }
             )
@@ -381,6 +493,8 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
                 "vram_gb": round(vram, 2),
                 "ram_gb": round(ram, 2),
                 "time_hours": round(time_h, 2),
+                "batch_size": None,
+                "max_batch_size": None,
                 "confidence": confidence,
             }
         )
@@ -409,7 +523,7 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
     if hardware.accelerator == "cpu" and effective_vram > 0:
         report.add(
             "resource",
-            Severity.YELLOW,
+            Severity.TIGHT,
             f"No GPU detected; transformer modules will be very slow (worst case ~{estimate.time_hours:.1f} h).",
             metric="vram",
         )
@@ -420,6 +534,7 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
         msg += f" vs available {hardware.vram_gb:.1f} GB"
         report.add("resource", vram_sev, msg, metric="vram")
 
+
     ram_sev = _classify_severity(estimate.ram_gb, hardware.ram_gb)
     report.add(
         "resource",
@@ -440,7 +555,7 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
 
     if estimate.time_hours > 0:
         time_msg = f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)"
-        report.add("resource", Severity.GREEN, time_msg, metric="time")
+        report.add("resource", Severity.AMPLE, time_msg, metric="time")
 
 
 def _config_phase(
@@ -454,7 +569,7 @@ def _config_phase(
     if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}:
         report.add(
             "config",
-            Severity.YELLOW,
+            Severity.TIGHT,
             f"hpo_config.n_jobs={n_jobs} on a single GPU multiplies VRAM demand by {n_jobs}×.",
         )
 
@@ -466,7 +581,7 @@ def _config_phase(
     if uses_catboost_gpu and hardware.accelerator != "cuda":
         report.add(
             "config",
-            Severity.YELLOW,
+            Severity.TIGHT,
             "CatBoost task_type=GPU configured but no CUDA detected — will fall back to CPU.",
         )
 
@@ -484,7 +599,7 @@ def _data_phase(
             continue
         max_len = _max_int(max_len_value, 512)
         if p95 > max_len:
-            severity = Severity.RED if p95 > max_len * 1.5 else Severity.YELLOW
+            severity = Severity.OVER if p95 > max_len * 1.5 else Severity.TIGHT
             report.add(
                 "data",
                 severity,
@@ -496,7 +611,7 @@ def _data_phase(
     if has_linear and stats.rare_classes:
         report.add(
             "data",
-            Severity.RED,
+            Severity.OVER,
             (f"LogisticRegressionCV (cv=3) will fail: classes {stats.rare_classes[:5]} have <3 samples."),
         )
 
@@ -507,7 +622,7 @@ def _data_phase(
     if has_description and stats.has_descriptions is False:
         report.add(
             "data",
-            Severity.RED,
+            Severity.OVER,
             "description scorer present but intent descriptions are missing — fill them in or drop the scorer.",
         )
 
diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py
index fe0f32dd7..a3778d307 100644
--- a/src/autointent/_advisor/_render.py
+++ b/src/autointent/_advisor/_render.py
@@ -13,11 +13,69 @@
 if TYPE_CHECKING:
     from ._report import PreflightReport
 
-_SEVERITY_TAG = {"green": "✓", "yellow": "⚠", "red": "✗"}
+_SEVERITY_TAG = {"ample": "✓", "tight": "⚠", "over": "✗"}
 _PHASE_ORDER = ("resource", "data", "config")
 _PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"}
 
 
+def _batch_hint(driver: dict) -> str:
+    """Per-driver batch annotation: '64 → 32', '64', '64 (no fit)', or ''."""
+    bs = driver.get("batch_size")
+    if bs is None:
+        return ""
+    mx = driver.get("max_batch_size")
+    if mx is None:
+        return str(bs)
+    if mx == 0:
+        return f"{bs} (no fit)"
+    if mx == bs:
+        return str(bs)
+    return f"{bs} → {mx}"
+
+
+_DRIVERS_LIMIT = 8
+_DRIVERS_HEADERS = ("Node", "Model", "Mode", "VRAM", "Time", "Batch", "Source")
+
+
+def _render_drivers_table(drivers: list[dict]) -> list[str]:
+    """Format the Drivers of cost section as an aligned table."""
+    visible = drivers[:_DRIVERS_LIMIT]
+    rows: list[tuple[str, ...]] = []
+    for d in visible:
+        rows.append((
+            f"{d['node_type']}.{d['module']}",
+            str(d["model"]),
+            str(d["mode"]),
+            f"{d['vram_gb']:.2f} GB",
+            f"{d['time_hours']:.2f} h",
+            _batch_hint(d),
+            f"[{d['confidence']}]",
+        ))
+
+    widths = [len(h) for h in _DRIVERS_HEADERS]
+    for row in rows:
+        for i, cell in enumerate(row):
+            widths[i] = max(widths[i], len(cell))
+
+    # Right-align numeric columns (VRAM @ idx 3, Time @ idx 4); left-align the rest.
+    right_align = {3, 4}
+
+    def fmt(row: tuple[str, ...]) -> str:
+        cells = []
+        for i, cell in enumerate(row):
+            if i in right_align:
+                cells.append(cell.rjust(widths[i]))
+            else:
+                cells.append(cell.ljust(widths[i]))
+        return "  " + "  ".join(cells).rstrip()
+
+    lines = ["Drivers of cost:", fmt(_DRIVERS_HEADERS), "  " + "  ".join("─" * w for w in widths)]
+    lines.extend(fmt(r) for r in rows)
+    if len(drivers) > _DRIVERS_LIMIT:
+        lines.append(f"  … and {len(drivers) - _DRIVERS_LIMIT} more")
+    return lines
+
+
 def render_text(report: PreflightReport) -> str:
     lines: list[str] = []
     title = "Compute feasibility check"
@@ -50,15 +108,7 @@ def render_text(report: PreflightReport) -> str:
         lines.append("")
 
     if report.resource.drivers:
-        lines.append("Drivers of cost:")
-        for d in report.resource.drivers[:8]:
-            lines.append(
-                f"  {d['node_type']}.{d['module']:<10} {d['model']:<48}"
-                f"  {d['mode']:<14}  VRAM ~{d['vram_gb']} GB, time ~{d['time_hours']} h"
-                f"  [{d['confidence']}]"
-            )
-        if len(report.resource.drivers) > 8:
-            lines.append(f"  … and {len(report.resource.drivers) - 8} more")
+        lines.extend(_render_drivers_table(report.resource.drivers))
         lines.append("")
 
     if report.notes:
@@ -68,7 +118,7 @@ def render_text(report: PreflightReport) -> str:
         lines.append("")
 
     summary = f"Verdict: {'feasible' if report.is_feasible else 'INFEASIBLE'} "
-    summary += f"(worst severity: {report.worst_severity.value})"
+    summary += f"(headroom: {report.headroom.value})"
     if report.low_confidence:
         summary += " — low-confidence (heuristic fallback in use)"
     lines.append(summary)
@@ -91,7 +141,7 @@ def render_recommendation(
     else:
         lines.append("  → none of the bundled presets fit your hardware as-is.")
     lines.append("")
-    lines.append(f"{'Preset':<24} {'Status':<14} {'VRAM':<10} {'Time':<10} {'Worst':<8}")
+    lines.append(f"{'Preset':<24} {'Status':<14} {'VRAM':<10} {'Time':<10} {'Headroom':<10}")
     lines.append("-" * 68)
     for name, report in results:
         verdict = "feasible" if report.is_feasible else "infeasible"
@@ -99,6 +149,6 @@ def render_recommendation(
             f"{name:<24} {verdict:<14} "
             f"{report.resource.vram_gb:>4.1f} GB   "
             f"{report.resource.time_hours:>4.1f} h    "
-            f"{report.worst_severity.value:<8}"
+            f"{report.headroom.value:<8}"
         )
     return "\n".join(lines)
diff --git a/src/autointent/_advisor/_report.py b/src/autointent/_advisor/_report.py
index 6b930db95..9b4a319c8 100644
--- a/src/autointent/_advisor/_report.py
+++ b/src/autointent/_advisor/_report.py
@@ -8,9 +8,9 @@
 
 
 class Severity(str, Enum):
-    GREEN = "green"
-    YELLOW = "yellow"
-    RED = "red"
+    AMPLE = "ample"
+    TIGHT = "tight"
+    OVER = "over"
 
 
 Phase = Literal["resource", "data", "config"]
@@ -93,19 +93,20 @@ def add(self, phase: Phase, severity: Severity, message: str, metric: str | None
         self.findings.append(Finding(phase=phase, severity=severity, message=message, metric=metric))
 
     @property
-    def worst_severity(self) -> Severity:
-        order = {Severity.GREEN: 0, Severity.YELLOW: 1, Severity.RED: 2}
+    def headroom(self) -> Severity:
+        """Worst headroom level across all findings — the column shown in CLI reports."""
+        order = {Severity.AMPLE: 0, Severity.TIGHT: 1, Severity.OVER: 2}
         if not self.findings:
-            return Severity.GREEN
+            return Severity.AMPLE
         return max((f.severity for f in self.findings), key=lambda s: order[s])
 
     @property
     def is_feasible(self) -> bool:
-        return self.worst_severity != Severity.RED
+        return self.headroom != Severity.OVER
 
     def to_dict(self) -> dict[str, Any]:
         d = asdict(self)
         d["findings"] = [{**asdict(f), "severity": f.severity.value} for f in self.findings]
-        d["worst_severity"] = self.worst_severity.value
+        d["headroom"] = self.headroom.value
         d["is_feasible"] = self.is_feasible
         return d
diff --git a/src/autointent/_presets/transformers-heavy.yaml b/src/autointent/_presets/transformers-heavy.yaml
index 2576fbc82..cd15d791e 100644
--- a/src/autointent/_presets/transformers-heavy.yaml
+++ b/src/autointent/_presets/transformers-heavy.yaml
@@ -5,12 +5,19 @@ search_space:
       - module_name: bert
         classification_model_config:
           - model_name: microsoft/deberta-v3-large
+          - model_name: intfloat/multilingual-e5-large-instruct
+          - model_name: microsoft/harrier-oss-v1-27b
         num_train_epochs: [30]
         batch_size: [32, 64]
         learning_rate:
           low: 1.0e-5
           high: 1.0e-4
           log: True
+      - module_name: description_bi
+        embedder_config:
+          - model_name: microsoft/deberta-v3-large
+          - model_name: intfloat/multilingual-e5-large-instruct
+          - model_name: microsoft/harrier-oss-v1-27b
   - node_type: decision
     target_metric: decision_accuracy
     search_space:
diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py
index 00537226d..15a087e07 100644
--- a/tests/advisor/test_estimates_and_cli.py
+++ b/tests/advisor/test_estimates_and_cli.py
@@ -98,7 +98,7 @@ def test_cli_inspect_json_is_parseable(capsys: pytest.CaptureFixture[str]) -> No
     payload = json.loads(captured.out)
     assert payload["preset_name"] == "transformers-light"
     assert "findings" in payload
-    assert payload["worst_severity"] in {"green", "yellow", "red"}
+    assert payload["headroom"] in {"ample", "tight", "over"}
     # rc is 0 on feasible, 1 otherwise
     assert rc in (0, 1)
 
@@ -217,9 +217,9 @@ def test_cli_recommend_budget_time_flags_red_for_overbudget_presets(
     flagged = [
         r
         for r in payload["results"]
-        if any(f["severity"] == "red" and "exceeds budget" in f["message"] for f in r["report"]["findings"])
+        if any(f["severity"] == "over" and "exceeds budget" in f["message"] for f in r["report"]["findings"])
     ]
-    assert flagged, "budget-time-h breach should produce RED severity findings"
+    assert flagged, "budget-time-h breach should produce OVER severity findings"
     # Any preset above the budget must be marked infeasible.
     for r in flagged:
         assert r["report"]["is_feasible"] is False
diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py
index 713db27f7..5ac66af2f 100644
--- a/tests/advisor/test_estimates_internals.py
+++ b/tests/advisor/test_estimates_internals.py
@@ -86,17 +86,17 @@ def test_empty_entry(self) -> None:
 
 class TestClassifySeverity:
     def test_below_yellow_is_green(self) -> None:
-        assert _classify_severity(estimate=1.0, budget=10.0) == Severity.GREEN
+        assert _classify_severity(estimate=1.0, budget=10.0) == Severity.AMPLE
 
     def test_above_yellow_threshold(self) -> None:
-        assert _classify_severity(estimate=8.0, budget=10.0) == Severity.YELLOW
+        assert _classify_severity(estimate=8.0, budget=10.0) == Severity.TIGHT
 
     def test_at_or_above_red_threshold(self) -> None:
-        assert _classify_severity(estimate=10.0, budget=10.0) == Severity.RED
-        assert _classify_severity(estimate=12.0, budget=10.0) == Severity.RED
+        assert _classify_severity(estimate=10.0, budget=10.0) == Severity.OVER
+        assert _classify_severity(estimate=12.0, budget=10.0) == Severity.OVER
 
     def test_zero_budget_returns_yellow(self) -> None:
-        assert _classify_severity(estimate=1.0, budget=0.0) == Severity.YELLOW
+        assert _classify_severity(estimate=1.0, budget=0.0) == Severity.TIGHT
 
 
 class TestVramForTransformer:
@@ -117,13 +117,34 @@ def test_full_finetune_is_larger_than_lora_is_larger_than_inference(self, meta:
         full = _vram_for_transformer(meta, "full-finetune", mixed_precision=False)
         assert inference < lora < full
 
-    def test_amp_partially_reduces_full_finetune_vram(self, meta: ModelMeta) -> None:
-        """AMP saves on fp16 weights+grads (W down from 2W); Adam state stays
-        fp32 (2W). Total 3W vs fp32's 4W — real but not a full halving."""
-        full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False)
-        full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True)
-        assert full_amp < full_fp32
-        assert full_amp / full_fp32 == pytest.approx(0.75)
+    def test_inference_activations_are_smaller_than_training(self, meta: ModelMeta) -> None:
+        """Inference doesn't store per-layer outputs for backward — activation memory
+        should be many times smaller than training at the same batch_size."""
+        train_total = _vram_for_transformer(meta, "full-finetune", False, batch_size=64, seq_len=128)
+        train_weights = _vram_for_transformer(meta, "full-finetune", False, batch_size=0)
+        inf_total = _vram_for_transformer(meta, "inference", False, batch_size=64, seq_len=128)
+        inf_weights = _vram_for_transformer(meta, "inference", False, batch_size=0)
+        train_acts = train_total - train_weights
+        inf_acts = inf_total - inf_weights
+        assert inf_acts > 0
+        assert train_acts > inf_acts
+        # 12-layer model: training activations should be at least ~5× inference.
+        assert train_acts / inf_acts > 5
+
+    def test_amp_does_not_reduce_weight_side_vram(self, meta: ModelMeta) -> None:
+        """Weight-side AMP accounting: fp16 weights+grads (W) + fp32 master copy (W)
+        + fp32 Adam moments (2W) = 4W, identical to pure fp32. AMP's savings live
+        in activations, not the optimizer."""
+        full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False, batch_size=0)
+        full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=0)
+        assert full_amp == pytest.approx(full_fp32)
+
+    def test_amp_does_reduce_activation_side_vram(self, meta: ModelMeta) -> None:
+        """When a batch is configured, AMP halves activation bytes — total VRAM
+        with batch should be strictly smaller under AMP than fp32."""
+        fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False, batch_size=64, seq_len=128)
+        amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=64, seq_len=128)
+        assert amp < fp32
 
     def test_reranker_uses_inference_class(self, meta: ModelMeta) -> None:
         inference = _vram_for_transformer(meta, "inference", mixed_precision=False)
@@ -255,7 +276,7 @@ def test_rare_classes_with_linear_scorer_flag_red(self) -> None:
         )
         report = run_preflight(cfg, stats, _profile())
         assert any(
-            f.phase == "data" and "LogisticRegressionCV" in f.message and f.severity == Severity.RED
+            f.phase == "data" and "LogisticRegressionCV" in f.message and f.severity == Severity.OVER
             for f in report.findings
         )
 
@@ -276,7 +297,7 @@ def test_truncation_red_when_p95_dominates_max_length(self) -> None:
         }
         stats = DatasetStats(n_samples=500, n_classes=5, avg_tokens=50, p95_tokens=400)
         report = run_preflight(cfg, stats, _profile())
-        red = [f for f in report.findings if f.phase == "data" and f.severity == Severity.RED]
+        red = [f for f in report.findings if f.phase == "data" and f.severity == Severity.OVER]
         assert red, "p95=400 > 1.5 * max_length=128 should be red"
 
     def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None:
@@ -299,7 +320,7 @@ def test_truncation_yellow_when_p95_only_slightly_exceeds(self) -> None:
         yellows = [
             f
             for f in report.findings
-            if f.phase == "data" and f.severity == Severity.YELLOW and "truncation" in f.message.lower()
+            if f.phase == "data" and f.severity == Severity.TIGHT and "truncation" in f.message.lower()
         ]
         assert yellows
 
@@ -431,6 +452,86 @@ def test_linear_scales_with_n_samples(self) -> None:
         assert big.resource.ram_gb > small.resource.ram_gb
 
 
+class TestPerDriverBatchHint:
+    """Each transformer driver carries its own (batch_size, max_batch_size) for rendering."""
+
+    def _bert_cfg(self, model_name: str, batch_size: int) -> dict[str, Any]:
+        return {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "classification_model_config": [{"model_name": model_name}],
+                            "num_train_epochs": [3],
+                            "batch_size": [batch_size],
+                        }
+                    ],
+                }
+            ],
+            "hpo_config": {"n_trials": 1},
+        }
+
+    def test_driver_records_current_and_max_batch(self) -> None:
+        report = run_preflight(
+            self._bert_cfg("microsoft/deberta-v3-large", batch_size=64),
+            DatasetStats.placeholder(),
+            _profile(vram_gb=10.0),
+        )
+        drivers = [d for d in report.resource.drivers if d["module"] == "bert"]
+        assert drivers
+        d = drivers[0]
+        assert d["batch_size"] == 64
+        # vram_gb=10 + 5 GB weights → some room for activations, max < 64.
+        assert d["max_batch_size"] is not None
+        assert 0 < d["max_batch_size"] < 64
+
+    def test_max_batch_zero_when_weights_alone_overflow(self) -> None:
+        report = run_preflight(
+            self._bert_cfg("microsoft/deberta-v3-large", batch_size=64),
+            DatasetStats.placeholder(),
+            _profile(vram_gb=2.0),
+        )
+        d = next(d for d in report.resource.drivers if d["module"] == "bert")
+        assert d["max_batch_size"] == 0
+
+    def test_max_batch_can_be_larger_than_current(self) -> None:
+        report = run_preflight(
+            self._bert_cfg("microsoft/deberta-v3-large", batch_size=32),
+            DatasetStats.placeholder(),
+            _profile(vram_gb=64.0),
+        )
+        d = next(d for d in report.resource.drivers if d["module"] == "bert")
+        assert d["max_batch_size"] is not None and d["max_batch_size"] > 32
+
+    def test_multiple_drivers_carry_independent_max_batch(self) -> None:
+        cfg = {
+            "search_space": [
+                {
+                    "node_type": "scoring",
+                    "search_space": [
+                        {
+                            "module_name": "bert",
+                            "classification_model_config": [
+                                {"model_name": "microsoft/deberta-v3-small"},
+                                {"model_name": "microsoft/deberta-v3-large"},
+                            ],
+                            "num_train_epochs": [3],
+                            "batch_size": [64],
+                        }
+                    ],
+                }
+            ],
+            "hpo_config": {"n_trials": 1},
+        }
+        report = run_preflight(cfg, DatasetStats.placeholder(), _profile(vram_gb=10.0))
+        small = next(d for d in report.resource.drivers if "small" in d["model"])
+        large = next(d for d in report.resource.drivers if "large" in d["model"])
+        # The smaller model has more headroom → larger max batch (or equal-cap when both saturate).
+        assert small["max_batch_size"] >= large["max_batch_size"]
+
+
 class TestDumpModulesBounding:
     """`dump_modules=True` writes one selected variant per node per trial — not
     every candidate. The estimate must be bounded by sum-of-max-per-node x n_trials."""
diff --git a/tests/advisor/test_render.py b/tests/advisor/test_render.py
index 55a2b0ce3..2c0604a11 100644
--- a/tests/advisor/test_render.py
+++ b/tests/advisor/test_render.py
@@ -4,7 +4,7 @@
 
 import json
 
-from autointent._advisor._render import render_json, render_recommendation, render_text
+from autointent._advisor._render import _batch_hint, render_json, render_recommendation, render_text
 from autointent._advisor._report import (
     DatasetStats,
     PreflightReport,
@@ -46,8 +46,8 @@ def _populated_report() -> PreflightReport:
         ),
         notes=["MPS unified memory note"],
     )
-    r.add("resource", Severity.YELLOW, "VRAM ~6 GB vs available 8 GB")
-    r.add("data", Severity.RED, "rare classes blocked")
+    r.add("resource", Severity.TIGHT, "VRAM ~6 GB vs available 8 GB")
+    r.add("data", Severity.OVER, "rare classes blocked")
     return r
 
 
@@ -64,10 +64,10 @@ def test_includes_drivers_block(self) -> None:
         assert "Drivers of cost:" in out
         assert "x/y" in out
 
-    def test_verdict_reflects_worst_severity(self) -> None:
+    def test_verdict_reflects_headroom(self) -> None:
         out = render_text(_populated_report())
         assert "Verdict: INFEASIBLE" in out
-        assert "worst severity: red" in out
+        assert "headroom: over" in out
 
     def test_disclaimer_always_present(self) -> None:
         out = render_text(_populated_report())
@@ -96,25 +96,25 @@ def test_is_valid_json(self) -> None:
     def test_findings_have_string_severity(self) -> None:
         d = json.loads(render_json(_populated_report()))
         for f in d["findings"]:
-            assert f["severity"] in {"green", "yellow", "red"}
+            assert f["severity"] in {"ample", "tight", "over"}
 
-    def test_worst_severity_and_feasibility_serialized(self) -> None:
+    def test_headroom_and_feasibility_serialized(self) -> None:
         d = json.loads(render_json(_populated_report()))
-        assert d["worst_severity"] == "red"
+        assert d["headroom"] == "over"
         assert d["is_feasible"] is False
 
     def test_empty_report_serializes(self) -> None:
         d = json.loads(render_json(PreflightReport()))
-        assert d["worst_severity"] == "green"
+        assert d["headroom"] == "ample"
         assert d["is_feasible"] is True
 
 
 class TestRenderRecommendation:
     def _two_reports(self) -> list[tuple[str, PreflightReport]]:
         a = PreflightReport(preset_name="a", resource=ResourceEstimate(vram_gb=2.0, time_hours=0.5))
-        a.add("resource", Severity.GREEN, "ok")
+        a.add("resource", Severity.AMPLE, "ok")
         b = PreflightReport(preset_name="b", resource=ResourceEstimate(vram_gb=8.0, time_hours=4.0))
-        b.add("resource", Severity.RED, "too big")
+        b.add("resource", Severity.OVER, "too big")
         return [("a", a), ("b", b)]
 
     def test_lists_chosen_preset_when_present(self) -> None:
@@ -136,6 +136,25 @@ def test_shows_status_per_preset(self) -> None:
         assert "infeasible" in out
 
 
+class TestBatchHint:
+    """Per-driver batch cell rendered in the Drivers-of-cost table."""
+
+    def test_arrow_when_max_differs(self) -> None:
+        assert _batch_hint({"batch_size": 64, "max_batch_size": 32}) == "64 → 32"
+
+    def test_plain_when_max_equals_current(self) -> None:
+        assert _batch_hint({"batch_size": 64, "max_batch_size": 64}) == "64"
+
+    def test_no_fit_label_when_max_zero(self) -> None:
+        assert _batch_hint({"batch_size": 64, "max_batch_size": 0}) == "64 (no fit)"
+
+    def test_empty_when_no_batch(self) -> None:
+        assert _batch_hint({"batch_size": None, "max_batch_size": None}) == ""
+
+    def test_increase_arrow(self) -> None:
+        assert _batch_hint({"batch_size": 32, "max_batch_size": 128}) == "32 → 128"
+
+
 def test_dataset_stats_in_text_block() -> None:
     stats = DatasetStats.placeholder(n_samples=777, n_classes=4)
     r = PreflightReport(
diff --git a/tests/advisor/test_report.py b/tests/advisor/test_report.py
index 52f2e675e..28adbfc34 100644
--- a/tests/advisor/test_report.py
+++ b/tests/advisor/test_report.py
@@ -14,22 +14,22 @@
 
 
 class TestSeverityOrdering:
-    def test_worst_severity_on_empty_report_is_green(self) -> None:
-        assert PreflightReport().worst_severity == Severity.GREEN
+    def test_headroom_on_empty_report_is_green(self) -> None:
+        assert PreflightReport().headroom == Severity.AMPLE
 
     def test_red_beats_yellow_beats_green(self) -> None:
         r = PreflightReport()
-        r.add("resource", Severity.GREEN, "ok")
-        r.add("data", Severity.YELLOW, "warn")
-        assert r.worst_severity == Severity.YELLOW
-        r.add("config", Severity.RED, "fail")
-        assert r.worst_severity == Severity.RED
+        r.add("resource", Severity.AMPLE, "ok")
+        r.add("data", Severity.TIGHT, "warn")
+        assert r.headroom == Severity.TIGHT
+        r.add("config", Severity.OVER, "fail")
+        assert r.headroom == Severity.OVER
 
     def test_is_feasible_flips_on_any_red(self) -> None:
         r = PreflightReport()
-        r.add("resource", Severity.YELLOW, "warn")
+        r.add("resource", Severity.TIGHT, "warn")
         assert r.is_feasible is True
-        r.add("data", Severity.RED, "fail")
+        r.add("data", Severity.OVER, "fail")
         assert r.is_feasible is False
 
 
@@ -62,12 +62,12 @@ def test_total_disk_ignores_cached(self) -> None:
 class TestToDictSerialization:
     def test_findings_round_trip_severity_as_string(self) -> None:
         r = PreflightReport()
-        r.add("resource", Severity.RED, "boom")
+        r.add("resource", Severity.OVER, "boom")
         d = r.to_dict()
-        assert d["worst_severity"] == "red"
+        assert d["headroom"] == "over"
         assert d["is_feasible"] is False
         assert d["findings"] == [
-            {"phase": "resource", "severity": "red", "message": "boom", "metric": None},
+            {"phase": "resource", "severity": "over", "message": "boom", "metric": None},
         ]
 
     def test_hardware_and_dataset_pass_through(self) -> None:
@@ -80,6 +80,6 @@ def test_hardware_and_dataset_pass_through(self) -> None:
         assert d["dataset"]["n_samples"] == 100
 
     def test_finding_is_frozen(self) -> None:
-        f = Finding(phase="resource", severity=Severity.GREEN, message="ok")
+        f = Finding(phase="resource", severity=Severity.AMPLE, message="ok")
         with pytest.raises(Exception):  # noqa: PT011 - dataclass.FrozenInstanceError varies
             f.message = "changed"  # type: ignore[misc]

From 82a78287bdbb1bec9e4bce6dda9804f7a801fe19 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 02:26:01 +0300
Subject: [PATCH 06/16] add more handling

---
 pyproject.toml                          |   1 +
 src/autointent/_advisor/_cli.py         | 101 +++++++++++++++---
 src/autointent/_advisor/_estimates.py   | 135 +++++++++++++++---------
 src/autointent/_advisor/_hub.py         |  10 +-
 tests/advisor/test_estimates_and_cli.py |   2 +-
 5 files changed, 183 insertions(+), 66 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8675e7ea3..4ee34b9c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ dependencies = [
     "aiometer (>=1.0.0,<2.0.0)",
     "aiofiles (>=24.1.0,<25.0.0)",
     "threadpoolctl (>=3.0.0,<4.0.0)",
+    "psutil (>=5.9.0,<8.0.0)",
 ]
 
 [project.optional-dependencies]
diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py
index d300ad6fa..c9485cc81 100644
--- a/src/autointent/_advisor/_cli.py
+++ b/src/autointent/_advisor/_cli.py
@@ -5,9 +5,10 @@
 * ``inspect`` — show what a given preset / config will cost on this machine.
 * ``recommend`` — pick the best-fitting bundled preset for this machine.
 
-Both subcommands accept either a real ``--dataset`` (path to load with
-``Dataset.from_*`` constructors) or ``--n-samples / --n-classes / --avg-tokens``
-placeholders so the script is useful before the user has built a dataset.
+Both subcommands accept either a real ``--dataset`` (Hub id or local
+csv/json/jsonl/parquet path loaded via ``datasets.load_dataset``) or
+``--n-samples / --n-classes / --avg-tokens`` placeholders so the script is
+useful before the user has built a dataset.
 """
 
 from __future__ import annotations
@@ -19,8 +20,8 @@
 from typing import Any
 
 import yaml
+from datasets import ClassLabel, Sequence, load_dataset
 
-from autointent import Dataset
 from autointent.utils import load_preset
 
 from ._estimates import run_preflight
@@ -79,35 +80,107 @@ def _stats_from_args(args: argparse.Namespace) -> DatasetStats:
     )
 
 
+_UTTERANCE_COLS = ("utterance", "text", "sentence", "query", "input")
+_LABEL_COLS = ("label", "labels", "intent", "target")
+# Map file extension → datasets builder name. Anything else is treated as a Hub
+# repo id or a directory and passed to load_dataset directly.
+_FILE_BUILDERS = {".csv": "csv", ".tsv": "csv", ".json": "json", ".jsonl": "json", ".parquet": "parquet"}
+
+
 def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats:
-    """Best-effort: load a dataset from disk via the existing Dataset constructor."""
+    """Best-effort: load via HF ``datasets.load_dataset``.
+
+    Accepts a Hub repo id ('DeepPavlov/clinc150') or a local file path
+    (.csv / .json / .jsonl / .parquet) / dataset directory. Falls back to a
+    placeholder on any loader error so the advisor stays best-effort.
+    """
+    builder = _FILE_BUILDERS.get(Path(path).suffix.lower())
     try:
-        ds = Dataset.from_json(path) if path.endswith(".json") else Dataset.from_hub(path)
-    except (OSError, ValueError) as e:
+        ds = load_dataset(builder, data_files=path) if builder else load_dataset(path)
+    except (OSError, ValueError, FileNotFoundError) as e:
         logger.warning("Failed to load dataset %s: %s", path, e)
         return DatasetStats.placeholder(multilabel=multilabel)
 
-    train = ds.get("train") or next(iter(ds.values()), None)
+    train = ds["train"] if "train" in ds else next(iter(ds.values()), None)
     if train is None:
         return DatasetStats.placeholder(multilabel=multilabel)
 
-    utt_col = getattr(ds, "utterance_feature", "utterance")
+    cols = train.column_names
+    utt_col = next((c for c in _UTTERANCE_COLS if c in cols), cols[0] if cols else None)
+    label_col = next((c for c in _LABEL_COLS if c in cols), None)
+
+    detected_multilabel, n_classes = _label_shape(train, label_col, fallback_multilabel=multilabel)
+
     sample = train[:1000] if len(train) > 1000 else train[:]
-    lengths = [len(str(s).split()) for s in sample.get(utt_col, [])]
+    lengths = [len(str(s).split()) for s in (sample.get(utt_col, []) if utt_col else [])]
     avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32
-    p95 = sorted(lengths)[int(len(lengths) * 0.95)] if lengths else avg_tokens * 2
+    if lengths:
+        sorted_lengths = sorted(lengths)
+        idx = max(0, min(len(sorted_lengths) - 1, int(round((len(sorted_lengths) - 1) * 0.95))))
+        p95 = sorted_lengths[idx]
+    else:
+        p95 = avg_tokens * 2
 
     return DatasetStats(
         n_samples=len(train),
-        n_classes=getattr(ds, "n_classes", 0) or 0,
+        n_classes=n_classes,
         avg_tokens=avg_tokens,
         p95_tokens=p95,
-        multilabel=getattr(ds, "multilabel", multilabel),
-        has_descriptions=getattr(ds, "has_descriptions", None),
+        multilabel=detected_multilabel,
+        has_descriptions=None,
+        rare_classes=_rare_classes(train, label_col, detected_multilabel, n_classes) if label_col else [],
         source=f"dataset:{path}",
     )
 
 
+def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]:
+    """Derive (multilabel, n_classes) from the HF feature schema, with a value-based fallback."""
+    if label_col is None:
+        return fallback_multilabel, 0
+    feature = train.features.get(label_col)
+    if isinstance(feature, Sequence):
+        inner = feature.feature
+        if isinstance(inner, ClassLabel):
+            return True, inner.num_classes
+        # Sequence of plain ints — n_classes = max label index + 1.
+        max_idx = max((max(row) for row in train[label_col] if row), default=-1)
+        return True, max_idx + 1
+    if isinstance(feature, ClassLabel):
+        return False, feature.num_classes
+    # Plain int/string column. Detect multilabel from the first non-empty row, then count uniques.
+    is_multi = len(train) > 0 and isinstance(train[0][label_col], (list, tuple))
+    if is_multi:
+        max_idx = max((max(row) for row in train[label_col] if row), default=-1)
+        return True, max_idx + 1
+    return False, len({label for label in train[label_col] if label is not None})
+
+
+def _rare_classes(train: Any, label_col: str, multilabel: bool, n_classes: int, min_count: int = 3) -> list[str]:
+    """Return labels with fewer than ``min_count`` samples in the train split.
+
+    Used to surface the LogisticRegressionCV(cv=3) failure case before fit.
+    Returns an empty list on any error so the advisor stays best-effort.
+    """
+    try:
+        labels = train[label_col]
+    except (KeyError, AttributeError, TypeError):
+        return []
+    counts: dict[str, int] = {}
+    if multilabel:
+        for row in labels:
+            if not row:
+                continue
+            for i, v in enumerate(row):
+                if v:
+                    counts[str(i)] = counts.get(str(i), 0) + 1
+        for i in range(n_classes):
+            counts.setdefault(str(i), 0)
+    else:
+        for label in labels:
+            counts[str(label)] = counts.get(str(label), 0) + 1
+    return sorted(name for name, c in counts.items() if c < min_count)
+
+
 def _add_common_dataset_args(p: argparse.ArgumentParser) -> None:
     p.add_argument("--dataset", help="Path or hub id of a dataset; overrides placeholders.")
     p.add_argument("--n-samples", type=int, default=1_000, help="Placeholder training set size.")
diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py
index 06d0e4fe1..e88fffd0a 100644
--- a/src/autointent/_advisor/_estimates.py
+++ b/src/autointent/_advisor/_estimates.py
@@ -12,15 +12,49 @@
 from collections.abc import Iterable
 from typing import Any
 
+from pydantic import BaseModel, ConfigDict, Field, ValidationError
+
+from autointent.configs._optimization import HPOConfig
+
 from ._hardware import HardwareProfile
 from ._hub import ModelMeta, hub_reachable, resolve_model
 from ._report import DatasetStats, PreflightReport, ResourceEstimate, Severity
 
 logger = logging.getLogger(__name__)
 
-# yellow / red thresholds as fraction of available budget
-_YELLOW = 0.7
-_RED = 1.0
+
+class _AdvisorConfig(BaseModel):
+    """Validated view of the advisor's input config.
+
+    Wraps the four top-level keys the phase helpers read. Unknown top-level
+    keys are ignored (preset YAMLs carry extra metadata the advisor doesn't model).
+    """
+
+    model_config = ConfigDict(extra="ignore")
+
+    hpo_config: HPOConfig = Field(default_factory=HPOConfig)
+    search_space: list[dict[str, Any]] = Field(default_factory=list)
+    refit_after: bool = False
+    dump_modules: bool = False
+    embedder_config: dict[str, Any] | None = None
+
+
+def _validated_config(config: dict[str, Any]) -> _AdvisorConfig:
+    """Validate ``config`` against ``_AdvisorConfig``; fall back to defaults on any error.
+
+    The advisor is best-effort: a malformed user config should still produce a
+    report (with placeholder costs) rather than crashing.
+    """
+    try:
+        return _AdvisorConfig.model_validate(config)
+    except ValidationError as e:
+        logger.warning("Advisor config failed validation; falling back to defaults: %s", e)
+        return _AdvisorConfig()
+
+# Severity thresholds as a fraction of available budget: at or above _TIGHT
+# downgrades to Severity.TIGHT; at or above _OVER downgrades to Severity.OVER.
+_TIGHT_RATIO = 0.7
+_OVER_RATIO = 1.0
 
 # rough per-step seconds, keyed on device class. Scaled by params_millions / 100.
 _PER_STEP_BASELINE_S = {
@@ -31,7 +65,14 @@
     "apple-silicon": 0.08,
 }
 
-TRANSFORMER_SCORER_MODULES = {"bert", "lora", "ptuning", "dnnc"}
+# Maps each fine-tunable transformer module to its training-mode label.
+# Modules not listed are treated as inference-only.
+_TRANSFORMER_TRAINING_MODE = {
+    "bert": "full-finetune",
+    "ptuning": "lora",
+    "lora": "lora",
+    "dnnc": "reranker",
+}
 
 # Fallback max_length when the search-space entry doesn't pin it. Used both as
 # the default in _vram_for_transformer and in the entry-walk seq_len resolution.
@@ -81,14 +122,6 @@ def _max_int(value: Any, default: int) -> int:
         return default
 
 
-def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dict[str, Any]]]:
-    """Yield (node_type, module_entry) pairs."""
-    for node in search_space or []:
-        node_type = node.get("node_type", "?")
-        for entry in node.get("search_space", []) or []:
-            yield node_type, entry
-
-
 def _walk_modules_indexed(
     search_space: list[dict[str, Any]],
 ) -> Iterable[tuple[int, str, dict[str, Any]]]:
@@ -99,6 +132,12 @@ def _walk_modules_indexed(
             yield node_idx, node_type, entry
 
 
+def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dict[str, Any]]]:
+    """Yield (node_type, module_entry) pairs — index-agnostic view over `_walk_modules_indexed`."""
+    for _, node_type, entry in _walk_modules_indexed(search_space):
+        yield node_type, entry
+
+
 def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float:
     """Weight-side VRAM in GB — weights + grads + Adam optimizer state. Excludes activations.
 
@@ -211,7 +250,7 @@ def _max_fitting_batch_size(
     """
     if per_sample_gb <= 0:
         return 0
-    target_vram = vram_budget_gb * _YELLOW
+    target_vram = vram_budget_gb * _TIGHT_RATIO
     available_for_activations = target_vram - weight_vram_gb
     if available_for_activations <= 0:
         return 0
@@ -309,12 +348,14 @@ def _time_for_transformer(
 
 
 def _classify_severity(estimate: float, budget: float) -> Severity:
+    if estimate <= 0:
+        return Severity.AMPLE
     if budget <= 0:
         return Severity.TIGHT
     ratio = estimate / budget
-    if ratio >= _RED:
+    if ratio >= _OVER_RATIO:
         return Severity.OVER
-    if ratio >= _YELLOW:
+    if ratio >= _TIGHT_RATIO:
         return Severity.TIGHT
     return Severity.AMPLE
 
@@ -325,28 +366,27 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
     hardware: HardwareProfile,
     report: PreflightReport,
 ) -> None:
-    hpo = config.get("hpo_config") or {}
-    n_trials = int(hpo.get("n_trials", 1))
-    n_jobs = int(hpo.get("n_jobs", 1))
-    refit_after = bool(config.get("refit_after", False))
-    dump_modules = bool(config.get("dump_modules", False))
+    cfg = _validated_config(config)
+    n_trials = max(1, cfg.hpo_config.n_trials)
+    n_jobs = max(1, cfg.hpo_config.n_jobs)
+    refit_after = cfg.refit_after
+    dump_modules = cfg.dump_modules
 
     if not hub_reachable():
         report.low_confidence = True
         report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.")
 
     seen_models: dict[str, ModelMeta] = {}
-    estimate = ResourceEstimate(parallel_factor=max(1, n_jobs))
+    estimate = ResourceEstimate(parallel_factor=n_jobs)
 
-    embedder_cfg = config.get("embedder_config") or {}
-    global_embedder = embedder_cfg.get("model_name") if isinstance(embedder_cfg, dict) else None
+    global_embedder = (cfg.embedder_config or {}).get("model_name")
     if global_embedder:
         seen_models[global_embedder] = resolve_model(global_embedder)
 
     # First pass: walk transformer-bearing modules (collects seen_models for embedder_dim lookup).
     transformer_entries: list[tuple[int, str, dict[str, Any]]] = []
     classic_entries: list[tuple[int, str, dict[str, Any]]] = []
-    for node_idx, node_type, entry in _walk_modules_indexed(config.get("search_space") or []):
+    for node_idx, node_type, entry in _walk_modules_indexed(cfg.search_space):
         module = entry.get("module_name", "?")
         if module in {"linear", "catboost"}:
             classic_entries.append((node_idx, node_type, entry))
@@ -367,16 +407,7 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
             meta = seen_models.setdefault(name, resolve_model(name))
 
             mixed_precision = entry.get("dtype") in {"fp16", "bf16"}
-            if module == "bert":
-                mode = "full-finetune"
-            elif module == "lora":
-                mode = "lora"
-            elif module == "dnnc":
-                mode = "reranker"
-            elif module == "ptuning":
-                mode = "full-finetune"
-            else:
-                mode = "inference"
+            mode = _TRANSFORMER_TRAINING_MODE.get(module, "inference")
 
             batch_size = _max_int(entry.get("batch_size"), 32)
             epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10)
@@ -430,7 +461,11 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
     # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint.
     embedder_meta = _largest_embedder(seen_models)
     embedder_dim = _embedder_dim(embedder_meta)
-    class_multiplier_classic = max(1, stats.n_classes) if stats.multilabel else 1
+    # Both multinomial (multiclass) and one-vs-rest (multilabel) LR scale linearly in n_classes;
+    # the multiclass path additionally pays the LogisticRegressionCV inner-fit multiplier.
+    class_multiplier_classic = max(1, stats.n_classes)
+    confidence = embedder_meta.confidence if embedder_meta else "heuristic"
+    embedder_label = embedder_meta.name if embedder_meta else "(no embedder)"
     for _node_idx, node_type, entry in classic_entries:
         module = entry.get("module_name", "?")
         if module == "linear":
@@ -449,14 +484,14 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
                 time_h *= 1 + 1.0 / max(1, n_trials)
             vram = 0.0
             mode = "linear-cv" if cv_multiplier > 1 else "linear"
-            confidence = embedder_meta.confidence if embedder_meta else "heuristic"
         elif module == "catboost":
             iterations = _max_int(entry.get("iterations"), 1000)
             depth = _max_int(entry.get("depth"), 6)
             on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda"
-            # CatBoost's multiclass MultiClass loss already grows per-class trees.
-            cb_class_mult = max(1, stats.n_classes)
-            ram = _ram_for_catboost(
+            # CatBoost's MultiClass loss grows per-class trees only above binary;
+            # binary uses Logloss with one tree per iteration.
+            cb_class_mult = max(1, stats.n_classes) if stats.n_classes > 2 or stats.multilabel else 1
+            ram_total = _ram_for_catboost(
                 stats=stats,
                 n_features=embedder_dim,
                 iterations=iterations,
@@ -473,11 +508,8 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
             )
             if refit_after:
                 time_h *= 1 + 1.0 / max(1, n_trials)
-            vram = ram if on_gpu else 0.0
-            if on_gpu:
-                ram = 0.0
+            vram, ram = (ram_total, 0.0) if on_gpu else (0.0, ram_total)
             mode = "catboost-gpu" if on_gpu else "catboost"
-            confidence = embedder_meta.confidence if embedder_meta else "heuristic"
         else:
             continue
 
@@ -488,7 +520,7 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
             {
                 "node_type": node_type,
                 "module": module,
-                "model": embedder_meta.name if embedder_meta else "(no embedder)",
+                "model": embedder_label,
                 "mode": mode,
                 "vram_gb": round(vram, 2),
                 "ram_gb": round(ram, 2),
@@ -515,6 +547,9 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
         effective_vram = estimate.vram_gb * n_jobs
     else:
         effective_vram = estimate.vram_gb
+    # MPS shares one unified pool: parallel workers each allocate weights+activations
+    # in RAM, so peak RAM also scales with n_jobs on Apple Silicon.
+    effective_ram = estimate.ram_gb * n_jobs if n_jobs > 1 and hardware.accelerator == "mps" else estimate.ram_gb
 
     report.resource = estimate
 
@@ -535,11 +570,11 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
         report.add("resource", vram_sev, msg, metric="vram")
 
 
-    ram_sev = _classify_severity(estimate.ram_gb, hardware.ram_gb)
+    ram_sev = _classify_severity(effective_ram, hardware.ram_gb)
     report.add(
         "resource",
         ram_sev,
-        f"RAM ~{estimate.ram_gb:.1f} GB vs available {hardware.ram_gb:.1f} GB",
+        f"RAM ~{effective_ram:.1f} GB vs available {hardware.ram_gb:.1f} GB",
         metric="ram",
     )
 
@@ -606,9 +641,10 @@ def _data_phase(
                 f"Train tokens p95~{p95} exceeds {entry.get('module_name', '?')}.max_length={max_len}; expect silent truncation.",
             )
 
-    # rare class × linear-CV
+    # rare class × linear-CV (LogisticRegressionCV cv=3 needs ≥3 samples/class;
+    # multilabel path uses one-vs-rest without CV so the failure can't occur there)
     has_linear = any(e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or []))
-    if has_linear and stats.rare_classes:
+    if has_linear and stats.rare_classes and not stats.multilabel:
         report.add(
             "data",
             Severity.OVER,
@@ -616,8 +652,9 @@ def _data_phase(
         )
 
     # partial descriptions × description scorer
+    description_modules = {"description_bi", "description_cross", "description_llm"}
     has_description = any(
-        e.get("module_name") == "description" for _, e in _walk_modules(config.get("search_space") or [])
+        e.get("module_name") in description_modules for _, e in _walk_modules(config.get("search_space") or [])
     )
     if has_description and stats.has_descriptions is False:
         report.add(
diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py
index 613ab6b40..1c559ee2f 100644
--- a/src/autointent/_advisor/_hub.py
+++ b/src/autointent/_advisor/_hub.py
@@ -76,7 +76,7 @@ def _is_warm_cached(model_name: str) -> bool:
     weight_files = ["model.safetensors", "pytorch_model.bin", "model.safetensors.index.json"]
     for fname in weight_files:
         path = try_to_load_from_cache(model_name, fname)
-        if path is not None and path is not False:
+        if isinstance(path, str):
             return True
 
     # sharded models won't match the single-file probe; fall back to a scan
@@ -114,11 +114,17 @@ def _hub_metadata(model_name: str) -> ModelMeta | None:
         if size:
             total_file_bytes += int(size)
 
+    # Track whether either size came from the Hub or from the name-pattern fallback;
+    # if any field was filled by heuristic, downgrade confidence so the report flips
+    # low_confidence rather than misreporting hub-grade accuracy.
+    confidence = "hub"
     if params_millions == 0:
         params_millions = _heuristic_params_millions(model_name)
+        confidence = "heuristic"
 
     if total_file_bytes == 0:
         total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param)
+        confidence = "heuristic"
 
     return ModelMeta(
         name=model_name,
@@ -126,7 +132,7 @@ def _hub_metadata(model_name: str) -> ModelMeta | None:
         weight_bytes_per_param=weight_bytes_per_param,
         total_file_bytes=total_file_bytes,
         cached_locally=_is_warm_cached(model_name),
-        confidence="hub",
+        confidence=confidence,
     )
 
 
diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py
index 15a087e07..3092dce9e 100644
--- a/tests/advisor/test_estimates_and_cli.py
+++ b/tests/advisor/test_estimates_and_cli.py
@@ -150,7 +150,7 @@ def test_partial_descriptions_with_description_scorer_flags_red() -> None:
             {
                 "node_type": "scoring",
                 "search_space": [
-                    {"module_name": "description"},
+                    {"module_name": "description_bi"},
                 ],
             }
         ],

From bbb039e576467b479e8148a87bbfd4b34299976e Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 17:41:08 +0300
Subject: [PATCH 07/16] fix typing & lint

---
 pyproject.toml                            |  6 ++
 src/autointent/_advisor/_cli.py           | 26 ++++++--
 src/autointent/_advisor/_estimates.py     | 78 ++++++++++++-----------
 src/autointent/_advisor/_hardware.py      | 14 ++--
 src/autointent/_advisor/_hub.py           |  6 +-
 src/autointent/_advisor/_render.py        | 18 +++---
 tests/advisor/test_estimates_internals.py |  5 +-
 tests/advisor/test_report.py              |  4 +-
 8 files changed, 94 insertions(+), 63 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4ee34b9c5..971de2655 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -296,6 +296,12 @@ module = [
     "dspy.evaluate.auto_evaluation",
     "codecarbon",
     "catboost",
+    "openai",
+    "openai.*",
+    "tiktoken",
+    "peft",
+    "sentence_transformers",
+    "psutil",
 ]
 ignore_missing_imports = true
 
diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py
index c9485cc81..8c8b7b9d2 100644
--- a/src/autointent/_advisor/_cli.py
+++ b/src/autointent/_advisor/_cli.py
@@ -17,7 +17,7 @@
 import logging
 import sys
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import yaml
 from datasets import ClassLabel, Sequence, load_dataset
@@ -27,10 +27,16 @@
 from ._estimates import run_preflight
 from ._hardware import detect_hardware
 from ._render import render_json, render_recommendation, render_text
-from ._report import DatasetStats, PreflightReport, Severity
+from ._report import DatasetStats, Severity
+
+if TYPE_CHECKING:
+    from ._report import PreflightReport
 
 logger = logging.getLogger("autointent.advisor")
 
+_SAMPLE_LIMIT = 1000
+_P95_PERCENTILE = 0.95
+
 BUNDLED_PRESETS = [
     "transformers-heavy",
     "transformers-light",
@@ -111,12 +117,12 @@ def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats:
 
     detected_multilabel, n_classes = _label_shape(train, label_col, fallback_multilabel=multilabel)
 
-    sample = train[:1000] if len(train) > 1000 else train[:]
+    sample = train[:_SAMPLE_LIMIT] if len(train) > _SAMPLE_LIMIT else train[:]
     lengths = [len(str(s).split()) for s in (sample.get(utt_col, []) if utt_col else [])]
     avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32
     if lengths:
         sorted_lengths = sorted(lengths)
-        idx = max(0, min(len(sorted_lengths) - 1, int(round((len(sorted_lengths) - 1) * 0.95))))
+        idx = max(0, min(len(sorted_lengths) - 1, round((len(sorted_lengths) - 1) * _P95_PERCENTILE)))
         p95 = sorted_lengths[idx]
     else:
         p95 = avg_tokens * 2
@@ -133,7 +139,7 @@ def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats:
     )
 
 
-def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]:
+def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]:  # noqa: ANN401
     """Derive (multilabel, n_classes) from the HF feature schema, with a value-based fallback."""
     if label_col is None:
         return fallback_multilabel, 0
@@ -155,7 +161,13 @@ def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool
     return False, len({label for label in train[label_col] if label is not None})
 
 
-def _rare_classes(train: Any, label_col: str, multilabel: bool, n_classes: int, min_count: int = 3) -> list[str]:
+def _rare_classes(
+    train: Any,  # noqa: ANN401
+    label_col: str,
+    multilabel: bool,
+    n_classes: int,
+    min_count: int = 3,
+) -> list[str]:
     """Return labels with fewer than ``min_count`` samples in the train split.
 
     Used to surface the LogisticRegressionCV(cv=3) failure case before fit.
@@ -293,7 +305,7 @@ def main(argv: list[str] | None = None) -> int:
         level=logging.DEBUG if args.verbose else logging.WARNING,
         format="%(levelname)s %(name)s: %(message)s",
     )
-    return args.func(args)
+    return int(args.func(args))
 
 
 if __name__ == "__main__":
diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py
index e88fffd0a..93dfcedaf 100644
--- a/src/autointent/_advisor/_estimates.py
+++ b/src/autointent/_advisor/_estimates.py
@@ -9,16 +9,26 @@
 from __future__ import annotations
 
 import logging
-from collections.abc import Iterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from pydantic import BaseModel, ConfigDict, Field, ValidationError
 
 from autointent.configs._optimization import HPOConfig
 
-from ._hardware import HardwareProfile
-from ._hub import ModelMeta, hub_reachable, resolve_model
-from ._report import DatasetStats, PreflightReport, ResourceEstimate, Severity
+from ._hub import hub_reachable, resolve_model
+from ._report import PreflightReport, ResourceEstimate, Severity
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from ._hardware import HardwareProfile
+    from ._hub import ModelMeta
+    from ._report import DatasetStats
+
+_MULTICLASS_THRESHOLD = 2
+_PARAMS_LARGE = 300
+_PARAMS_BASE = 100
+_PARAMS_SMALL = 50
 
 logger = logging.getLogger(__name__)
 
@@ -51,6 +61,7 @@ def _validated_config(config: dict[str, Any]) -> _AdvisorConfig:
         logger.warning("Advisor config failed validation; falling back to defaults: %s", e)
         return _AdvisorConfig()
 
+
 # Severity thresholds as a fraction of available budget: at or above _TIGHT
 # downgrades to Severity.TIGHT; at or above _OVER downgrades to Severity.OVER.
 _TIGHT_RATIO = 0.7
@@ -94,22 +105,18 @@ def _extract_model_names(module_entry: dict[str, Any]) -> list[str]:
     candidates: list[str] = []
     cfg = module_entry.get("classification_model_config")
     if isinstance(cfg, list):
-        for c in cfg:
-            if isinstance(c, dict) and c.get("model_name"):
-                candidates.append(c["model_name"])
+        candidates.extend(c["model_name"] for c in cfg if isinstance(c, dict) and c.get("model_name"))
     elif isinstance(cfg, dict) and cfg.get("model_name"):
         candidates.append(cfg["model_name"])
     embedder_cfg = module_entry.get("embedder_config")
     if isinstance(embedder_cfg, list):
-        for c in embedder_cfg:
-            if isinstance(c, dict) and c.get("model_name"):
-                candidates.append(c["model_name"])
+        candidates.extend(c["model_name"] for c in embedder_cfg if isinstance(c, dict) and c.get("model_name"))
     elif isinstance(embedder_cfg, dict) and embedder_cfg.get("model_name"):
         candidates.append(embedder_cfg["model_name"])
     return candidates
 
 
-def _max_int(value: Any, default: int) -> int:
+def _max_int(value: Any, default: int) -> int:  # noqa: ANN401
     if value is None:
         return default
     if isinstance(value, list) and value:
@@ -163,7 +170,7 @@ def _vram_for_transformer(
     batch_size: int = 0,
     seq_len: int = _DEFAULT_SEQ_LEN,
 ) -> float:
-    """Total VRAM in GB: weights + grads + optimizer state + activations × batch.
+    """Total VRAM in GB: weights + grads + optimizer state + activations x batch.
 
     Activation accounting differs by mode — training keeps per-layer outputs for
     backward; inference only needs one or two layers in flight.
@@ -200,11 +207,11 @@ def _n_layers(meta: ModelMeta | None) -> int:
     if meta is None:
         return 12
     params = meta.params_millions
-    if params >= 300:
+    if params >= _PARAMS_LARGE:
         return 24
-    if params >= 100:
+    if params >= _PARAMS_BASE:
         return 12
-    if params >= 50:
+    if params >= _PARAMS_SMALL:
         return 8
     return 6
 
@@ -218,20 +225,17 @@ def _activations_gb_per_sample(
 ) -> float:
     """Heuristic activation memory per sample.
 
-    Training: ``seq_len × hidden × layers × const`` — per-layer outputs are kept
+    Training: ``seq_len x hidden x layers x const`` — per-layer outputs are kept
     for backward.
-    Inference: ``seq_len × hidden × const`` — only one or two layers' outputs in
+    Inference: ``seq_len x hidden x const`` — only one or two layers' outputs in
     flight at once.
     Mixed precision halves activation bytes.
     """
     hidden = _embedder_dim(meta)
-    if is_training:
-        # Training keeps every layer's outputs for backward → scales × n_layers.
-        # The 16-byte/token/layer coefficient bundles fp32 activation + ~4× backward overhead.
-        bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16
-    else:
-        # Inference only holds ~1-2 layers' outputs in flight at once.
-        bytes_per_sample = seq_len * hidden * 8
+    # Training keeps every layer's outputs for backward -> scales x n_layers.
+    # The 16-byte/token/layer coefficient bundles fp32 activation + ~4x backward overhead.
+    # Inference only holds ~1-2 layers' outputs in flight at once.
+    bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16 if is_training else seq_len * hidden * 8
     if mixed_precision:
         bytes_per_sample //= 2
     return bytes_per_sample / (1024**3)
@@ -265,11 +269,11 @@ def _embedder_dim(meta: ModelMeta | None) -> int:
     if meta is None:
         return 768
     params = meta.params_millions
-    if params >= 300:
+    if params >= _PARAMS_LARGE:
         return 1024
-    if params >= 100:
+    if params >= _PARAMS_BASE:
         return 768
-    if params >= 50:
+    if params >= _PARAMS_SMALL:
         return 512
     return 384
 
@@ -313,7 +317,7 @@ def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int,
     data_bytes = 4.0 * stats.n_samples * n_features
     histograms_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS
     trees_bytes = iterations * (2**depth) * _CATBOOST_BYTES_PER_TREE_NODE
-    return (data_bytes + histograms_bytes + trees_bytes) / (1024**3)
+    return float((data_bytes + histograms_bytes + trees_bytes) / (1024**3))
 
 
 def _time_for_catboost(
@@ -360,7 +364,7 @@ def _classify_severity(estimate: float, budget: float) -> Severity:
     return Severity.AMPLE
 
 
-def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
+def _resource_phase(  # noqa: PLR0912, C901, PLR0915 - kept linear for clarity
     config: dict[str, Any],
     stats: DatasetStats,
     hardware: HardwareProfile,
@@ -394,7 +398,7 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
             transformer_entries.append((node_idx, node_type, entry))
 
     # Track the heaviest module per node so dump_modules accounting is bounded by
-    # "one selected variant per node × n_trials", not "sum of every candidate".
+    # "one selected variant per node x n_trials", not "sum of every candidate".
     node_max_weights: dict[int, float] = {}
 
     for node_idx, node_type, entry in transformer_entries:
@@ -490,7 +494,9 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
             on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda"
             # CatBoost's MultiClass loss grows per-class trees only above binary;
             # binary uses Logloss with one tree per iteration.
-            cb_class_mult = max(1, stats.n_classes) if stats.n_classes > 2 or stats.multilabel else 1
+            cb_class_mult = (
+                max(1, stats.n_classes) if stats.n_classes > _MULTICLASS_THRESHOLD or stats.multilabel else 1
+            )
             ram_total = _ram_for_catboost(
                 stats=stats,
                 n_features=embedder_dim,
@@ -569,7 +575,6 @@ def _resource_phase(  # noqa: PLR0912 - kept linear for clarity
         msg += f" vs available {hardware.vram_gb:.1f} GB"
         report.add("resource", vram_sev, msg, metric="vram")
 
-
     ram_sev = _classify_severity(effective_ram, hardware.ram_gb)
     report.add(
         "resource",
@@ -635,13 +640,14 @@ def _data_phase(
         max_len = _max_int(max_len_value, 512)
         if p95 > max_len:
             severity = Severity.OVER if p95 > max_len * 1.5 else Severity.TIGHT
+            module_name = entry.get("module_name", "?")
             report.add(
                 "data",
                 severity,
-                f"Train tokens p95~{p95} exceeds {entry.get('module_name', '?')}.max_length={max_len}; expect silent truncation.",
+                f"Train tokens p95~{p95} exceeds {module_name}.max_length={max_len}; expect silent truncation.",
             )
 
-    # rare class × linear-CV (LogisticRegressionCV cv=3 needs ≥3 samples/class;
+    # rare class x linear-CV (LogisticRegressionCV cv=3 needs >=3 samples/class;
     # multilabel path uses one-vs-rest without CV so the failure can't occur there)
     has_linear = any(e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or []))
     if has_linear and stats.rare_classes and not stats.multilabel:
@@ -651,7 +657,7 @@ def _data_phase(
             (f"LogisticRegressionCV (cv=3) will fail: classes {stats.rare_classes[:5]} have <3 samples."),
         )
 
-    # partial descriptions × description scorer
+    # partial descriptions x description scorer
     description_modules = {"description_bi", "description_cross", "description_llm"}
     has_description = any(
         e.get("module_name") in description_modules for _, e in _walk_modules(config.get("search_space") or [])
diff --git a/src/autointent/_advisor/_hardware.py b/src/autointent/_advisor/_hardware.py
index 9c0cae049..e959b6ebf 100644
--- a/src/autointent/_advisor/_hardware.py
+++ b/src/autointent/_advisor/_hardware.py
@@ -12,6 +12,7 @@
 import platform
 import shutil
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import Literal
 
 import psutil
@@ -24,6 +25,9 @@
 # matches macOS PYTORCH_MPS_HIGH_WATERMARK_RATIO default
 MPS_DEFAULT_BUDGET_RATIO = 0.7
 
+_HIGH_GPU_VRAM_GB = 24
+_MID_GPU_VRAM_GB = 12
+
 
 @dataclass
 class HardwareProfile:
@@ -41,20 +45,20 @@ def device_class(self) -> str:
             return "cpu"
         if self.accelerator == "mps":
             return "apple-silicon"
-        if self.vram_gb >= 24:
+        if self.vram_gb >= _HIGH_GPU_VRAM_GB:
             return "high-gpu"
-        if self.vram_gb >= 12:
+        if self.vram_gb >= _MID_GPU_VRAM_GB:
             return "mid-gpu"
         return "low-gpu"
 
 
 def _detect_ram_gb() -> float:
-    return psutil.virtual_memory().total / (1024**3)
+    return float(psutil.virtual_memory().total) / (1024**3)
 
 
 def _detect_free_disk_gb(path: str | None = None) -> float:
-    cache = path or os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
-    probe_path = cache if os.path.exists(cache) else os.path.expanduser("~")
+    cache = Path(path or os.environ.get("HF_HOME") or Path("~/.cache/huggingface").expanduser())
+    probe_path = cache if cache.exists() else Path("~").expanduser()
     try:
         usage = shutil.disk_usage(probe_path)
         return usage.free / (1024**3)
diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py
index 1c559ee2f..9b351952a 100644
--- a/src/autointent/_advisor/_hub.py
+++ b/src/autointent/_advisor/_hub.py
@@ -8,10 +8,10 @@
 from __future__ import annotations
 
 import logging
-import os
 import re
 from dataclasses import dataclass
 from functools import lru_cache
+from pathlib import Path
 from typing import Any
 
 from huggingface_hub import HfApi, scan_cache_dir, try_to_load_from_cache
@@ -54,7 +54,7 @@ def weights_gb(self) -> float:
 
 
 @lru_cache(maxsize=1)
-def hub_reachable(timeout_s: float = 2.0) -> bool:
+def hub_reachable() -> bool:
     """Single up-front probe. Memoized per process."""
     try:
         HfApi().list_models(limit=1)
@@ -157,7 +157,7 @@ def resolve_model(model_name: str) -> ModelMeta:
     Always returns a value — never raises — so the advisor can keep going
     on offline machines or for unknown checkpoints.
     """
-    if model_name.startswith("local:") or os.path.isabs(model_name):
+    if model_name.startswith("local:") or Path(model_name).is_absolute():
         return ModelMeta(
             name=model_name,
             params_millions=_heuristic_params_millions(model_name),
diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py
index a3778d307..82771ef9f 100644
--- a/src/autointent/_advisor/_render.py
+++ b/src/autointent/_advisor/_render.py
@@ -8,7 +8,7 @@
 from __future__ import annotations
 
 import json
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from ._report import PreflightReport
@@ -18,7 +18,7 @@
 _PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"}
 
 
-def _batch_hint(driver: dict) -> str:
+def _batch_hint(driver: dict[str, Any]) -> str:
     """Per-driver batch annotation: '64 → 32', '64', '64 (no fit)', or ''."""
     bs = driver.get("batch_size")
     if bs is None:
@@ -37,12 +37,11 @@ def _batch_hint(driver: dict) -> str:
 _DRIVERS_HEADERS = ("Node", "Model", "Mode", "VRAM", "Time", "Batch", "Source")
 
 
-def _render_drivers_table(drivers: list[dict]) -> list[str]:
+def _render_drivers_table(drivers: list[dict[str, Any]]) -> list[str]:
     """Format the Drivers of cost section as an aligned table."""
     visible = drivers[:_DRIVERS_LIMIT]
-    rows: list[tuple[str, ...]] = []
-    for d in visible:
-        rows.append((
+    rows: list[tuple[str, ...]] = [
+        (
             f"{d['node_type']}.{d['module']}",
             str(d["model"]),
             str(d["mode"]),
@@ -50,7 +49,9 @@ def _render_drivers_table(drivers: list[dict]) -> list[str]:
             f"{d['time_hours']:.2f} h",
             _batch_hint(d),
             f"[{d['confidence']}]",
-        ))
+        )
+        for d in visible
+    ]
 
     widths = [len(h) for h in _DRIVERS_HEADERS]
     for row in rows:
@@ -113,8 +114,7 @@ def render_text(report: PreflightReport) -> str:
 
     if report.notes:
         lines.append("Notes:")
-        for note in report.notes:
-            lines.append(f"  • {note}")
+        lines.extend(f"  • {note}" for note in report.notes)
         lines.append("")
 
     summary = f"Verdict: {'feasible' if report.is_feasible else 'INFEASIBLE'} "
diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py
index 5ac66af2f..9b4881611 100644
--- a/tests/advisor/test_estimates_internals.py
+++ b/tests/advisor/test_estimates_internals.py
@@ -128,7 +128,7 @@ def test_inference_activations_are_smaller_than_training(self, meta: ModelMeta)
         inf_acts = inf_total - inf_weights
         assert inf_acts > 0
         assert train_acts > inf_acts
-        # 12-layer model: training activations should be at least ~5× inference.
+        # 12-layer model: training activations should be at least ~5x inference.
         assert train_acts / inf_acts > 5
 
     def test_amp_does_not_reduce_weight_side_vram(self, meta: ModelMeta) -> None:
@@ -503,7 +503,8 @@ def test_max_batch_can_be_larger_than_current(self) -> None:
             _profile(vram_gb=64.0),
         )
         d = next(d for d in report.resource.drivers if d["module"] == "bert")
-        assert d["max_batch_size"] is not None and d["max_batch_size"] > 32
+        assert d["max_batch_size"] is not None
+        assert d["max_batch_size"] > 32
 
     def test_multiple_drivers_carry_independent_max_batch(self) -> None:
         cfg = {
diff --git a/tests/advisor/test_report.py b/tests/advisor/test_report.py
index 28adbfc34..dbfc7adf6 100644
--- a/tests/advisor/test_report.py
+++ b/tests/advisor/test_report.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import dataclasses
+
 import pytest
 
 from autointent._advisor._report import (
@@ -81,5 +83,5 @@ def test_hardware_and_dataset_pass_through(self) -> None:
 
     def test_finding_is_frozen(self) -> None:
         f = Finding(phase="resource", severity=Severity.AMPLE, message="ok")
-        with pytest.raises(Exception):  # noqa: PT011 - dataclass.FrozenInstanceError varies
+        with pytest.raises(dataclasses.FrozenInstanceError):
             f.message = "changed"  # type: ignore[misc]

From 334783c77f02b084a47d2e504782b55b296565ac Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:01:40 +0300
Subject: [PATCH 08/16] try to fix typing

---
 tests/advisor/test_estimates_and_cli.py | 6 +++---
 tests/advisor/test_report.py            | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py
index 3092dce9e..2f16555ae 100644
--- a/tests/advisor/test_estimates_and_cli.py
+++ b/tests/advisor/test_estimates_and_cli.py
@@ -56,21 +56,21 @@ def test_every_preset_inspects_without_raising(preset: str) -> None:
 
 
 def test_heavy_preset_is_infeasible_on_2gb_budget() -> None:
-    cfg = load_preset("transformers-heavy")  # type: ignore[arg-type]
+    cfg = load_preset("transformers-heavy")
     stats = DatasetStats.placeholder(n_samples=5000, n_classes=20, avg_tokens=40)
     report = run_preflight(cfg, stats, _profile(vram_gb=2.0), preset_name="transformers-heavy")
     assert not report.is_feasible, "deberta-v3-large should not fit in 2 GB"
 
 
 def test_light_preset_is_feasible_on_8gb_budget() -> None:
-    cfg = load_preset("transformers-light")  # type: ignore[arg-type]
+    cfg = load_preset("transformers-light")
     stats = DatasetStats.placeholder(n_samples=1000, n_classes=10, avg_tokens=24)
     report = run_preflight(cfg, stats, _profile(vram_gb=8.0), preset_name="transformers-light")
     assert report.is_feasible
 
 
 def test_n_jobs_doubles_vram_findings() -> None:
-    cfg = load_preset("transformers-light")  # type: ignore[arg-type]
+    cfg = load_preset("transformers-light")
     cfg = {**cfg, "hpo_config": {**(cfg.get("hpo_config") or {}), "n_jobs": 4}}
     stats = DatasetStats.placeholder()
     report = run_preflight(cfg, stats, _profile(vram_gb=4.0))
diff --git a/tests/advisor/test_report.py b/tests/advisor/test_report.py
index dbfc7adf6..acb2b5bf8 100644
--- a/tests/advisor/test_report.py
+++ b/tests/advisor/test_report.py
@@ -25,7 +25,7 @@ def test_red_beats_yellow_beats_green(self) -> None:
         r.add("data", Severity.TIGHT, "warn")
         assert r.headroom == Severity.TIGHT
         r.add("config", Severity.OVER, "fail")
-        assert r.headroom == Severity.OVER
+        assert r.headroom == Severity.OVER  # type: ignore[comparison-overlap]
 
     def test_is_feasible_flips_on_any_red(self) -> None:
         r = PreflightReport()

From 4e4da91edc5712017ad134b22db8b8260a1b3ca9 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:10:21 +0300
Subject: [PATCH 09/16] roll back config changes

---
 src/autointent/_presets/transformers-heavy.yaml | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/autointent/_presets/transformers-heavy.yaml b/src/autointent/_presets/transformers-heavy.yaml
index cd15d791e..2576fbc82 100644
--- a/src/autointent/_presets/transformers-heavy.yaml
+++ b/src/autointent/_presets/transformers-heavy.yaml
@@ -5,19 +5,12 @@ search_space:
       - module_name: bert
         classification_model_config:
           - model_name: microsoft/deberta-v3-large
-          - model_name: intfloat/multilingual-e5-large-instruct
-          - model_name: microsoft/harrier-oss-v1-27b
         num_train_epochs: [30]
         batch_size: [32, 64]
         learning_rate:
           low: 1.0e-5
           high: 1.0e-4
           log: True
-      - module_name: description_bi
-        embedder_config:
-          - model_name: microsoft/deberta-v3-large
-          - model_name: intfloat/multilingual-e5-large-instruct
-          - model_name: microsoft/harrier-oss-v1-27b
   - node_type: decision
     target_metric: decision_accuracy
     search_space:

From 8bd0b018ede3c89175cca41e6d71f0676d3f58d9 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:26:19 +0300
Subject: [PATCH 10/16] move cli logic

---
 src/autointent/_advisor/__init__.py   |   9 +-
 src/autointent/_advisor/_cli.py       | 231 +++-----------------------
 src/autointent/_advisor/_report.py    |  18 ++
 src/autointent/_advisor/_workflows.py | 231 ++++++++++++++++++++++++++
 src/autointent/custom_types/_types.py |  17 +-
 5 files changed, 294 insertions(+), 212 deletions(-)
 create mode 100644 src/autointent/_advisor/_workflows.py

diff --git a/src/autointent/_advisor/__init__.py b/src/autointent/_advisor/__init__.py
index 3ff898816..28422c78d 100644
--- a/src/autointent/_advisor/__init__.py
+++ b/src/autointent/_advisor/__init__.py
@@ -9,15 +9,22 @@
 
 from ._estimates import run_preflight
 from ._hardware import HardwareProfile, detect_hardware
-from ._report import DatasetStats, Finding, PreflightReport, ResourceEstimate, Severity
+from ._report import DatasetStats, Finding, PreflightReport, RecommendationResult, ResourceEstimate, Severity
+from ._workflows import BUNDLED_PRESETS, inspect, load_config, recommend, stats_from_dataset
 
 __all__ = [
+    "BUNDLED_PRESETS",
     "DatasetStats",
     "Finding",
     "HardwareProfile",
     "PreflightReport",
+    "RecommendationResult",
     "ResourceEstimate",
     "Severity",
     "detect_hardware",
+    "inspect",
+    "load_config",
+    "recommend",
     "run_preflight",
+    "stats_from_dataset",
 ]
diff --git a/src/autointent/_advisor/_cli.py b/src/autointent/_advisor/_cli.py
index 8c8b7b9d2..315d5c443 100644
--- a/src/autointent/_advisor/_cli.py
+++ b/src/autointent/_advisor/_cli.py
@@ -9,190 +9,40 @@
 csv/json/jsonl/parquet path loaded via ``datasets.load_dataset``) or
 ``--n-samples / --n-classes / --avg-tokens`` placeholders so the script is
 useful before the user has built a dataset.
+
+The CLI is a thin wrapper around :func:`autointent._advisor.inspect` and
+:func:`autointent._advisor.recommend`; callers that don't need argparse can
+import those helpers directly.
 """
 
 from __future__ import annotations
 
 import argparse
+import json
 import logging
 import sys
-from pathlib import Path
-from typing import TYPE_CHECKING, Any
-
-import yaml
-from datasets import ClassLabel, Sequence, load_dataset
-
-from autointent.utils import load_preset
 
-from ._estimates import run_preflight
-from ._hardware import detect_hardware
 from ._render import render_json, render_recommendation, render_text
-from ._report import DatasetStats, Severity
+from ._report import DatasetStats
+from ._workflows import BUNDLED_PRESETS, inspect, recommend, stats_from_dataset
 
-if TYPE_CHECKING:
-    from ._report import PreflightReport
+__all__ = ["BUNDLED_PRESETS", "build_parser", "cmd_inspect", "cmd_recommend", "main"]
 
 logger = logging.getLogger("autointent.advisor")
 
-_SAMPLE_LIMIT = 1000
-_P95_PERCENTILE = 0.95
-
-BUNDLED_PRESETS = [
-    "transformers-heavy",
-    "transformers-light",
-    "transformers-no-hpo",
-    "nn-heavy",
-    "nn-medium",
-    "classic-heavy",
-    "classic-medium",
-    "classic-light",
-    "zero-shot-encoders",
-    "zero-shot-llm",
-]
-
-# rough quality tiering used by `recommend`
-_QUALITY_TIER = {
-    "transformers-heavy": 5,
-    "nn-heavy": 4,
-    "transformers-light": 4,
-    "nn-medium": 3,
-    "classic-heavy": 3,
-    "transformers-no-hpo": 3,
-    "classic-medium": 2,
-    "classic-light": 1,
-    "zero-shot-encoders": 2,
-    "zero-shot-llm": 4,
-}
-
-
-def _load_config(target: str) -> tuple[dict[str, Any], str]:
-    """Return (config_dict, friendly_name) for either a preset or a path."""
-    path = Path(target)
-    if path.is_file():
-        with path.open(encoding="utf-8") as f:
-            return yaml.safe_load(f), path.stem
-    # treat as a bundled preset name
-    return load_preset(target), target  # type: ignore[arg-type]
-
 
 def _stats_from_args(args: argparse.Namespace) -> DatasetStats:
+    multilabel = args.task == "multilabel"
     if args.dataset:
-        return _stats_from_dataset(args.dataset, multilabel=args.task == "multilabel")
+        return stats_from_dataset(args.dataset, multilabel=multilabel)
     return DatasetStats.placeholder(
         n_samples=args.n_samples,
         n_classes=args.n_classes,
         avg_tokens=args.avg_tokens,
-        multilabel=args.task == "multilabel",
-    )
-
-
-_UTTERANCE_COLS = ("utterance", "text", "sentence", "query", "input")
-_LABEL_COLS = ("label", "labels", "intent", "target")
-# Map file extension → datasets builder name. Anything else is treated as a Hub
-# repo id or a directory and passed to load_dataset directly.
-_FILE_BUILDERS = {".csv": "csv", ".tsv": "csv", ".json": "json", ".jsonl": "json", ".parquet": "parquet"}
-
-
-def _stats_from_dataset(path: str, *, multilabel: bool) -> DatasetStats:
-    """Best-effort: load via HF ``datasets.load_dataset``.
-
-    Accepts a Hub repo id ('DeepPavlov/clinc150') or a local file path
-    (.csv / .json / .jsonl / .parquet) / dataset directory. Falls back to a
-    placeholder on any loader error so the advisor stays best-effort.
-    """
-    builder = _FILE_BUILDERS.get(Path(path).suffix.lower())
-    try:
-        ds = load_dataset(builder, data_files=path) if builder else load_dataset(path)
-    except (OSError, ValueError, FileNotFoundError) as e:
-        logger.warning("Failed to load dataset %s: %s", path, e)
-        return DatasetStats.placeholder(multilabel=multilabel)
-
-    train = ds["train"] if "train" in ds else next(iter(ds.values()), None)
-    if train is None:
-        return DatasetStats.placeholder(multilabel=multilabel)
-
-    cols = train.column_names
-    utt_col = next((c for c in _UTTERANCE_COLS if c in cols), cols[0] if cols else None)
-    label_col = next((c for c in _LABEL_COLS if c in cols), None)
-
-    detected_multilabel, n_classes = _label_shape(train, label_col, fallback_multilabel=multilabel)
-
-    sample = train[:_SAMPLE_LIMIT] if len(train) > _SAMPLE_LIMIT else train[:]
-    lengths = [len(str(s).split()) for s in (sample.get(utt_col, []) if utt_col else [])]
-    avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32
-    if lengths:
-        sorted_lengths = sorted(lengths)
-        idx = max(0, min(len(sorted_lengths) - 1, round((len(sorted_lengths) - 1) * _P95_PERCENTILE)))
-        p95 = sorted_lengths[idx]
-    else:
-        p95 = avg_tokens * 2
-
-    return DatasetStats(
-        n_samples=len(train),
-        n_classes=n_classes,
-        avg_tokens=avg_tokens,
-        p95_tokens=p95,
-        multilabel=detected_multilabel,
-        has_descriptions=None,
-        rare_classes=_rare_classes(train, label_col, detected_multilabel, n_classes) if label_col else [],
-        source=f"dataset:{path}",
+        multilabel=multilabel,
     )
 
 
-def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]:  # noqa: ANN401
-    """Derive (multilabel, n_classes) from the HF feature schema, with a value-based fallback."""
-    if label_col is None:
-        return fallback_multilabel, 0
-    feature = train.features.get(label_col)
-    if isinstance(feature, Sequence):
-        inner = feature.feature
-        if isinstance(inner, ClassLabel):
-            return True, inner.num_classes
-        # Sequence of plain ints — n_classes = max label index + 1.
-        max_idx = max((max(row) for row in train[label_col] if row), default=-1)
-        return True, max_idx + 1
-    if isinstance(feature, ClassLabel):
-        return False, feature.num_classes
-    # Plain int/string column. Detect multilabel from the first non-empty row, then count uniques.
-    is_multi = len(train) > 0 and isinstance(train[0][label_col], (list, tuple))
-    if is_multi:
-        max_idx = max((max(row) for row in train[label_col] if row), default=-1)
-        return True, max_idx + 1
-    return False, len({label for label in train[label_col] if label is not None})
-
-
-def _rare_classes(
-    train: Any,  # noqa: ANN401
-    label_col: str,
-    multilabel: bool,
-    n_classes: int,
-    min_count: int = 3,
-) -> list[str]:
-    """Return labels with fewer than ``min_count`` samples in the train split.
-
-    Used to surface the LogisticRegressionCV(cv=3) failure case before fit.
-    Returns an empty list on any error so the advisor stays best-effort.
-    """
-    try:
-        labels = train[label_col]
-    except (KeyError, AttributeError, TypeError):
-        return []
-    counts: dict[str, int] = {}
-    if multilabel:
-        for row in labels:
-            if not row:
-                continue
-            for i, v in enumerate(row):
-                if v:
-                    counts[str(i)] = counts.get(str(i), 0) + 1
-        for i in range(n_classes):
-            counts.setdefault(str(i), 0)
-    else:
-        for label in labels:
-            counts[str(label)] = counts.get(str(label), 0) + 1
-    return sorted(name for name, c in counts.items() if c < min_count)
-
-
 def _add_common_dataset_args(p: argparse.ArgumentParser) -> None:
     p.add_argument("--dataset", help="Path or hub id of a dataset; overrides placeholders.")
     p.add_argument("--n-samples", type=int, default=1_000, help="Placeholder training set size.")
@@ -207,63 +57,36 @@ def _add_common_dataset_args(p: argparse.ArgumentParser) -> None:
 
 
 def cmd_inspect(args: argparse.Namespace) -> int:
-    config, name = _load_config(args.target)
-    hardware = detect_hardware(
-        vram_budget_gb=args.budget_vram_gb,
+    report = inspect(
+        args.target,
+        stats=_stats_from_args(args),
+        budget_vram_gb=args.budget_vram_gb,
     )
-    stats = _stats_from_args(args)
-    report = run_preflight(config, stats, hardware, preset_name=name)
     if args.json:
         sys.stdout.write(render_json(report))
-        sys.stdout.write("\n")
     else:
         sys.stdout.write(render_text(report))
-        sys.stdout.write("\n")
+    sys.stdout.write("\n")
     return 0 if report.is_feasible else 1
 
 
 def cmd_recommend(args: argparse.Namespace) -> int:
-    hardware = detect_hardware(vram_budget_gb=args.budget_vram_gb)
-    stats = _stats_from_args(args)
-
-    results: list[tuple[str, PreflightReport]] = []
-
-    for preset in BUNDLED_PRESETS:
-        try:
-            cfg = load_preset(preset)  # type: ignore[arg-type]
-        except (OSError, ValueError, KeyError) as e:
-            logger.debug("Skipping preset %s: %s", preset, e)
-            continue
-        report = run_preflight(cfg, stats, hardware, preset_name=preset)
-        if args.budget_time_h is not None and report.resource.time_hours > args.budget_time_h:
-            report.add(
-                "resource",
-                Severity.OVER,
-                f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {args.budget_time_h} h.",
-            )
-        results.append((preset, report))
-
-    feasible = [(name, r) for name, r in results if r.is_feasible]
-    feasible.sort(key=lambda pair: (-_QUALITY_TIER.get(pair[0], 0), pair[1].resource.time_hours, pair[0]))
-    chosen = feasible[0][0] if feasible else None
-
+    result = recommend(
+        stats=_stats_from_args(args),
+        budget_vram_gb=args.budget_vram_gb,
+        budget_time_h=args.budget_time_h,
+    )
     if args.json:
-        import json
-
-        out = {
-            "chosen": chosen,
-            "results": [{"preset": name, "report": r.to_dict()} for name, r in results],
-        }
-        sys.stdout.write(json.dumps(out, indent=2, default=str))
+        sys.stdout.write(json.dumps(result.to_dict(), indent=2, default=str))
         sys.stdout.write("\n")
     else:
-        sys.stdout.write(render_recommendation(results, chosen))
+        sys.stdout.write(render_recommendation(result.results, result.chosen))
         sys.stdout.write("\n")
-        if chosen:
+        if result.chosen:
             sys.stdout.write("\n")
-            sys.stdout.write(render_text(dict(results)[chosen]))
+            sys.stdout.write(render_text(dict(result.results)[result.chosen]))
             sys.stdout.write("\n")
-    return 0 if chosen else 1
+    return 0 if result.chosen else 1
 
 
 def build_parser() -> argparse.ArgumentParser:
@@ -309,4 +132,4 @@ def main(argv: list[str] | None = None) -> int:
 
 
 if __name__ == "__main__":
-    raise SystemExit(main())
+    main()
diff --git a/src/autointent/_advisor/_report.py b/src/autointent/_advisor/_report.py
index 9b4a319c8..c9fd920f4 100644
--- a/src/autointent/_advisor/_report.py
+++ b/src/autointent/_advisor/_report.py
@@ -110,3 +110,21 @@ def to_dict(self) -> dict[str, Any]:
         d["headroom"] = self.headroom.value
         d["is_feasible"] = self.is_feasible
         return d
+
+
+@dataclass
+class RecommendationResult:
+    """Output of the recommend workflow: ranked per-preset reports plus the pick.
+
+    ``chosen`` is the best feasible preset name, or ``None`` if none fit.
+    ``results`` is the full per-preset report list in evaluation order.
+    """
+
+    chosen: str | None
+    results: list[tuple[str, PreflightReport]]
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "chosen": self.chosen,
+            "results": [{"preset": name, "report": r.to_dict()} for name, r in self.results],
+        }
diff --git a/src/autointent/_advisor/_workflows.py b/src/autointent/_advisor/_workflows.py
new file mode 100644
index 000000000..0bd7ee5bf
--- /dev/null
+++ b/src/autointent/_advisor/_workflows.py
@@ -0,0 +1,231 @@
+"""High-level advisor workflows: ``inspect`` and ``recommend``.
+
+Each workflow orchestrates the lower-level pieces (``load_config``,
+``detect_hardware``, ``stats_from_dataset``, ``run_preflight``) into a single
+typed call. They expose the same logic the CLI uses but accept Python
+arguments instead of an ``argparse.Namespace`` — useful from notebooks,
+integration tests, or any caller that wants a ``PreflightReport`` /
+``RecommendationResult`` directly.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, get_args
+
+import yaml
+from datasets import ClassLabel, Sequence, load_dataset
+
+from autointent.custom_types import SearchSpacePreset
+from autointent.utils import load_preset
+
+from ._estimates import run_preflight
+from ._hardware import detect_hardware
+from ._report import DatasetStats, RecommendationResult, Severity
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from ._report import PreflightReport
+
+
+logger = logging.getLogger("autointent.advisor")
+
+_SAMPLE_LIMIT = 1000
+_P95_PERCENTILE = 0.95
+BUNDLED_PRESETS: tuple[str, ...] = get_args(SearchSpacePreset)
+
+
+def load_config(target: str) -> tuple[dict[str, Any], str]:
+    """Return ``(config_dict, friendly_name)`` for either a preset name or a YAML path."""
+    path = Path(target)
+    if path.is_file():
+        with path.open(encoding="utf-8") as f:
+            return yaml.safe_load(f), path.stem
+    return load_preset(target), target  # type: ignore[arg-type]
+
+
+def stats_from_dataset(path: str, *, multilabel: bool = False) -> DatasetStats:
+    """Best-effort: load a dataset via HF ``datasets.load_dataset`` and derive advisor stats.
+
+    Accepts a Hub repo id (``DeepPavlov/clinc150``) or a local file path
+    (``.csv`` / ``.json`` / ``.jsonl`` / ``.parquet``) / dataset directory. Falls
+    back to a placeholder on any loader error so callers stay best-effort.
+    """
+    # Anything not in this map (no suffix, unknown suffix) is treated as a Hub
+    # repo id or a dataset directory and passed to load_dataset directly.
+    file_builders = {".csv": "csv", ".tsv": "csv", ".json": "json", ".jsonl": "json", ".parquet": "parquet"}
+    builder = file_builders.get(Path(path).suffix.lower())
+    try:
+        ds = load_dataset(builder, data_files=path) if builder else load_dataset(path)
+    except (OSError, ValueError, FileNotFoundError) as e:
+        logger.warning("Failed to load dataset %s: %s", path, e)
+        return DatasetStats.placeholder(multilabel=multilabel)
+
+    train = ds["train"] if "train" in ds else next(iter(ds.values()), None)
+    if train is None:
+        return DatasetStats.placeholder(multilabel=multilabel)
+
+    cols = train.column_names
+    utt_col = next(
+        (c for c in ("utterance", "text", "sentence", "query", "input") if c in cols), cols[0] if cols else None
+    )
+    label_col = next((c for c in ("label", "labels", "intent", "target") if c in cols), None)
+
+    detected_multilabel, n_classes = _label_shape(train, label_col, fallback_multilabel=multilabel)
+
+    sample = train[:_SAMPLE_LIMIT] if len(train) > _SAMPLE_LIMIT else train[:]
+    lengths = [len(str(s).split()) for s in (sample.get(utt_col, []) if utt_col else [])]
+    avg_tokens = int(sum(lengths) / max(1, len(lengths))) if lengths else 32
+    if lengths:
+        sorted_lengths = sorted(lengths)
+        idx = max(0, min(len(sorted_lengths) - 1, round((len(sorted_lengths) - 1) * _P95_PERCENTILE)))
+        p95 = sorted_lengths[idx]
+    else:
+        p95 = avg_tokens * 2
+
+    return DatasetStats(
+        n_samples=len(train),
+        n_classes=n_classes,
+        avg_tokens=avg_tokens,
+        p95_tokens=p95,
+        multilabel=detected_multilabel,
+        has_descriptions=None,
+        rare_classes=_rare_classes(train, label_col, detected_multilabel, n_classes) if label_col else [],
+        source=f"dataset:{path}",
+    )
+
+
+def _label_shape(train: Any, label_col: str | None, *, fallback_multilabel: bool) -> tuple[bool, int]:  # noqa: ANN401
+    """Derive ``(multilabel, n_classes)`` from the HF feature schema with a value-based fallback."""
+    if label_col is None:
+        return fallback_multilabel, 0
+    feature = train.features.get(label_col)
+    if isinstance(feature, Sequence):
+        inner = feature.feature
+        if isinstance(inner, ClassLabel):
+            return True, inner.num_classes
+        # Sequence of plain ints — n_classes = max label index + 1.
+        max_idx = max((max(row) for row in train[label_col] if row), default=-1)
+        return True, max_idx + 1
+    if isinstance(feature, ClassLabel):
+        return False, feature.num_classes
+    # Plain int/string column. Detect multilabel from the first non-empty row, then count uniques.
+    is_multi = len(train) > 0 and isinstance(train[0][label_col], (list, tuple))
+    if is_multi:
+        max_idx = max((max(row) for row in train[label_col] if row), default=-1)
+        return True, max_idx + 1
+    return False, len({label for label in train[label_col] if label is not None})
+
+
+def _rare_classes(
+    train: Any,  # noqa: ANN401
+    label_col: str,
+    multilabel: bool,
+    n_classes: int,
+    min_count: int = 3,
+) -> list[str]:
+    """Return labels with fewer than ``min_count`` samples in the train split.
+
+    Used to surface the LogisticRegressionCV(cv=3) failure case before fit.
+    Returns an empty list on any error so the advisor stays best-effort.
+    """
+    try:
+        labels = train[label_col]
+    except (KeyError, AttributeError, TypeError):
+        return []
+    counts: dict[str, int] = {}
+    if multilabel:
+        for row in labels:
+            if not row:
+                continue
+            for i, v in enumerate(row):
+                if v:
+                    counts[str(i)] = counts.get(str(i), 0) + 1
+        for i in range(n_classes):
+            counts.setdefault(str(i), 0)
+    else:
+        for label in labels:
+            counts[str(label)] = counts.get(str(label), 0) + 1
+    return sorted(name for name, c in counts.items() if c < min_count)
+
+
+def inspect(
+    target: str,
+    *,
+    stats: DatasetStats | None = None,
+    budget_vram_gb: float | None = None,
+) -> PreflightReport:
+    """Inspect a preset (or YAML config path) against the local hardware.
+
+    Args:
+        target: Bundled preset name (e.g. ``'transformers-light'``) or a YAML
+            config path. The friendly name surfaced in the report is the file
+            stem for paths and the preset name otherwise.
+        stats: Dataset stats to score against. Defaults to a placeholder if
+            ``None``.
+        budget_vram_gb: Optional VRAM-budget override for the hardware probe.
+
+    Returns:
+        ``PreflightReport`` covering resource / data / config phases.
+    """
+    config, name = load_config(target)
+    hardware = detect_hardware(vram_budget_gb=budget_vram_gb)
+    return run_preflight(config, stats or DatasetStats.placeholder(), hardware, preset_name=name)
+
+
+def recommend(
+    *,
+    stats: DatasetStats | None = None,
+    presets: Iterable[str] | None = None,
+    budget_vram_gb: float | None = None,
+    budget_time_h: float | None = None,
+) -> RecommendationResult:
+    """Walk bundled presets and return the best feasible fit plus all per-preset reports.
+
+    Args:
+        stats: Dataset stats to score against. Defaults to a placeholder if ``None``.
+        presets: Override of the preset list (defaults to ``BUNDLED_PRESETS``).
+        budget_vram_gb: Optional VRAM-budget override for the hardware probe.
+        budget_time_h: Optional wall-time ceiling in hours; presets exceeding it
+            get an extra ``Severity.OVER`` finding so they drop out of the
+            feasible ranking.
+
+    Returns:
+        ``RecommendationResult`` with the chosen preset name and full results list.
+
+    Note:
+        Among feasible presets we pick the one with the largest estimated
+        ``time_hours`` (ties broken alphabetically). Higher-quality presets cost
+        more wall-time, so the slowest feasible preset is also the heaviest
+        preset that still fits the hardware — i.e. "use what you have".
+    """
+    hardware = detect_hardware(vram_budget_gb=budget_vram_gb)
+    stats = stats or DatasetStats.placeholder()
+    preset_iter = list(presets) if presets is not None else BUNDLED_PRESETS
+
+    results: list[tuple[str, PreflightReport]] = []
+    for preset in preset_iter:
+        try:
+            cfg = load_preset(preset)  # type: ignore[arg-type]
+        except (OSError, ValueError, KeyError) as e:
+            logger.debug("Skipping preset %s: %s", preset, e)
+            continue
+        report = run_preflight(cfg, stats, hardware, preset_name=preset)
+        if budget_time_h is not None and report.resource.time_hours > budget_time_h:
+            report.add(
+                "resource",
+                Severity.OVER,
+                f"Estimated time {report.resource.time_hours:.1f} h exceeds budget {budget_time_h} h.",
+            )
+        results.append((preset, report))
+
+    # Rank by Literal position (lower index = higher quality); presets the user
+    # passed via the ``presets`` override but not in BUNDLED_PRESETS sort last.
+    quality_rank = {name: i for i, name in enumerate(BUNDLED_PRESETS)}
+    feasible = [(name, r) for name, r in results if r.is_feasible]
+    feasible.sort(key=lambda pair: (quality_rank.get(pair[0], len(BUNDLED_PRESETS)), pair[0]))
+    chosen = feasible[0][0] if feasible else None
+
+    return RecommendationResult(chosen=chosen, results=results)
diff --git a/src/autointent/custom_types/_types.py b/src/autointent/custom_types/_types.py
index cbfa82576..a54da368d 100644
--- a/src/autointent/custom_types/_types.py
+++ b/src/autointent/custom_types/_types.py
@@ -117,18 +117,21 @@ class Split:
 """
 
 SearchSpacePreset = Literal[
-    "classic-heavy",
-    "classic-light",
-    "classic-medium",
-    "nn-heavy",
-    "nn-medium",
     "transformers-heavy",
     "transformers-light",
-    "transformers-no-hpo",
+    "nn-heavy",
     "zero-shot-llm",
+    "nn-medium",
+    "classic-heavy",
+    "transformers-no-hpo",
+    "classic-medium",
     "zero-shot-encoders",
+    "classic-light",
 ]
-"""Some presets that our library supports."""
+"""Bundled search-space presets, listed in descending quality order.
+
+The order is consumed by ``autointent._advisor.recommend`` to pick the
+highest-quality feasible preset (lower index = higher quality)."""
 
 
 class Document(BaseModel):

From b77d57586028af16a01b210161af702dcf34e789 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:57:07 +0300
Subject: [PATCH 11/16] simplify logic

---
 src/autointent/_advisor/_estimates.py     | 477 ++++++++++++----------
 src/autointent/_advisor/_hub.py           |  47 ++-
 tests/advisor/test_estimates_internals.py |  10 +-
 3 files changed, 320 insertions(+), 214 deletions(-)

diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py
index 93dfcedaf..274407c26 100644
--- a/src/autointent/_advisor/_estimates.py
+++ b/src/autointent/_advisor/_estimates.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import logging
+from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
 from pydantic import BaseModel, ConfigDict, Field, ValidationError
@@ -26,9 +27,11 @@
     from ._report import DatasetStats
 
 _MULTICLASS_THRESHOLD = 2
-_PARAMS_LARGE = 300
-_PARAMS_BASE = 100
-_PARAMS_SMALL = 50
+
+# Fallback architecture shape (BERT-base) used only when the model's actual
+# config.json couldn't be fetched from HF Hub — see _hub._shape_from_config.
+_DEFAULT_HIDDEN = 768
+_DEFAULT_LAYERS = 12
 
 logger = logging.getLogger(__name__)
 
@@ -77,12 +80,14 @@ def _validated_config(config: dict[str, Any]) -> _AdvisorConfig:
 }
 
 # Maps each fine-tunable transformer module to its training-mode label.
-# Modules not listed are treated as inference-only.
+# Modules not listed (or listed as "inference") run the encoder forward-only.
+# Note: dnnc keeps the cross-encoder frozen and trains an sklearn LogisticRegressionCV
+# head on top of its features (see autointent._wrappers.ranker.Ranker._fit), so the
+# encoder's VRAM profile matches inference rather than fine-tuning.
 _TRANSFORMER_TRAINING_MODE = {
     "bert": "full-finetune",
     "ptuning": "lora",
     "lora": "lora",
-    "dnnc": "reranker",
 }
 
 # Fallback max_length when the search-space entry doesn't pin it. Used both as
@@ -148,17 +153,16 @@ def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dic
 def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float:
     """Weight-side VRAM in GB — weights + grads + Adam optimizer state. Excludes activations.
 
-    Full fine-tune fp32: W + W + 2W (Adam m, v) = 4W.
-    Full fine-tune AMP: 0.5W (fp16 weights) + 0.5W (fp16 grads) + W (fp32 master) + 2W (fp32 Adam) = 4W.
-    AMP's savings live in activations, not the optimizer — the weight side is identical.
+    Modes:
+      * ``inference``: forward only — weights + ~30% intermediate-tensor overhead.
+      * ``lora``: frozen base + small trainable adapters + their grads/optimizer (~0.5 GB).
+      * ``full-finetune`` (default): weights + grads + Adam (m, v) = 4x weights.
     """
     weights_gb = meta.weights_gb
     if mode == "inference":
         return weights_gb * 1.3
     if mode == "lora":
         return weights_gb * 1.3 + 0.5
-    if mode == "reranker":
-        return weights_gb * 1.5
     return weights_gb * 4.0
 
 
@@ -200,20 +204,10 @@ def _floor_to_power_of_two(n: int) -> int:
 
 
 def _n_layers(meta: ModelMeta | None) -> int:
-    """Coarse layer-count guess from parameter count.
-
-    MiniLM (33M) ~6, BERT-base (110M) ~12, BERT-large (350M) ~24.
-    """
-    if meta is None:
-        return 12
-    params = meta.params_millions
-    if params >= _PARAMS_LARGE:
-        return 24
-    if params >= _PARAMS_BASE:
-        return 12
-    if params >= _PARAMS_SMALL:
-        return 8
-    return 6
+    """Layer count from the model's ``config.json``; falls back to BERT-base when absent."""
+    if meta is not None and meta.n_layers is not None:
+        return meta.n_layers
+    return _DEFAULT_LAYERS
 
 
 def _activations_gb_per_sample(
@@ -262,20 +256,10 @@ def _max_fitting_batch_size(
 
 
 def _embedder_dim(meta: ModelMeta | None) -> int:
-    """Coarse hidden-size guess from parameter count.
-
-    Concrete points: MiniLM (33M) ~384, BERT-base (110M) ~768, BERT-large (350M) ~1024.
-    """
-    if meta is None:
-        return 768
-    params = meta.params_millions
-    if params >= _PARAMS_LARGE:
-        return 1024
-    if params >= _PARAMS_BASE:
-        return 768
-    if params >= _PARAMS_SMALL:
-        return 512
-    return 384
+    """Hidden size from the model's ``config.json``; falls back to BERT-base when absent."""
+    if meta is not None and meta.hidden_size is not None:
+        return meta.hidden_size
+    return _DEFAULT_HIDDEN
 
 
 def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None:
@@ -364,146 +348,139 @@ def _classify_severity(estimate: float, budget: float) -> Severity:
     return Severity.AMPLE
 
 
-def _resource_phase(  # noqa: PLR0912, C901, PLR0915 - kept linear for clarity
-    config: dict[str, Any],
-    stats: DatasetStats,
-    hardware: HardwareProfile,
-    report: PreflightReport,
-) -> None:
-    cfg = _validated_config(config)
-    n_trials = max(1, cfg.hpo_config.n_trials)
-    n_jobs = max(1, cfg.hpo_config.n_jobs)
-    refit_after = cfg.refit_after
-    dump_modules = cfg.dump_modules
+@dataclass
+class _ModuleEstimate:
+    """Per-module cost contribution + the dict that gets rendered in the report."""
 
-    if not hub_reachable():
-        report.low_confidence = True
-        report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.")
+    driver: dict[str, Any]
+    vram_gb: float
+    ram_gb: float
+    time_hours: float
+    model_weights_gb: float = 0.0
 
-    seen_models: dict[str, ModelMeta] = {}
-    estimate = ResourceEstimate(parallel_factor=n_jobs)
 
-    global_embedder = (cfg.embedder_config or {}).get("model_name")
-    if global_embedder:
-        seen_models[global_embedder] = resolve_model(global_embedder)
+def _refit_factor(*, refit_after: bool, n_trials: int) -> float:
+    """Wall-time multiplier for ``refit_after=True`` (amortized 1/n_trials extra)."""
+    return 1 + 1.0 / max(1, n_trials) if refit_after else 1.0
 
-    # First pass: walk transformer-bearing modules (collects seen_models for embedder_dim lookup).
-    transformer_entries: list[tuple[int, str, dict[str, Any]]] = []
-    classic_entries: list[tuple[int, str, dict[str, Any]]] = []
-    for node_idx, node_type, entry in _walk_modules_indexed(cfg.search_space):
-        module = entry.get("module_name", "?")
-        if module in {"linear", "catboost"}:
-            classic_entries.append((node_idx, node_type, entry))
-        else:
-            transformer_entries.append((node_idx, node_type, entry))
 
-    # Track the heaviest module per node so dump_modules accounting is bounded by
-    # "one selected variant per node x n_trials", not "sum of every candidate".
-    node_max_weights: dict[int, float] = {}
+def _split_entries(
+    search_space: list[dict[str, Any]],
+) -> tuple[list[tuple[int, str, dict[str, Any]]], list[tuple[int, str, dict[str, Any]]]]:
+    """Partition search-space entries into (transformer-bearing, classic)."""
+    transformer, classic = [], []
+    for node_idx, node_type, entry in _walk_modules_indexed(search_space):
+        bucket = classic if entry.get("module_name") in {"linear", "catboost"} else transformer
+        bucket.append((node_idx, node_type, entry))
+    return transformer, classic
 
-    for node_idx, node_type, entry in transformer_entries:
-        module = entry.get("module_name", "?")
-        model_names = _extract_model_names(entry)
-        if not model_names and global_embedder and module in {"knn", "mlknn"}:
-            model_names = [global_embedder]
 
-        for name in model_names:
-            meta = seen_models.setdefault(name, resolve_model(name))
+def _estimate_transformer_model(
+    *,
+    meta: ModelMeta,
+    entry: dict[str, Any],
+    node_type: str,
+    module: str,
+    name: str,
+    stats: DatasetStats,
+    hardware: HardwareProfile,
+    n_trials: int,
+    refit_after: bool,
+) -> _ModuleEstimate:
+    """One row of cost for a transformer module + a specific model checkpoint."""
+    mixed_precision = entry.get("dtype") in {"fp16", "bf16"}
+    mode = _TRANSFORMER_TRAINING_MODE.get(module, "inference")
+    batch_size = _max_int(entry.get("batch_size"), 32)
+    epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10)
+    seq_len = _max_int(entry.get("max_length"), _DEFAULT_SEQ_LEN)
+
+    vram = _vram_for_transformer(meta, mode, mixed_precision, batch_size=batch_size, seq_len=seq_len)
+    ram = _ram_for_module(meta, stats)
+
+    driver_max_batch: int | None = None
+    if hardware.vram_gb > 0:
+        driver_max_batch = _max_fitting_batch_size(
+            weight_vram_gb=_weights_vram_for_transformer(meta, mode),
+            vram_budget_gb=hardware.vram_gb,
+            per_sample_gb=_activations_gb_per_sample(
+                meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference"
+            ),
+        )
 
-            mixed_precision = entry.get("dtype") in {"fp16", "bf16"}
-            mode = _TRANSFORMER_TRAINING_MODE.get(module, "inference")
-
-            batch_size = _max_int(entry.get("batch_size"), 32)
-            epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10)
-            seq_len = _max_int(entry.get("max_length"), _DEFAULT_SEQ_LEN)
-
-            vram = _vram_for_transformer(meta, mode, mixed_precision, batch_size=batch_size, seq_len=seq_len)
-            ram = _ram_for_module(meta, stats)
-
-            driver_max_batch: int | None = None
-            if hardware.vram_gb > 0:
-                weights_vram = _weights_vram_for_transformer(meta, mode)
-                per_sample_gb = _activations_gb_per_sample(
-                    meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference"
-                )
-                driver_max_batch = _max_fitting_batch_size(
-                    weight_vram_gb=weights_vram,
-                    vram_budget_gb=hardware.vram_gb,
-                    per_sample_gb=per_sample_gb,
-                )
-
-            time_h = _time_for_transformer(
-                meta=meta,
-                n_trials=n_trials,
-                epochs=epochs,
-                batch_size=batch_size,
-                n_samples=stats.n_samples,
-                device_class=hardware.device_class,
-            )
-            if refit_after and mode != "inference":
-                time_h *= 1 + 1.0 / max(1, n_trials)
-
-            estimate.vram_gb = max(estimate.vram_gb, vram)
-            estimate.ram_gb = max(estimate.ram_gb, ram)
-            estimate.time_hours += time_h
-            node_max_weights[node_idx] = max(node_max_weights.get(node_idx, 0.0), meta.weights_gb)
-            estimate.drivers.append(
-                {
-                    "node_type": node_type,
-                    "module": module,
-                    "model": name,
-                    "mode": mode,
-                    "vram_gb": round(vram, 2),
-                    "ram_gb": round(ram, 2),
-                    "time_hours": round(time_h, 2),
-                    "batch_size": batch_size,
-                    "max_batch_size": driver_max_batch,
-                    "confidence": meta.confidence,
-                }
-            )
+    time_h = _time_for_transformer(
+        meta=meta,
+        n_trials=n_trials,
+        epochs=epochs,
+        batch_size=batch_size,
+        n_samples=stats.n_samples,
+        device_class=hardware.device_class,
+    )
+    if mode != "inference":
+        time_h *= _refit_factor(refit_after=refit_after, n_trials=n_trials)
+
+    return _ModuleEstimate(
+        driver={
+            "node_type": node_type,
+            "module": module,
+            "model": name,
+            "mode": mode,
+            "vram_gb": round(vram, 2),
+            "ram_gb": round(ram, 2),
+            "time_hours": round(time_h, 2),
+            "batch_size": batch_size,
+            "max_batch_size": driver_max_batch,
+            "confidence": meta.confidence,
+        },
+        vram_gb=vram,
+        ram_gb=ram,
+        time_hours=time_h,
+        model_weights_gb=meta.weights_gb,
+    )
 
-    # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint.
-    embedder_meta = _largest_embedder(seen_models)
-    embedder_dim = _embedder_dim(embedder_meta)
-    # Both multinomial (multiclass) and one-vs-rest (multilabel) LR scale linearly in n_classes;
-    # the multiclass path additionally pays the LogisticRegressionCV inner-fit multiplier.
-    class_multiplier_classic = max(1, stats.n_classes)
-    confidence = embedder_meta.confidence if embedder_meta else "heuristic"
-    embedder_label = embedder_meta.name if embedder_meta else "(no embedder)"
-    for _node_idx, node_type, entry in classic_entries:
-        module = entry.get("module_name", "?")
-        if module == "linear":
-            max_iter = _max_int(entry.get("max_iter"), 100)
-            cv_multiplier = 1 if stats.multilabel else _LOGREG_CV_MULTIPLIER
-            ram = _ram_for_linear(stats=stats, embedder_dim=embedder_dim)
-            time_h = _time_for_linear(
+
+def _estimate_classic_entry(
+    *,
+    entry: dict[str, Any],
+    node_type: str,
+    embedder_meta: ModelMeta | None,
+    embedder_dim: int,
+    stats: DatasetStats,
+    hardware: HardwareProfile,
+    n_trials: int,
+    refit_after: bool,
+) -> _ModuleEstimate | None:
+    """Cost row for a linear or catboost scorer (returns ``None`` for any other module)."""
+    module = entry.get("module_name", "?")
+    refit = _refit_factor(refit_after=refit_after, n_trials=n_trials)
+    # Both multinomial (multiclass) and one-vs-rest (multilabel) LR scale linearly in n_classes.
+    class_multiplier = max(1, stats.n_classes)
+
+    if module == "linear":
+        cv_multiplier = 1 if stats.multilabel else _LOGREG_CV_MULTIPLIER
+        ram = _ram_for_linear(stats=stats, embedder_dim=embedder_dim)
+        time_h = (
+            _time_for_linear(
                 n_trials=n_trials,
                 n_samples=stats.n_samples,
                 embedder_dim=embedder_dim,
-                max_iter=max_iter,
+                max_iter=_max_int(entry.get("max_iter"), 100),
                 cv_multiplier=cv_multiplier,
-                class_multiplier=class_multiplier_classic,
-            )
-            if refit_after:
-                time_h *= 1 + 1.0 / max(1, n_trials)
-            vram = 0.0
-            mode = "linear-cv" if cv_multiplier > 1 else "linear"
-        elif module == "catboost":
-            iterations = _max_int(entry.get("iterations"), 1000)
-            depth = _max_int(entry.get("depth"), 6)
-            on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda"
-            # CatBoost's MultiClass loss grows per-class trees only above binary;
-            # binary uses Logloss with one tree per iteration.
-            cb_class_mult = (
-                max(1, stats.n_classes) if stats.n_classes > _MULTICLASS_THRESHOLD or stats.multilabel else 1
-            )
-            ram_total = _ram_for_catboost(
-                stats=stats,
-                n_features=embedder_dim,
-                iterations=iterations,
-                depth=depth,
+                class_multiplier=class_multiplier,
             )
-            time_h = _time_for_catboost(
+            * refit
+        )
+        vram = 0.0
+        mode = "linear-cv" if cv_multiplier > 1 else "linear"
+    elif module == "catboost":
+        on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda"
+        # CatBoost MultiClass loss grows per-class trees only above binary; binary uses
+        # Logloss with one tree per iteration.
+        cb_class_mult = class_multiplier if stats.n_classes > _MULTICLASS_THRESHOLD or stats.multilabel else 1
+        iterations = _max_int(entry.get("iterations"), 1000)
+        depth = _max_int(entry.get("depth"), 6)
+        ram_total = _ram_for_catboost(stats=stats, n_features=embedder_dim, iterations=iterations, depth=depth)
+        time_h = (
+            _time_for_catboost(
                 n_trials=n_trials,
                 n_samples=stats.n_samples,
                 n_features=embedder_dim,
@@ -512,55 +489,66 @@ def _resource_phase(  # noqa: PLR0912, C901, PLR0915 - kept linear for clarity
                 class_multiplier=cb_class_mult,
                 on_gpu=on_gpu,
             )
-            if refit_after:
-                time_h *= 1 + 1.0 / max(1, n_trials)
-            vram, ram = (ram_total, 0.0) if on_gpu else (0.0, ram_total)
-            mode = "catboost-gpu" if on_gpu else "catboost"
-        else:
-            continue
-
-        estimate.vram_gb = max(estimate.vram_gb, vram)
-        estimate.ram_gb = max(estimate.ram_gb, ram)
-        estimate.time_hours += time_h
-        estimate.drivers.append(
-            {
-                "node_type": node_type,
-                "module": module,
-                "model": embedder_label,
-                "mode": mode,
-                "vram_gb": round(vram, 2),
-                "ram_gb": round(ram, 2),
-                "time_hours": round(time_h, 2),
-                "batch_size": None,
-                "max_batch_size": None,
-                "confidence": confidence,
-            }
+            * refit
         )
+        vram, ram = (ram_total, 0.0) if on_gpu else (0.0, ram_total)
+        mode = "catboost-gpu" if on_gpu else "catboost"
+    else:
+        return None
+
+    return _ModuleEstimate(
+        driver={
+            "node_type": node_type,
+            "module": module,
+            "model": embedder_meta.name if embedder_meta else "(no embedder)",
+            "mode": mode,
+            "vram_gb": round(vram, 2),
+            "ram_gb": round(ram, 2),
+            "time_hours": round(time_h, 2),
+            "batch_size": None,
+            "max_batch_size": None,
+            "confidence": embedder_meta.confidence if embedder_meta else "heuristic",
+        },
+        vram_gb=vram,
+        ram_gb=ram,
+        time_hours=time_h,
+    )
 
+
+def _aggregate_disk(
+    estimate: ResourceEstimate,
+    seen_models: dict[str, ModelMeta],
+    node_max_weights: dict[int, float],
+    *,
+    dump_modules: bool,
+    n_trials: int,
+) -> None:
+    """Fold per-model download/cached sizes into ``estimate`` and apply dump-modules accounting."""
     for meta in seen_models.values():
         if meta.cached_locally:
             estimate.disk_cached_gb += meta.disk_gb
         else:
             estimate.disk_download_gb += meta.disk_gb
-
     if dump_modules:
         # Each trial selects one variant per node, so per-trial dumped weights
         # are bounded by the heaviest module in each node, summed across nodes.
-        per_trial_dump_gb = sum(node_max_weights.values())
-        estimate.disk_dump_gb = per_trial_dump_gb * n_trials
+        estimate.disk_dump_gb = sum(node_max_weights.values()) * n_trials
 
-    if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}:
-        effective_vram = estimate.vram_gb * n_jobs
-    else:
-        effective_vram = estimate.vram_gb
+
+def _emit_resource_findings(
+    report: PreflightReport,
+    estimate: ResourceEstimate,
+    hardware: HardwareProfile,
+    *,
+    n_jobs: int,
+) -> None:
+    """Translate aggregated estimates into VRAM/RAM/disk/time findings on the report."""
+    parallel_gpu = n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}
+    effective_vram = estimate.vram_gb * n_jobs if parallel_gpu else estimate.vram_gb
     # MPS shares one unified pool: parallel workers each allocate weights+activations
     # in RAM, so peak RAM also scales with n_jobs on Apple Silicon.
     effective_ram = estimate.ram_gb * n_jobs if n_jobs > 1 and hardware.accelerator == "mps" else estimate.ram_gb
 
-    report.resource = estimate
-
-    # render findings
-    vram_sev = _classify_severity(effective_vram, hardware.vram_gb)
     if hardware.accelerator == "cpu" and effective_vram > 0:
         report.add(
             "resource",
@@ -573,29 +561,108 @@ def _resource_phase(  # noqa: PLR0912, C901, PLR0915 - kept linear for clarity
         if n_jobs > 1:
             msg += f" (= per-trial {estimate.vram_gb:.1f} GB × {n_jobs} parallel trials)"
         msg += f" vs available {hardware.vram_gb:.1f} GB"
-        report.add("resource", vram_sev, msg, metric="vram")
+        report.add("resource", _classify_severity(effective_vram, hardware.vram_gb), msg, metric="vram")
 
-    ram_sev = _classify_severity(effective_ram, hardware.ram_gb)
     report.add(
         "resource",
-        ram_sev,
+        _classify_severity(effective_ram, hardware.ram_gb),
         f"RAM ~{effective_ram:.1f} GB vs available {hardware.ram_gb:.1f} GB",
         metric="ram",
     )
 
     disk_total = estimate.disk_download_gb + estimate.disk_dump_gb
-    disk_sev = _classify_severity(disk_total, hardware.free_disk_gb)
     disk_msg = f"Disk ~{estimate.disk_download_gb:.1f} GB to download"
     if estimate.disk_cached_gb > 0:
         disk_msg += f", {estimate.disk_cached_gb:.1f} GB already cached"
     if estimate.disk_dump_gb > 0:
         disk_msg += f", +{estimate.disk_dump_gb:.1f} GB during training (dump_modules=True)"
     disk_msg += f" vs {hardware.free_disk_gb:.0f} GB free"
-    report.add("resource", disk_sev, disk_msg, metric="disk")
+    report.add("resource", _classify_severity(disk_total, hardware.free_disk_gb), disk_msg, metric="disk")
 
     if estimate.time_hours > 0:
-        time_msg = f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)"
-        report.add("resource", Severity.AMPLE, time_msg, metric="time")
+        report.add(
+            "resource",
+            Severity.AMPLE,
+            f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)",
+            metric="time",
+        )
+
+
+def _resource_phase(
+    config: dict[str, Any],
+    stats: DatasetStats,
+    hardware: HardwareProfile,
+    report: PreflightReport,
+) -> None:
+    cfg = _validated_config(config)
+    n_trials = max(1, cfg.hpo_config.n_trials)
+    n_jobs = max(1, cfg.hpo_config.n_jobs)
+
+    if not hub_reachable():
+        report.low_confidence = True
+        report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.")
+
+    seen_models: dict[str, ModelMeta] = {}
+    global_embedder = (cfg.embedder_config or {}).get("model_name")
+    if global_embedder:
+        seen_models[global_embedder] = resolve_model(global_embedder)
+
+    transformer_entries, classic_entries = _split_entries(cfg.search_space)
+
+    # First pass: transformer modules (also populates seen_models for the classic pass).
+    module_estimates: list[_ModuleEstimate] = []
+    node_max_weights: dict[int, float] = {}
+    for node_idx, node_type, entry in transformer_entries:
+        module = entry.get("module_name", "?")
+        model_names = _extract_model_names(entry)
+        if not model_names and global_embedder and module in {"knn", "mlknn"}:
+            model_names = [global_embedder]
+        for name in model_names:
+            meta = seen_models.setdefault(name, resolve_model(name))
+            me = _estimate_transformer_model(
+                meta=meta,
+                entry=entry,
+                node_type=node_type,
+                module=module,
+                name=name,
+                stats=stats,
+                hardware=hardware,
+                n_trials=n_trials,
+                refit_after=cfg.refit_after,
+            )
+            module_estimates.append(me)
+            # Track heaviest weight per node so dump_modules is bounded by one
+            # selected variant per node x n_trials, not the sum of all candidates.
+            node_max_weights[node_idx] = max(node_max_weights.get(node_idx, 0.0), me.model_weights_gb)
+
+    # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint.
+    embedder_meta = _largest_embedder(seen_models)
+    embedder_dim = _embedder_dim(embedder_meta)
+    for _, node_type, entry in classic_entries:
+        me = _estimate_classic_entry(
+            entry=entry,
+            node_type=node_type,
+            embedder_meta=embedder_meta,
+            embedder_dim=embedder_dim,
+            stats=stats,
+            hardware=hardware,
+            n_trials=n_trials,
+            refit_after=cfg.refit_after,
+        )
+        if me is not None:
+            module_estimates.append(me)
+
+    estimate = ResourceEstimate(parallel_factor=n_jobs)
+    for me in module_estimates:
+        estimate.vram_gb = max(estimate.vram_gb, me.vram_gb)
+        estimate.ram_gb = max(estimate.ram_gb, me.ram_gb)
+        estimate.time_hours += me.time_hours
+        estimate.drivers.append(me.driver)
+
+    _aggregate_disk(estimate, seen_models, node_max_weights, dump_modules=cfg.dump_modules, n_trials=n_trials)
+
+    report.resource = estimate
+    _emit_resource_findings(report, estimate, hardware, n_jobs=n_jobs)
 
 
 def _config_phase(
diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py
index 9b351952a..b459adf9c 100644
--- a/src/autointent/_advisor/_hub.py
+++ b/src/autointent/_advisor/_hub.py
@@ -7,6 +7,7 @@
 
 from __future__ import annotations
 
+import json
 import logging
 import re
 from dataclasses import dataclass
@@ -14,7 +15,7 @@
 from pathlib import Path
 from typing import Any
 
-from huggingface_hub import HfApi, scan_cache_dir, try_to_load_from_cache
+from huggingface_hub import HfApi, hf_hub_download, scan_cache_dir, try_to_load_from_cache
 
 logger = logging.getLogger(__name__)
 
@@ -43,6 +44,11 @@ class ModelMeta:
     total_file_bytes: int
     cached_locally: bool
     confidence: str  # "hub" | "heuristic"
+    # Architecture shape read straight from the model's config.json when reachable;
+    # None when the file couldn't be fetched/parsed. Estimates fall back to a
+    # BERT-base default in that case.
+    hidden_size: int | None = None
+    n_layers: int | None = None
 
     @property
     def disk_gb(self) -> float:
@@ -71,6 +77,30 @@ def _heuristic_params_millions(model_name: str) -> float:
     return 110.0  # generic BERT-base default
 
 
+def _shape_from_config(model_name: str) -> tuple[int | None, int | None]:
+    """Return ``(hidden_size, num_hidden_layers)`` straight from the model's config.json.
+
+    ``hf_hub_download`` caches the file after the first call, so repeated lookups
+    in the same process (or across CLI invocations) hit local disk. Returns
+    ``(None, None)`` on any failure — the advisor stays best-effort.
+    """
+    try:
+        path = hf_hub_download(model_name, "config.json")
+    except Exception as e:  # noqa: BLE001
+        logger.debug("config.json download(%s) failed: %s", model_name, e)
+        return None, None
+    try:
+        cfg = json.loads(Path(path).read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError) as e:
+        logger.debug("config.json parse(%s) failed: %s", model_name, e)
+        return None, None
+    # Cover the common HF naming variants: BERT/Llama/Gemma use hidden_size +
+    # num_hidden_layers; T5/MT5 use d_model + num_layers; GPT-2/Neo use n_embd + n_layer.
+    hidden = cfg.get("hidden_size") or cfg.get("d_model") or cfg.get("n_embd")
+    layers = cfg.get("num_hidden_layers") or cfg.get("num_layers") or cfg.get("n_layer")
+    return (int(hidden) if hidden else None, int(layers) if layers else None)
+
+
 def _is_warm_cached(model_name: str) -> bool:
     """True when the weight shard is present in the local HF cache."""
     weight_files = ["model.safetensors", "pytorch_model.bin", "model.safetensors.index.json"]
@@ -126,6 +156,14 @@ def _hub_metadata(model_name: str) -> ModelMeta | None:
         total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param)
         confidence = "heuristic"
 
+    hidden_size, n_layers = _shape_from_config(model_name)
+    if hidden_size is None or n_layers is None:
+        logger.warning(
+            "Could not read hidden_size / num_hidden_layers from config.json for %s; "
+            "activation-memory estimates will fall back to BERT-base defaults (768 / 12).",
+            model_name,
+        )
+
     return ModelMeta(
         name=model_name,
         params_millions=params_millions,
@@ -133,10 +171,17 @@ def _hub_metadata(model_name: str) -> ModelMeta | None:
         total_file_bytes=total_file_bytes,
         cached_locally=_is_warm_cached(model_name),
         confidence=confidence,
+        hidden_size=hidden_size,
+        n_layers=n_layers,
     )
 
 
 def _heuristic_metadata(model_name: str) -> ModelMeta:
+    logger.warning(
+        "Falling back to name-pattern heuristic for %s; "
+        "activation-memory estimates will use BERT-base defaults (hidden=768, layers=12).",
+        model_name,
+    )
     params_millions = _heuristic_params_millions(model_name)
     weight_bytes_per_param = 4
     total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param)
diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py
index 9b4881611..c63acde9d 100644
--- a/tests/advisor/test_estimates_internals.py
+++ b/tests/advisor/test_estimates_internals.py
@@ -146,12 +146,6 @@ def test_amp_does_reduce_activation_side_vram(self, meta: ModelMeta) -> None:
         amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=64, seq_len=128)
         assert amp < fp32
 
-    def test_reranker_uses_inference_class(self, meta: ModelMeta) -> None:
-        inference = _vram_for_transformer(meta, "inference", mixed_precision=False)
-        reranker = _vram_for_transformer(meta, "reranker", mixed_precision=False)
-        assert reranker > inference
-
-
 def test_ram_scales_with_dataset_size() -> None:
     meta = ModelMeta(
         name="x",
@@ -477,13 +471,13 @@ def test_driver_records_current_and_max_batch(self) -> None:
         report = run_preflight(
             self._bert_cfg("microsoft/deberta-v3-large", batch_size=64),
             DatasetStats.placeholder(),
-            _profile(vram_gb=10.0),
+            _profile(vram_gb=8.0),
         )
         drivers = [d for d in report.resource.drivers if d["module"] == "bert"]
         assert drivers
         d = drivers[0]
         assert d["batch_size"] == 64
-        # vram_gb=10 + 5 GB weights → some room for activations, max < 64.
+        # vram_gb=8 with ~5 GB weights leaves little room for activations → max < 64.
         assert d["max_batch_size"] is not None
         assert 0 < d["max_batch_size"] < 64
 

From 1f1778a509f0f8c4fff8277d43e307ffccbbed41 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:24:25 +0300
Subject: [PATCH 12/16] simplify logic

---
 src/autointent/_advisor/_estimates.py     | 144 +++++++++++---------
 src/autointent/_advisor/_hardware.py      |   9 +-
 src/autointent/_advisor/_hub.py           | 153 +++++++++++-----------
 src/autointent/_advisor/_render.py        |  10 +-
 tests/advisor/test_estimates_and_cli.py   |  10 +-
 tests/advisor/test_estimates_internals.py |  58 +++++---
 tests/advisor/test_hub_heuristics.py      |  51 +++-----
 tests/advisor/test_render.py              |   8 +-
 8 files changed, 230 insertions(+), 213 deletions(-)

diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py
index 274407c26..faeb4b1ee 100644
--- a/src/autointent/_advisor/_estimates.py
+++ b/src/autointent/_advisor/_estimates.py
@@ -12,11 +12,17 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
-from pydantic import BaseModel, ConfigDict, Field, ValidationError
+from pydantic import ValidationError
 
-from autointent.configs._optimization import HPOConfig
+from autointent._optimization_config import OptimizationConfig
+from autointent.configs._embedder import (
+    EmbedderConfig,
+    OpenaiEmbeddingConfig,
+    SentenceTransformerEmbeddingConfig,
+    VllmEmbeddingConfig,
+)
 
-from ._hub import hub_reachable, resolve_model
+from ._hub import resolve_model
 from ._report import PreflightReport, ResourceEstimate, Severity
 
 if TYPE_CHECKING:
@@ -27,6 +33,7 @@
     from ._report import DatasetStats
 
 _MULTICLASS_THRESHOLD = 2
+_BYTES_PER_GB = 1024**3  # binary GiB convention; matches all advisor byte->GB conversions
 
 # Fallback architecture shape (BERT-base) used only when the model's actual
 # config.json couldn't be fetched from HF Hub — see _hub._shape_from_config.
@@ -36,48 +43,39 @@
 logger = logging.getLogger(__name__)
 
 
-class _AdvisorConfig(BaseModel):
-    """Validated view of the advisor's input config.
-
-    Wraps the four top-level keys the phase helpers read. Unknown top-level
-    keys are ignored (preset YAMLs carry extra metadata the advisor doesn't model).
-    """
-
-    model_config = ConfigDict(extra="ignore")
-
-    hpo_config: HPOConfig = Field(default_factory=HPOConfig)
-    search_space: list[dict[str, Any]] = Field(default_factory=list)
-    refit_after: bool = False
-    dump_modules: bool = False
-    embedder_config: dict[str, Any] | None = None
-
-
-def _validated_config(config: dict[str, Any]) -> _AdvisorConfig:
-    """Validate ``config`` against ``_AdvisorConfig``; fall back to defaults on any error.
+def _validated_config(config: dict[str, Any]) -> OptimizationConfig:
+    """Validate ``config`` against the project's canonical ``OptimizationConfig``.
 
     The advisor is best-effort: a malformed user config should still produce a
-    report (with placeholder costs) rather than crashing.
+    report (with placeholder costs) rather than crashing, so any validation
+    error falls back to the model defaults.
     """
     try:
-        return _AdvisorConfig.model_validate(config)
+        return OptimizationConfig.model_validate(config)
     except ValidationError as e:
         logger.warning("Advisor config failed validation; falling back to defaults: %s", e)
-        return _AdvisorConfig()
+        # OptimizationConfig requires `search_space`; build a minimal valid default.
+        return OptimizationConfig.model_validate({"search_space": []})
 
 
-# Severity thresholds as a fraction of available budget: at or above _TIGHT
-# downgrades to Severity.TIGHT; at or above _OVER downgrades to Severity.OVER.
-_TIGHT_RATIO = 0.7
-_OVER_RATIO = 1.0
+_TIGHT_RATIO = 0.9
+
+# Union variants of EmbedderConfig that carry a model_name attribute.
+# HashingVectorizerEmbeddingConfig and the bare BaseEmbedderConfig don't have
+# one (sklearn vectorizer / abstract base), so we filter them out below.
+_MODEL_BACKED_EMBEDDERS = (
+    SentenceTransformerEmbeddingConfig,
+    OpenaiEmbeddingConfig,
+    VllmEmbeddingConfig,
+)
+
+
+def _embedder_model_name(embedder: EmbedderConfig) -> str | None:
+    """Return the embedder's model_name when the config variant carries one."""
+    if isinstance(embedder, _MODEL_BACKED_EMBEDDERS):
+        return embedder.model_name
+    return None
 
-# rough per-step seconds, keyed on device class. Scaled by params_millions / 100.
-_PER_STEP_BASELINE_S = {
-    "cpu": 0.5,
-    "low-gpu": 0.04,
-    "mid-gpu": 0.02,
-    "high-gpu": 0.01,
-    "apple-silicon": 0.08,
-}
 
 # Maps each fine-tunable transformer module to its training-mode label.
 # Modules not listed (or listed as "inference") run the encoder forward-only.
@@ -98,7 +96,7 @@ def _validated_config(config: dict[str, Any]) -> _AdvisorConfig:
 _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8
 _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9
 _CATBOOST_GPU_SPEEDUP = 10.0
-# LogisticRegressionCV defaults: Cs=10, cv=3 → 31 inner fits + 1 final refit.
+# LogisticRegressionCV defaults: Cs=10, cv=3 -> 31 inner fits + 1 final refit.
 _LOGREG_CV_MULTIPLIER = 31
 _CATBOOST_DEFAULT_BINS = 254
 # Bytes per histogram bucket / tree node — order-of-magnitude constants.
@@ -190,7 +188,7 @@ def _vram_for_transformer(
 
 def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float:
     """RAM in GB. Loose upper bound."""
-    return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / (1024**3)
+    return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / _BYTES_PER_GB
 
 
 def _floor_to_power_of_two(n: int) -> int:
@@ -232,7 +230,7 @@ def _activations_gb_per_sample(
     bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16 if is_training else seq_len * hidden * 8
     if mixed_precision:
         bytes_per_sample //= 2
-    return bytes_per_sample / (1024**3)
+    return bytes_per_sample / _BYTES_PER_GB
 
 
 def _max_fitting_batch_size(
@@ -265,7 +263,7 @@ def _embedder_dim(meta: ModelMeta | None) -> int:
 def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None:
     if not seen_models:
         return None
-    return max(seen_models.values(), key=lambda m: m.params_millions)
+    return max(seen_models.values(), key=lambda m: m.total_params)
 
 
 def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float:
@@ -273,7 +271,7 @@ def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float:
     data_bytes = 8.0 * stats.n_samples * embedder_dim
     coef_bytes = 8.0 * max(1, stats.n_classes) * embedder_dim
     lbfgs_bytes = 10.0 * 8.0 * embedder_dim
-    return (data_bytes + coef_bytes + lbfgs_bytes) / (1024**3)
+    return (data_bytes + coef_bytes + lbfgs_bytes) / _BYTES_PER_GB
 
 
 def _time_for_linear(
@@ -301,7 +299,7 @@ def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int,
     data_bytes = 4.0 * stats.n_samples * n_features
     histograms_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS
     trees_bytes = iterations * (2**depth) * _CATBOOST_BYTES_PER_TREE_NODE
-    return float((data_bytes + histograms_bytes + trees_bytes) / (1024**3))
+    return float((data_bytes + histograms_bytes + trees_bytes) / _BYTES_PER_GB)
 
 
 def _time_for_catboost(
@@ -323,16 +321,20 @@ def _time_for_catboost(
 
 def _time_for_transformer(
     *,
-    meta: ModelMeta,
     n_trials: int,
     epochs: int,
     batch_size: int,
     n_samples: int,
-    device_class: str,
 ) -> float:
-    per_step = _PER_STEP_BASELINE_S[device_class] * (meta.params_millions / 100.0)
+    """Transformer training time in hours, assuming a flat 1 second per step.
+
+    The advisor has no real wall-time calibration across hardware tiers / model
+    sizes, so the report uses ``time_hours`` as a step-count proxy rather than
+    pretending to estimate seconds. Users should treat the number as ordering /
+    ballpark information, not a budget.
+    """
     steps = max(1, (n_samples // max(1, batch_size))) * epochs
-    return (n_trials * steps * per_step) / 3600.0
+    return (n_trials * steps) / 3600.0
 
 
 def _classify_severity(estimate: float, budget: float) -> Severity:
@@ -341,7 +343,7 @@ def _classify_severity(estimate: float, budget: float) -> Severity:
     if budget <= 0:
         return Severity.TIGHT
     ratio = estimate / budget
-    if ratio >= _OVER_RATIO:
+    if ratio >= 1:
         return Severity.OVER
     if ratio >= _TIGHT_RATIO:
         return Severity.TIGHT
@@ -368,7 +370,8 @@ def _split_entries(
     search_space: list[dict[str, Any]],
 ) -> tuple[list[tuple[int, str, dict[str, Any]]], list[tuple[int, str, dict[str, Any]]]]:
     """Partition search-space entries into (transformer-bearing, classic)."""
-    transformer, classic = [], []
+    transformer: list[tuple[int, str, dict[str, Any]]] = []
+    classic: list[tuple[int, str, dict[str, Any]]] = []
     for node_idx, node_type, entry in _walk_modules_indexed(search_space):
         bucket = classic if entry.get("module_name") in {"linear", "catboost"} else transformer
         bucket.append((node_idx, node_type, entry))
@@ -408,12 +411,10 @@ def _estimate_transformer_model(
         )
 
     time_h = _time_for_transformer(
-        meta=meta,
         n_trials=n_trials,
         epochs=epochs,
         batch_size=batch_size,
         n_samples=stats.n_samples,
-        device_class=hardware.device_class,
     )
     if mode != "inference":
         time_h *= _refit_factor(refit_after=refit_after, n_trials=n_trials)
@@ -593,17 +594,16 @@ def _resource_phase(
     stats: DatasetStats,
     hardware: HardwareProfile,
     report: PreflightReport,
+    *,
+    refit_after: bool = False,
 ) -> None:
     cfg = _validated_config(config)
-    n_trials = max(1, cfg.hpo_config.n_trials)
-    n_jobs = max(1, cfg.hpo_config.n_jobs)
-
-    if not hub_reachable():
-        report.low_confidence = True
-        report.notes.append("HF Hub unreachable — all model sizes are name-pattern heuristics.")
+    n_trials = cfg.hpo_config.n_trials
+    n_jobs = cfg.hpo_config.n_jobs
+    dump_modules = cfg.logging_config.dump_modules
 
     seen_models: dict[str, ModelMeta] = {}
-    global_embedder = (cfg.embedder_config or {}).get("model_name")
+    global_embedder = _embedder_model_name(cfg.embedder_config)
     if global_embedder:
         seen_models[global_embedder] = resolve_model(global_embedder)
 
@@ -628,7 +628,7 @@ def _resource_phase(
                 stats=stats,
                 hardware=hardware,
                 n_trials=n_trials,
-                refit_after=cfg.refit_after,
+                refit_after=refit_after,
             )
             module_estimates.append(me)
             # Track heaviest weight per node so dump_modules is bounded by one
@@ -639,7 +639,7 @@ def _resource_phase(
     embedder_meta = _largest_embedder(seen_models)
     embedder_dim = _embedder_dim(embedder_meta)
     for _, node_type, entry in classic_entries:
-        me = _estimate_classic_entry(
+        classic_estimate = _estimate_classic_entry(
             entry=entry,
             node_type=node_type,
             embedder_meta=embedder_meta,
@@ -647,10 +647,10 @@ def _resource_phase(
             stats=stats,
             hardware=hardware,
             n_trials=n_trials,
-            refit_after=cfg.refit_after,
+            refit_after=refit_after,
         )
-        if me is not None:
-            module_estimates.append(me)
+        if classic_estimate is not None:
+            module_estimates.append(classic_estimate)
 
     estimate = ResourceEstimate(parallel_factor=n_jobs)
     for me in module_estimates:
@@ -659,7 +659,17 @@ def _resource_phase(
         estimate.time_hours += me.time_hours
         estimate.drivers.append(me.driver)
 
-    _aggregate_disk(estimate, seen_models, node_max_weights, dump_modules=cfg.dump_modules, n_trials=n_trials)
+    _aggregate_disk(estimate, seen_models, node_max_weights, dump_modules=dump_modules, n_trials=n_trials)
+
+    # Flip low_confidence if any model fell back to the heuristic path (Hub
+    # unreachable, repo missing safetensors metadata, local-path checkpoint).
+    heuristic_models = [m.name for m in seen_models.values() if m.confidence == "heuristic"]
+    if heuristic_models:
+        report.low_confidence = True
+        report.notes.append(
+            f"Heuristic fallback used for {len(heuristic_models)} model(s) — sizes are BERT-base "
+            f"defaults: {', '.join(heuristic_models[:3])}{'...' if len(heuristic_models) > 3 else ''}",  # noqa: PLR2004
+        )
 
     report.resource = estimate
     _emit_resource_findings(report, estimate, hardware, n_jobs=n_jobs)
@@ -743,15 +753,19 @@ def run_preflight(
     hardware: HardwareProfile,
     *,
     preset_name: str | None = None,
+    refit_after: bool = False,
 ) -> PreflightReport:
     """Run all three phases and return one report.
 
     Args:
         config: parsed preset / OptimizationConfig dict (top-level keys:
-            ``search_space``, ``hpo_config``, optional ``embedder_config``).
+            ``search_space``, ``hpo_config``, optional ``embedder_config``,
+            optional ``logging_config.dump_modules``).
         stats: dataset statistics (real or placeholder).
         hardware: detected hardware profile.
         preset_name: optional friendly name for the report header.
+        refit_after: matches the ``Pipeline.fit(refit_after=...)`` argument.
+            When True, time estimates include the extra refit-on-full-data pass.
 
     Returns:
         PreflightReport with findings across resource/data/config phases.
@@ -777,7 +791,7 @@ def run_preflight(
     )
     report.notes.extend(hardware.notes)
 
-    _resource_phase(config, stats, hardware, report)
+    _resource_phase(config, stats, hardware, report, refit_after=refit_after)
     _data_phase(config, stats, report)
     _config_phase(config, hardware, report)
 
diff --git a/src/autointent/_advisor/_hardware.py b/src/autointent/_advisor/_hardware.py
index e959b6ebf..6aa9741ee 100644
--- a/src/autointent/_advisor/_hardware.py
+++ b/src/autointent/_advisor/_hardware.py
@@ -1,7 +1,7 @@
 """Local hardware detection.
 
 Probes CPU / RAM / disk and the highest-priority accelerator available
-(CUDA → MPS → CPU). All probes are wrapped to fall back safely on a
+(CUDA -> MPS -> CPU). All probes are wrapped to fall back safely on a
 broken install (e.g. CUDA driver mismatch) rather than crash the advisor.
 """
 
@@ -27,6 +27,7 @@
 
 _HIGH_GPU_VRAM_GB = 24
 _MID_GPU_VRAM_GB = 12
+_BYTES_PER_GB = 1024**3  # binary GiB convention; matches all advisor byte->GB conversions
 
 
 @dataclass
@@ -53,7 +54,7 @@ def device_class(self) -> str:
 
 
 def _detect_ram_gb() -> float:
-    return float(psutil.virtual_memory().total) / (1024**3)
+    return float(psutil.virtual_memory().total) / _BYTES_PER_GB
 
 
 def _detect_free_disk_gb(path: str | None = None) -> float:
@@ -61,7 +62,7 @@ def _detect_free_disk_gb(path: str | None = None) -> float:
     probe_path = cache if cache.exists() else Path("~").expanduser()
     try:
         usage = shutil.disk_usage(probe_path)
-        return usage.free / (1024**3)
+        return usage.free / _BYTES_PER_GB
     except OSError as e:
         logger.debug("disk usage probe failed at %s: %s", probe_path, e)
         return 0.0
@@ -73,7 +74,7 @@ def _detect_cuda() -> tuple[float, str] | None:
     idx = 0
     try:
         _free, total = torch.cuda.mem_get_info(idx)
-        vram_gb = total / (1024**3)
+        vram_gb = total / _BYTES_PER_GB
     except (RuntimeError, AttributeError) as e:
         logger.debug("torch.cuda.mem_get_info failed: %s", e)
         return None
diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py
index b459adf9c..a5b6126a5 100644
--- a/src/autointent/_advisor/_hub.py
+++ b/src/autointent/_advisor/_hub.py
@@ -9,72 +9,40 @@
 
 import json
 import logging
-import re
 from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
-from typing import Any
+from typing import Literal
 
 from huggingface_hub import HfApi, hf_hub_download, scan_cache_dir, try_to_load_from_cache
 
+Confidence = Literal["hub", "heuristic"]
+
 logger = logging.getLogger(__name__)
 
-# Coarse heuristic estimates keyed on name fragments. Used only when HF Hub
-# is unreachable and we can't get safetensors metadata. Values in millions.
-_NAME_HEURISTICS = [
-    (re.compile(r"(?i)(deberta|roberta|bert).*(xxlarge|huge)"), 1_500),
-    (re.compile(r"(?i)(deberta|roberta|bert).*xlarge"), 750),
-    (re.compile(r"(?i)(deberta|roberta|bert).*large"), 350),
-    (re.compile(r"(?i)e5.*large"), 560),
-    (re.compile(r"(?i)e5.*small"), 33),
-    (re.compile(r"(?i)mpnet"), 110),
-    (re.compile(r"(?i)minilm"), 33),
-    (re.compile(r"(?i)distil"), 66),
-    (re.compile(r"(?i)small"), 60),
-    (re.compile(r"(?i)base"), 110),
-    (re.compile(r"(?i)large"), 350),
-]
+_DEFAULT_HEURISTIC_PARAMS = 110_000_000
+_DEFAULT_BYTES_PER_PARAM = 4
+_BYTES_PER_GB = 1024**3  # using the binary GiB convention everywhere in the advisor
 
 
 @dataclass
 class ModelMeta:
     name: str
-    params_millions: float
-    weight_bytes_per_param: int
+    total_params: int
+    weight_bytes_per_param: float
     total_file_bytes: int
     cached_locally: bool
-    confidence: str  # "hub" | "heuristic"
-    # Architecture shape read straight from the model's config.json when reachable;
-    # None when the file couldn't be fetched/parsed. Estimates fall back to a
-    # BERT-base default in that case.
+    confidence: Confidence
     hidden_size: int | None = None
     n_layers: int | None = None
 
     @property
     def disk_gb(self) -> float:
-        return self.total_file_bytes / (1024**3)
+        return self.total_file_bytes / _BYTES_PER_GB
 
     @property
     def weights_gb(self) -> float:
-        return (self.params_millions * 1_000_000 * self.weight_bytes_per_param) / (1024**3)
-
-
-@lru_cache(maxsize=1)
-def hub_reachable() -> bool:
-    """Single up-front probe. Memoized per process."""
-    try:
-        HfApi().list_models(limit=1)
-    except Exception as e:  # noqa: BLE001
-        logger.debug("HF Hub probe failed: %s", e)
-        return False
-    return True
-
-
-def _heuristic_params_millions(model_name: str) -> float:
-    for pattern, m in _NAME_HEURISTICS:
-        if pattern.search(model_name):
-            return float(m)
-    return 110.0  # generic BERT-base default
+        return (self.total_params * self.weight_bytes_per_param) / _BYTES_PER_GB
 
 
 def _shape_from_config(model_name: str) -> tuple[int | None, int | None]:
@@ -98,7 +66,7 @@ def _shape_from_config(model_name: str) -> tuple[int | None, int | None]:
     # num_hidden_layers; T5/MT5 use d_model + num_layers; GPT-2/Neo use n_embd + n_layer.
     hidden = cfg.get("hidden_size") or cfg.get("d_model") or cfg.get("n_embd")
     layers = cfg.get("num_hidden_layers") or cfg.get("num_layers") or cfg.get("n_layer")
-    return (int(hidden) if hidden else None, int(layers) if layers else None)
+    return int(hidden) if hidden else None, int(layers) if layers else None
 
 
 def _is_warm_cached(model_name: str) -> bool:
@@ -124,36 +92,46 @@ def _hub_metadata(model_name: str) -> ModelMeta | None:
     except Exception as e:  # noqa: BLE001
         logger.debug("model_info(%s) failed: %s", model_name, e)
         return None
-
-    params_millions = 0.0
-    weight_bytes_per_param = 4
-    safetensors = getattr(info, "safetensors", None)
-    if safetensors is not None:
-        params_total = getattr(safetensors, "total", None) or sum(
-            getattr(safetensors, "parameters", {}).values() or [0]
-        )
-        if params_total:
-            params_millions = params_total / 1_000_000
-            params_map: dict[str, Any] = getattr(safetensors, "parameters", {}) or {}
-            if any("F16" in k or "BF16" in k for k in params_map):
-                weight_bytes_per_param = 2
-
-    total_file_bytes = 0
-    for sibling in getattr(info, "siblings", []) or []:
-        size = getattr(sibling, "size", None)
-        if size:
-            total_file_bytes += int(size)
+    # Bytes-per-element for safetensors dtype strings. Used to convert the per-dtype
+    # parameter counts (info.safetensors.parameters) into a weighted average
+    # bytes-per-param for mixed-precision repos.
+    _dtype_bytes: dict[str, int] = {
+        "F64": 8,
+        "F32": 4,
+        "F16": 2,
+        "BF16": 2,
+        "I64": 8,
+        "I32": 4,
+        "I16": 2,
+        "I8": 1,
+        "U8": 1,
+        "BOOL": 1,
+    }
+
+    total_params = 0
+    weight_bytes_per_param: float = _DEFAULT_BYTES_PER_PARAM
+    if info.safetensors is not None:
+        params_by_dtype = info.safetensors.parameters or {}
+        total_params = info.safetensors.total or sum(params_by_dtype.values())
+        if total_params:
+            total_weight_bytes = sum(
+                _dtype_bytes.get(dtype, _DEFAULT_BYTES_PER_PARAM) * count for dtype, count in params_by_dtype.items()
+            )
+            if total_weight_bytes:
+                weight_bytes_per_param = total_weight_bytes / total_params
+
+    total_file_bytes = sum(s.size for s in (info.siblings or []) if s.size)
 
     # Track whether either size came from the Hub or from the name-pattern fallback;
     # if any field was filled by heuristic, downgrade confidence so the report flips
     # low_confidence rather than misreporting hub-grade accuracy.
-    confidence = "hub"
-    if params_millions == 0:
-        params_millions = _heuristic_params_millions(model_name)
+    confidence: Confidence = "hub"
+    if total_params == 0:
+        total_params = _DEFAULT_HEURISTIC_PARAMS
         confidence = "heuristic"
 
     if total_file_bytes == 0:
-        total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param)
+        total_file_bytes = int(total_params * weight_bytes_per_param)
         confidence = "heuristic"
 
     hidden_size, n_layers = _shape_from_config(model_name)
@@ -166,7 +144,7 @@ def _hub_metadata(model_name: str) -> ModelMeta | None:
 
     return ModelMeta(
         name=model_name,
-        params_millions=params_millions,
+        total_params=total_params,
         weight_bytes_per_param=weight_bytes_per_param,
         total_file_bytes=total_file_bytes,
         cached_locally=_is_warm_cached(model_name),
@@ -182,19 +160,33 @@ def _heuristic_metadata(model_name: str) -> ModelMeta:
         "activation-memory estimates will use BERT-base defaults (hidden=768, layers=12).",
         model_name,
     )
-    params_millions = _heuristic_params_millions(model_name)
-    weight_bytes_per_param = 4
-    total_file_bytes = int(params_millions * 1_000_000 * weight_bytes_per_param)
+    total_file_bytes = _DEFAULT_HEURISTIC_PARAMS * _DEFAULT_BYTES_PER_PARAM
     return ModelMeta(
         name=model_name,
-        params_millions=params_millions,
-        weight_bytes_per_param=weight_bytes_per_param,
+        total_params=_DEFAULT_HEURISTIC_PARAMS,
+        weight_bytes_per_param=_DEFAULT_BYTES_PER_PARAM,
         total_file_bytes=total_file_bytes,
         cached_locally=_is_warm_cached(model_name),
         confidence="heuristic",
     )
 
 
+def _looks_like_local_path(model_name: str) -> bool:
+    """True when ``model_name`` is a filesystem path rather than an HF Hub repo id.
+
+    Hub repo ids match ``org/repo``; anything that starts with a path separator,
+    ``~``, a relative-path prefix, or a Windows drive letter, or contains a
+    backslash, is treated as a local path. We can't rely on ``Path.is_absolute()``
+    alone because POSIX-style absolute paths (``/tmp/...``) are *not* absolute
+    on Windows.
+    """
+    if model_name.startswith(("local:", "/", "~", "./", "../", "\\\\")):
+        return True
+    if "\\" in model_name:
+        return True
+    return len(model_name) >= 2 and model_name[1] == ":" and model_name[0].isalpha()  # noqa: PLR2004
+
+
 @lru_cache(maxsize=64)
 def resolve_model(model_name: str) -> ModelMeta:
     """Resolve metadata for a single model name. Memoized per process.
@@ -202,19 +194,20 @@ def resolve_model(model_name: str) -> ModelMeta:
     Always returns a value — never raises — so the advisor can keep going
     on offline machines or for unknown checkpoints.
     """
-    if model_name.startswith("local:") or Path(model_name).is_absolute():
+    if _looks_like_local_path(model_name):
         return ModelMeta(
             name=model_name,
-            params_millions=_heuristic_params_millions(model_name),
-            weight_bytes_per_param=4,
+            total_params=_DEFAULT_HEURISTIC_PARAMS,
+            weight_bytes_per_param=_DEFAULT_BYTES_PER_PARAM,
             total_file_bytes=0,
             cached_locally=True,
             confidence="heuristic",
         )
 
-    if hub_reachable():
-        meta = _hub_metadata(model_name)
-        if meta is not None:
-            return meta
+    # _hub_metadata returns None on any failure (network outage, missing repo,
+    # SDK exception) so we don't need a separate up-front probe.
+    meta = _hub_metadata(model_name)
+    if meta is not None:
+        return meta
 
     return _heuristic_metadata(model_name)
diff --git a/src/autointent/_advisor/_render.py b/src/autointent/_advisor/_render.py
index 82771ef9f..afd541e2b 100644
--- a/src/autointent/_advisor/_render.py
+++ b/src/autointent/_advisor/_render.py
@@ -13,13 +13,13 @@
 if TYPE_CHECKING:
     from ._report import PreflightReport
 
-_SEVERITY_TAG = {"ample": "✓", "tight": "⚠", "over": "✗"}
+_SEVERITY_TAG = {"ample": "✓", "tight": "⚠", "over": "x"}
 _PHASE_ORDER = ("resource", "data", "config")
 _PHASE_LABEL = {"resource": "Resource", "data": "Data", "config": "Config"}
 
 
 def _batch_hint(driver: dict[str, Any]) -> str:
-    """Per-driver batch annotation: '64 → 32', '64', '64 (no fit)', or ''."""
+    """Per-driver batch annotation: '64 -> 32', '64', '64 (no fit)', or ''."""
     bs = driver.get("batch_size")
     if bs is None:
         return ""
@@ -30,7 +30,7 @@ def _batch_hint(driver: dict[str, Any]) -> str:
         return f"{bs} (no fit)"
     if mx == bs:
         return str(bs)
-    return f"{bs} → {mx}"
+    return f"{bs} -> {mx}"
 
 
 _DRIVERS_LIMIT = 8
@@ -137,9 +137,9 @@ def render_recommendation(
     """Compact table for the ``recommend`` subcommand."""
     lines = ["", "Recommendation:"]
     if chosen:
-        lines.append(f"  → {chosen}")
+        lines.append(f"  -> {chosen}")
     else:
-        lines.append("  → none of the bundled presets fit your hardware as-is.")
+        lines.append("  -> none of the bundled presets fit your hardware as-is.")
     lines.append("")
     lines.append(f"{'Preset':<24} {'Status':<14} {'VRAM':<10} {'Time':<10} {'Headroom':<10}")
     lines.append("-" * 68)
diff --git a/tests/advisor/test_estimates_and_cli.py b/tests/advisor/test_estimates_and_cli.py
index 2f16555ae..d87f90740 100644
--- a/tests/advisor/test_estimates_and_cli.py
+++ b/tests/advisor/test_estimates_and_cli.py
@@ -23,14 +23,11 @@
 
 @pytest.fixture(autouse=True)
 def _force_offline(monkeypatch: pytest.MonkeyPatch) -> None:
-    """Pin the HF Hub probe to "offline" so tests don't hit the network."""
-    from autointent._advisor import _estimates, _hub
+    """Force HF Hub lookups to fail so tests don't hit the network."""
+    from autointent._advisor import _hub
 
-    _hub.hub_reachable.cache_clear()
     _hub.resolve_model.cache_clear()
-    offline = lambda *_a, **_kw: False  # noqa: E731
-    monkeypatch.setattr(_hub, "hub_reachable", offline)
-    monkeypatch.setattr(_estimates, "hub_reachable", offline)
+    monkeypatch.setattr(_hub, "_hub_metadata", lambda _name: None)
 
 
 def _profile(vram_gb: float = 16.0) -> HardwareProfile:
@@ -50,7 +47,6 @@ def test_every_preset_inspects_without_raising(preset: str) -> None:
     stats = DatasetStats.placeholder(n_samples=500, n_classes=10, avg_tokens=24)
     report = run_preflight(cfg, stats, _profile(vram_gb=16.0), preset_name=preset)
     assert report.preset_name == preset
-    assert report.low_confidence is True  # we forced offline
     # always at least one resource-phase finding
     assert any(f.phase == "resource" for f in report.findings)
 
diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py
index c63acde9d..3fa293b7e 100644
--- a/tests/advisor/test_estimates_internals.py
+++ b/tests/advisor/test_estimates_internals.py
@@ -19,15 +19,41 @@
 from autointent._advisor._hub import ModelMeta
 from autointent._advisor._report import DatasetStats, Severity
 
+# Per-name ModelMeta fixtures used by the offline tests. Production resolution
+# (HF Hub config.json + safetensors metadata) is mocked away so the batch-fit
+# math doesn't depend on whatever fallback the heuristic path returns.
+_FAKE_SHAPES: dict[str, tuple[int, int, int]] = {
+    # (total_params, hidden_size, n_layers)
+    "microsoft/deberta-v3-large": (350_000_000, 1024, 24),
+    "microsoft/deberta-v3-small": (140_000_000, 768, 6),
+    "sentence-transformers/all-MiniLM-L6-v2": (33_000_000, 384, 6),
+    "intfloat/multilingual-e5-large-instruct": (560_000_000, 1024, 24),
+}
+
+
+def _fake_resolve(model_name: str) -> ModelMeta:
+    known = _FAKE_SHAPES.get(model_name)
+    params, hidden, layers = known or (110_000_000, 768, 12)
+    return ModelMeta(
+        name=model_name,
+        total_params=params,
+        weight_bytes_per_param=4,
+        total_file_bytes=params * 4,
+        cached_locally=False,
+        confidence="hub" if known else "heuristic",
+        hidden_size=hidden,
+        n_layers=layers,
+    )
+
 
 @pytest.fixture(autouse=True)
 def _offline(monkeypatch: pytest.MonkeyPatch) -> None:
-    _hub.hub_reachable.cache_clear()
     _hub.resolve_model.cache_clear()
-    offline = lambda *_a, **_kw: False  # noqa: E731
-    monkeypatch.setattr(_hub, "hub_reachable", offline)
-    monkeypatch.setattr(_estimates, "hub_reachable", offline)
     monkeypatch.setattr(_hub, "_is_warm_cached", lambda _name: False)
+    # Inject deterministic ModelMeta per name; both the _hub re-export and the
+    # _estimates rebinding need to be replaced for run_preflight to pick it up.
+    monkeypatch.setattr(_hub, "resolve_model", _fake_resolve)
+    monkeypatch.setattr(_estimates, "resolve_model", _fake_resolve)
 
 
 def _profile(vram_gb: float = 16.0, accelerator: str = "cuda") -> HardwareProfile:
@@ -89,7 +115,7 @@ def test_below_yellow_is_green(self) -> None:
         assert _classify_severity(estimate=1.0, budget=10.0) == Severity.AMPLE
 
     def test_above_yellow_threshold(self) -> None:
-        assert _classify_severity(estimate=8.0, budget=10.0) == Severity.TIGHT
+        assert _classify_severity(estimate=9.5, budget=10.0) == Severity.TIGHT
 
     def test_at_or_above_red_threshold(self) -> None:
         assert _classify_severity(estimate=10.0, budget=10.0) == Severity.OVER
@@ -104,7 +130,7 @@ class TestVramForTransformer:
     def meta(self) -> ModelMeta:
         return ModelMeta(
             name="x",
-            params_millions=100.0,
+            total_params=100_000_000,
             weight_bytes_per_param=4,
             total_file_bytes=0,
             cached_locally=False,
@@ -146,10 +172,11 @@ def test_amp_does_reduce_activation_side_vram(self, meta: ModelMeta) -> None:
         amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=64, seq_len=128)
         assert amp < fp32
 
+
 def test_ram_scales_with_dataset_size() -> None:
     meta = ModelMeta(
         name="x",
-        params_millions=100.0,
+        total_params=100_000_000,
         weight_bytes_per_param=4,
         total_file_bytes=0,
         cached_locally=False,
@@ -177,7 +204,7 @@ def test_dump_modules_adds_disk_during_training(self) -> None:
                 }
             ],
             "hpo_config": {"n_trials": 5},
-            "dump_modules": True,
+            "logging_config": {"dump_modules": True},
         }
         report = run_preflight(cfg, DatasetStats.placeholder(), _profile())
         assert report.resource.disk_dump_gb > 0
@@ -201,8 +228,7 @@ def test_refit_after_increases_time(self) -> None:
             "hpo_config": {"n_trials": 10},
         }
         baseline = run_preflight(cfg, DatasetStats.placeholder(), _profile())
-        cfg_refit = {**cfg, "refit_after": True}
-        bumped = run_preflight(cfg_refit, DatasetStats.placeholder(), _profile())
+        bumped = run_preflight(cfg, DatasetStats.placeholder(), _profile(), refit_after=True)
         assert bumped.resource.time_hours > baseline.resource.time_hours
 
     def test_catboost_gpu_without_cuda_flags_config(self) -> None:
@@ -249,7 +275,7 @@ def test_offline_flips_low_confidence(self) -> None:
         }
         report = run_preflight(cfg, DatasetStats.placeholder(), _profile())
         assert report.low_confidence is True
-        assert any("HF Hub unreachable" in n for n in report.notes)
+        assert any("Heuristic fallback" in n for n in report.notes)
 
     def test_rare_classes_with_linear_scorer_flag_red(self) -> None:
         cfg = {
@@ -471,13 +497,13 @@ def test_driver_records_current_and_max_batch(self) -> None:
         report = run_preflight(
             self._bert_cfg("microsoft/deberta-v3-large", batch_size=64),
             DatasetStats.placeholder(),
-            _profile(vram_gb=8.0),
+            _profile(vram_gb=6.5),
         )
         drivers = [d for d in report.resource.drivers if d["module"] == "bert"]
         assert drivers
         d = drivers[0]
         assert d["batch_size"] == 64
-        # vram_gb=8 with ~5 GB weights leaves little room for activations → max < 64.
+        # vram_gb=6.5 against ~5 GB weights x 0.9 tight ratio -> little activation room, max < 64.
         assert d["max_batch_size"] is not None
         assert 0 < d["max_batch_size"] < 64
 
@@ -523,7 +549,7 @@ def test_multiple_drivers_carry_independent_max_batch(self) -> None:
         report = run_preflight(cfg, DatasetStats.placeholder(), _profile(vram_gb=10.0))
         small = next(d for d in report.resource.drivers if "small" in d["model"])
         large = next(d for d in report.resource.drivers if "large" in d["model"])
-        # The smaller model has more headroom → larger max batch (or equal-cap when both saturate).
+        # The smaller model has more headroom -> larger max batch (or equal-cap when both saturate).
         assert small["max_batch_size"] >= large["max_batch_size"]
 
 
@@ -551,7 +577,7 @@ def test_dump_disk_is_bounded_by_per_node_max_not_sum_of_all_variants(self) -> N
                 }
             ],
             "hpo_config": {"n_trials": 4},
-            "dump_modules": True,
+            "logging_config": {"dump_modules": True},
         }
         report = run_preflight(cfg, DatasetStats.placeholder(), _profile())
         # Per-node max ~ deberta-v3-large weights (~350M x 4 ~ 1.3 GB). Two-candidate
@@ -588,7 +614,7 @@ def test_dump_disk_sums_across_nodes(self) -> None:
                 },
             ],
             "hpo_config": {"n_trials": 2},
-            "dump_modules": True,
+            "logging_config": {"dump_modules": True},
         }
         report = run_preflight(cfg, DatasetStats.placeholder(), _profile())
         embedder = _hub.resolve_model("sentence-transformers/all-MiniLM-L6-v2")
diff --git a/tests/advisor/test_hub_heuristics.py b/tests/advisor/test_hub_heuristics.py
index b43b95522..c19018235 100644
--- a/tests/advisor/test_hub_heuristics.py
+++ b/tests/advisor/test_hub_heuristics.py
@@ -1,8 +1,8 @@
-"""Tests for the offline name-pattern heuristics in `_hub`.
+"""Tests for the offline heuristic fallback in `_hub`.
 
-The advisor must produce a sensible estimate even when HF Hub is
-unreachable, so these tests pin the public `hub_reachable` to False and
-exercise the heuristic path directly.
+The advisor must produce a sensible estimate even when HF Hub is unreachable.
+Without a per-name heuristic, every offline lookup collapses to a single
+BERT-base-sized default — these tests pin that contract.
 """
 
 from __future__ import annotations
@@ -14,41 +14,28 @@
 
 @pytest.fixture(autouse=True)
 def _offline(monkeypatch: pytest.MonkeyPatch) -> None:
-    _hub.hub_reachable.cache_clear()
     _hub.resolve_model.cache_clear()
-    monkeypatch.setattr(_hub, "hub_reachable", lambda *_a, **_kw: False)
+    # Force `_hub_metadata` to behave as if the live Hub were unreachable so
+    # resolve_model falls through to `_heuristic_metadata`.
+    monkeypatch.setattr(_hub, "_hub_metadata", lambda _name: None)
     monkeypatch.setattr(_hub, "_is_warm_cached", lambda _name: False)
 
 
-@pytest.mark.parametrize(
-    ("name", "expected_min_m", "expected_max_m"),
-    [
-        ("microsoft/deberta-v3-large", 200, 500),
-        ("microsoft/deberta-v3-small", 30, 200),
-        ("sentence-transformers/all-MiniLM-L6-v2", 20, 80),
-        ("intfloat/multilingual-e5-large-instruct", 300, 700),
-        ("intfloat/e5-small", 20, 80),
-        ("distilbert-base-uncased", 40, 150),
-        ("bert-base-uncased", 70, 200),
-    ],
-)
-def test_name_heuristic_picks_reasonable_bucket(name: str, expected_min_m: int, expected_max_m: int) -> None:
-    meta = _hub.resolve_model(name)
-    assert meta.confidence == "heuristic"
-    assert expected_min_m <= meta.params_millions <= expected_max_m, (
-        f"{name} got {meta.params_millions}M; expected [{expected_min_m}, {expected_max_m}]"
-    )
-
-
-def test_unknown_name_falls_back_to_bert_base() -> None:
-    meta = _hub.resolve_model("totally-made-up/no-such-model")
-    assert meta.confidence == "heuristic"
-    assert meta.params_millions == pytest.approx(110.0)
+def test_offline_lookup_uses_bert_base_default() -> None:
+    """Every offline lookup returns the same BERT-base-sized fallback."""
+    for name in (
+        "microsoft/deberta-v3-large",
+        "sentence-transformers/all-MiniLM-L6-v2",
+        "totally-made-up/no-such-model",
+    ):
+        meta = _hub.resolve_model(name)
+        assert meta.confidence == "heuristic"
+        assert meta.total_params == _hub._DEFAULT_HEURISTIC_PARAMS
 
 
 def test_weights_gb_matches_params_times_bytes() -> None:
     meta = _hub.resolve_model("microsoft/deberta-v3-large")
-    expected_gb = meta.params_millions * 1_000_000 * meta.weight_bytes_per_param / (1024**3)
+    expected_gb = meta.total_params * meta.weight_bytes_per_param / (1024**3)
     assert meta.weights_gb == pytest.approx(expected_gb)
 
 
@@ -75,5 +62,5 @@ def test_metadata_fallback_uses_heuristic_when_hub_unreachable() -> None:
     the live Hub is unreachable (autouse fixture forces offline)."""
     meta = _hub.resolve_model("microsoft/deberta-v3-large")
     assert meta.confidence == "heuristic"
-    assert meta.params_millions > 0
+    assert meta.total_params > 0
     assert meta.disk_gb > 0
diff --git a/tests/advisor/test_render.py b/tests/advisor/test_render.py
index 2c0604a11..7a806c7f2 100644
--- a/tests/advisor/test_render.py
+++ b/tests/advisor/test_render.py
@@ -56,7 +56,7 @@ def test_contains_phase_blocks(self) -> None:
         out = render_text(_populated_report())
         assert "Resource:" in out
         assert "Data:" in out
-        # Config phase has no findings → block omitted
+        # Config phase has no findings -> block omitted
         assert "Config:" not in out
 
     def test_includes_drivers_block(self) -> None:
@@ -119,7 +119,7 @@ def _two_reports(self) -> list[tuple[str, PreflightReport]]:
 
     def test_lists_chosen_preset_when_present(self) -> None:
         out = render_recommendation(self._two_reports(), chosen="a")
-        assert "→ a" in out
+        assert "-> a" in out
 
     def test_handles_no_chosen(self) -> None:
         out = render_recommendation(self._two_reports(), chosen=None)
@@ -140,7 +140,7 @@ class TestBatchHint:
     """Per-driver batch cell rendered in the Drivers-of-cost table."""
 
     def test_arrow_when_max_differs(self) -> None:
-        assert _batch_hint({"batch_size": 64, "max_batch_size": 32}) == "64 → 32"
+        assert _batch_hint({"batch_size": 64, "max_batch_size": 32}) == "64 -> 32"
 
     def test_plain_when_max_equals_current(self) -> None:
         assert _batch_hint({"batch_size": 64, "max_batch_size": 64}) == "64"
@@ -152,7 +152,7 @@ def test_empty_when_no_batch(self) -> None:
         assert _batch_hint({"batch_size": None, "max_batch_size": None}) == ""
 
     def test_increase_arrow(self) -> None:
-        assert _batch_hint({"batch_size": 32, "max_batch_size": 128}) == "32 → 128"
+        assert _batch_hint({"batch_size": 32, "max_batch_size": 128}) == "32 -> 128"
 
 
 def test_dataset_stats_in_text_block() -> None:

From e0f14866a898714df279f73ff1dccd9f288725ed Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:26:39 +0300
Subject: [PATCH 13/16] remove from init

---
 src/autointent/_advisor/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/autointent/_advisor/__init__.py b/src/autointent/_advisor/__init__.py
index 28422c78d..d8eb6f5ba 100644
--- a/src/autointent/_advisor/__init__.py
+++ b/src/autointent/_advisor/__init__.py
@@ -10,10 +10,9 @@
 from ._estimates import run_preflight
 from ._hardware import HardwareProfile, detect_hardware
 from ._report import DatasetStats, Finding, PreflightReport, RecommendationResult, ResourceEstimate, Severity
-from ._workflows import BUNDLED_PRESETS, inspect, load_config, recommend, stats_from_dataset
+from ._workflows import inspect, load_config, recommend, stats_from_dataset
 
 __all__ = [
-    "BUNDLED_PRESETS",
     "DatasetStats",
     "Finding",
     "HardwareProfile",

From 6496b4e1873d93ef5e5cc53cdb43b2612611f4d5 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:29:13 +0300
Subject: [PATCH 14/16] revert pyproject.toml

---
 pyproject.toml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ace1a3c77..276361669 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -298,12 +298,6 @@ module = [
     "dspy.evaluate.auto_evaluation",
     "codecarbon",
     "catboost",
-    "openai",
-    "openai.*",
-    "tiktoken",
-    "peft",
-    "sentence_transformers",
-    "psutil",
 ]
 ignore_missing_imports = true
 

From bc3df74af98ae1e63aac3b3f69391b223d03585d Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:37:55 +0300
Subject: [PATCH 15/16] update typing

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 276361669..7677ac2aa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -120,6 +120,7 @@ typing = [
     "joblib-stubs (>=1.4.2.5.20240918,<2.0.0)",
     "pandas-stubs (>= 2.2.3.250527, <3.0.0)",
     "types-aiofiles (>=24.1.0.20250606)",
+    "types-psutil>=7.2.2.20260518",
 ]
 docs = [
     "sphinx (>=8.1.3,<9.0.0)",

From 7cb0f53e9929465637290dbfce27c7661ee9bd61 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:10:14 +0300
Subject: [PATCH 16/16] refactor

---
 src/autointent/_advisor/_estimates.py         | 798 ------------------
 src/autointent/_advisor/_hub.py               |   2 +-
 .../_advisor/{_workflows.py => workflows.py}  |  17 +-
 src/autointent/custom_types/_types.py         |  12 +-
 tests/advisor/test_estimates_internals.py     |  33 +-
 5 files changed, 26 insertions(+), 836 deletions(-)
 delete mode 100644 src/autointent/_advisor/_estimates.py
 rename src/autointent/_advisor/{_workflows.py => workflows.py} (92%)

diff --git a/src/autointent/_advisor/_estimates.py b/src/autointent/_advisor/_estimates.py
deleted file mode 100644
index faeb4b1ee..000000000
--- a/src/autointent/_advisor/_estimates.py
+++ /dev/null
@@ -1,798 +0,0 @@
-"""Resource-phase estimation: walk the search space and aggregate cost.
-
-Implements an honest worst-case for the modules the proposal lists as
-in-scope. Formulas are intentionally coarse — the advisor's contract is
-"heuristic upper bound, not measurement". Time and VRAM are the noisiest;
-treat them as ballparks, not budgets.
-"""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any
-
-from pydantic import ValidationError
-
-from autointent._optimization_config import OptimizationConfig
-from autointent.configs._embedder import (
-    EmbedderConfig,
-    OpenaiEmbeddingConfig,
-    SentenceTransformerEmbeddingConfig,
-    VllmEmbeddingConfig,
-)
-
-from ._hub import resolve_model
-from ._report import PreflightReport, ResourceEstimate, Severity
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable
-
-    from ._hardware import HardwareProfile
-    from ._hub import ModelMeta
-    from ._report import DatasetStats
-
-_MULTICLASS_THRESHOLD = 2
-_BYTES_PER_GB = 1024**3  # binary GiB convention; matches all advisor byte->GB conversions
-
-# Fallback architecture shape (BERT-base) used only when the model's actual
-# config.json couldn't be fetched from HF Hub — see _hub._shape_from_config.
-_DEFAULT_HIDDEN = 768
-_DEFAULT_LAYERS = 12
-
-logger = logging.getLogger(__name__)
-
-
-def _validated_config(config: dict[str, Any]) -> OptimizationConfig:
-    """Validate ``config`` against the project's canonical ``OptimizationConfig``.
-
-    The advisor is best-effort: a malformed user config should still produce a
-    report (with placeholder costs) rather than crashing, so any validation
-    error falls back to the model defaults.
-    """
-    try:
-        return OptimizationConfig.model_validate(config)
-    except ValidationError as e:
-        logger.warning("Advisor config failed validation; falling back to defaults: %s", e)
-        # OptimizationConfig requires `search_space`; build a minimal valid default.
-        return OptimizationConfig.model_validate({"search_space": []})
-
-
-_TIGHT_RATIO = 0.9
-
-# Union variants of EmbedderConfig that carry a model_name attribute.
-# HashingVectorizerEmbeddingConfig and the bare BaseEmbedderConfig don't have
-# one (sklearn vectorizer / abstract base), so we filter them out below.
-_MODEL_BACKED_EMBEDDERS = (
-    SentenceTransformerEmbeddingConfig,
-    OpenaiEmbeddingConfig,
-    VllmEmbeddingConfig,
-)
-
-
-def _embedder_model_name(embedder: EmbedderConfig) -> str | None:
-    """Return the embedder's model_name when the config variant carries one."""
-    if isinstance(embedder, _MODEL_BACKED_EMBEDDERS):
-        return embedder.model_name
-    return None
-
-
-# Maps each fine-tunable transformer module to its training-mode label.
-# Modules not listed (or listed as "inference") run the encoder forward-only.
-# Note: dnnc keeps the cross-encoder frozen and trains an sklearn LogisticRegressionCV
-# head on top of its features (see autointent._wrappers.ranker.Ranker._fit), so the
-# encoder's VRAM profile matches inference rather than fine-tuning.
-_TRANSFORMER_TRAINING_MODE = {
-    "bert": "full-finetune",
-    "ptuning": "lora",
-    "lora": "lora",
-}
-
-# Fallback max_length when the search-space entry doesn't pin it. Used both as
-# the default in _vram_for_transformer and in the entry-walk seq_len resolution.
-_DEFAULT_SEQ_LEN = 128
-
-# Coefficients for the linear / catboost time formulas (proposal §"Algorithm").
-_LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8
-_CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9
-_CATBOOST_GPU_SPEEDUP = 10.0
-# LogisticRegressionCV defaults: Cs=10, cv=3 -> 31 inner fits + 1 final refit.
-_LOGREG_CV_MULTIPLIER = 31
-_CATBOOST_DEFAULT_BINS = 254
-# Bytes per histogram bucket / tree node — order-of-magnitude constants.
-_CATBOOST_BYTES_PER_TREE_NODE = 32
-
-
-def _extract_model_names(module_entry: dict[str, Any]) -> list[str]:
-    """Pull model name(s) from a search-space module entry."""
-    candidates: list[str] = []
-    cfg = module_entry.get("classification_model_config")
-    if isinstance(cfg, list):
-        candidates.extend(c["model_name"] for c in cfg if isinstance(c, dict) and c.get("model_name"))
-    elif isinstance(cfg, dict) and cfg.get("model_name"):
-        candidates.append(cfg["model_name"])
-    embedder_cfg = module_entry.get("embedder_config")
-    if isinstance(embedder_cfg, list):
-        candidates.extend(c["model_name"] for c in embedder_cfg if isinstance(c, dict) and c.get("model_name"))
-    elif isinstance(embedder_cfg, dict) and embedder_cfg.get("model_name"):
-        candidates.append(embedder_cfg["model_name"])
-    return candidates
-
-
-def _max_int(value: Any, default: int) -> int:  # noqa: ANN401
-    if value is None:
-        return default
-    if isinstance(value, list) and value:
-        return max(int(x) for x in value)
-    if isinstance(value, dict):
-        return int(value.get("high", default))
-    try:
-        return int(value)
-    except (TypeError, ValueError):
-        return default
-
-
-def _walk_modules_indexed(
-    search_space: list[dict[str, Any]],
-) -> Iterable[tuple[int, str, dict[str, Any]]]:
-    """Yield (node_index, node_type, module_entry) — index lets us bound per-node max cost."""
-    for node_idx, node in enumerate(search_space or []):
-        node_type = node.get("node_type", "?")
-        for entry in node.get("search_space", []) or []:
-            yield node_idx, node_type, entry
-
-
-def _walk_modules(search_space: list[dict[str, Any]]) -> Iterable[tuple[str, dict[str, Any]]]:
-    """Yield (node_type, module_entry) pairs — index-agnostic view over `_walk_modules_indexed`."""
-    for _, node_type, entry in _walk_modules_indexed(search_space):
-        yield node_type, entry
-
-
-def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float:
-    """Weight-side VRAM in GB — weights + grads + Adam optimizer state. Excludes activations.
-
-    Modes:
-      * ``inference``: forward only — weights + ~30% intermediate-tensor overhead.
-      * ``lora``: frozen base + small trainable adapters + their grads/optimizer (~0.5 GB).
-      * ``full-finetune`` (default): weights + grads + Adam (m, v) = 4x weights.
-    """
-    weights_gb = meta.weights_gb
-    if mode == "inference":
-        return weights_gb * 1.3
-    if mode == "lora":
-        return weights_gb * 1.3 + 0.5
-    return weights_gb * 4.0
-
-
-def _vram_for_transformer(
-    meta: ModelMeta,
-    mode: str,
-    mixed_precision: bool,
-    *,
-    batch_size: int = 0,
-    seq_len: int = _DEFAULT_SEQ_LEN,
-) -> float:
-    """Total VRAM in GB: weights + grads + optimizer state + activations x batch.
-
-    Activation accounting differs by mode — training keeps per-layer outputs for
-    backward; inference only needs one or two layers in flight.
-    """
-    base = _weights_vram_for_transformer(meta, mode)
-    if batch_size <= 0:
-        return base
-    per_sample = _activations_gb_per_sample(
-        meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference"
-    )
-    return base + per_sample * batch_size
-
-
-def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float:
-    """RAM in GB. Loose upper bound."""
-    return meta.weights_gb + (stats.n_samples * stats.avg_tokens * 4) / _BYTES_PER_GB
-
-
-def _floor_to_power_of_two(n: int) -> int:
-    """Largest power of two ≤ n; returns 0 when n < 1."""
-    if n < 1:
-        return 0
-    power = 1
-    while power * 2 <= n:
-        power *= 2
-    return power
-
-
-def _n_layers(meta: ModelMeta | None) -> int:
-    """Layer count from the model's ``config.json``; falls back to BERT-base when absent."""
-    if meta is not None and meta.n_layers is not None:
-        return meta.n_layers
-    return _DEFAULT_LAYERS
-
-
-def _activations_gb_per_sample(
-    meta: ModelMeta | None,
-    seq_len: int,
-    *,
-    mixed_precision: bool,
-    is_training: bool,
-) -> float:
-    """Heuristic activation memory per sample.
-
-    Training: ``seq_len x hidden x layers x const`` — per-layer outputs are kept
-    for backward.
-    Inference: ``seq_len x hidden x const`` — only one or two layers' outputs in
-    flight at once.
-    Mixed precision halves activation bytes.
-    """
-    hidden = _embedder_dim(meta)
-    # Training keeps every layer's outputs for backward -> scales x n_layers.
-    # The 16-byte/token/layer coefficient bundles fp32 activation + ~4x backward overhead.
-    # Inference only holds ~1-2 layers' outputs in flight at once.
-    bytes_per_sample = seq_len * hidden * _n_layers(meta) * 16 if is_training else seq_len * hidden * 8
-    if mixed_precision:
-        bytes_per_sample //= 2
-    return bytes_per_sample / _BYTES_PER_GB
-
-
-def _max_fitting_batch_size(
-    *,
-    weight_vram_gb: float,
-    vram_budget_gb: float,
-    per_sample_gb: float,
-) -> int:
-    """Largest batch that keeps total VRAM under the AMPLE/TIGHT threshold.
-
-    Returns 0 when even the weights blow the budget. Result is rounded down to
-    the nearest power of two.
-    """
-    if per_sample_gb <= 0:
-        return 0
-    target_vram = vram_budget_gb * _TIGHT_RATIO
-    available_for_activations = target_vram - weight_vram_gb
-    if available_for_activations <= 0:
-        return 0
-    return _floor_to_power_of_two(int(available_for_activations / per_sample_gb))
-
-
-def _embedder_dim(meta: ModelMeta | None) -> int:
-    """Hidden size from the model's ``config.json``; falls back to BERT-base when absent."""
-    if meta is not None and meta.hidden_size is not None:
-        return meta.hidden_size
-    return _DEFAULT_HIDDEN
-
-
-def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None:
-    if not seen_models:
-        return None
-    return max(seen_models.values(), key=lambda m: m.total_params)
-
-
-def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float:
-    """Float64 design matrix dominates; coefficients and L-BFGS history are small."""
-    data_bytes = 8.0 * stats.n_samples * embedder_dim
-    coef_bytes = 8.0 * max(1, stats.n_classes) * embedder_dim
-    lbfgs_bytes = 10.0 * 8.0 * embedder_dim
-    return (data_bytes + coef_bytes + lbfgs_bytes) / _BYTES_PER_GB
-
-
-def _time_for_linear(
-    *,
-    n_trials: int,
-    n_samples: int,
-    embedder_dim: int,
-    max_iter: int,
-    cv_multiplier: int,
-    class_multiplier: int,
-) -> float:
-    seconds = (
-        n_trials
-        * _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER
-        * n_samples
-        * embedder_dim
-        * max_iter
-        * cv_multiplier
-        * class_multiplier
-    )
-    return seconds / 3600.0
-
-
-def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int, depth: int) -> float:
-    data_bytes = 4.0 * stats.n_samples * n_features
-    histograms_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS
-    trees_bytes = iterations * (2**depth) * _CATBOOST_BYTES_PER_TREE_NODE
-    return float((data_bytes + histograms_bytes + trees_bytes) / _BYTES_PER_GB)
-
-
-def _time_for_catboost(
-    *,
-    n_trials: int,
-    n_samples: int,
-    n_features: int,
-    iterations: int,
-    depth: int,
-    class_multiplier: int,
-    on_gpu: bool,
-) -> float:
-    coeff = _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER
-    if on_gpu:
-        coeff /= _CATBOOST_GPU_SPEEDUP
-    seconds = n_trials * iterations * coeff * n_samples * n_features * depth * class_multiplier
-    return seconds / 3600.0
-
-
-def _time_for_transformer(
-    *,
-    n_trials: int,
-    epochs: int,
-    batch_size: int,
-    n_samples: int,
-) -> float:
-    """Transformer training time in hours, assuming a flat 1 second per step.
-
-    The advisor has no real wall-time calibration across hardware tiers / model
-    sizes, so the report uses ``time_hours`` as a step-count proxy rather than
-    pretending to estimate seconds. Users should treat the number as ordering /
-    ballpark information, not a budget.
-    """
-    steps = max(1, (n_samples // max(1, batch_size))) * epochs
-    return (n_trials * steps) / 3600.0
-
-
-def _classify_severity(estimate: float, budget: float) -> Severity:
-    if estimate <= 0:
-        return Severity.AMPLE
-    if budget <= 0:
-        return Severity.TIGHT
-    ratio = estimate / budget
-    if ratio >= 1:
-        return Severity.OVER
-    if ratio >= _TIGHT_RATIO:
-        return Severity.TIGHT
-    return Severity.AMPLE
-
-
-@dataclass
-class _ModuleEstimate:
-    """Per-module cost contribution + the dict that gets rendered in the report."""
-
-    driver: dict[str, Any]
-    vram_gb: float
-    ram_gb: float
-    time_hours: float
-    model_weights_gb: float = 0.0
-
-
-def _refit_factor(*, refit_after: bool, n_trials: int) -> float:
-    """Wall-time multiplier for ``refit_after=True`` (amortized 1/n_trials extra)."""
-    return 1 + 1.0 / max(1, n_trials) if refit_after else 1.0
-
-
-def _split_entries(
-    search_space: list[dict[str, Any]],
-) -> tuple[list[tuple[int, str, dict[str, Any]]], list[tuple[int, str, dict[str, Any]]]]:
-    """Partition search-space entries into (transformer-bearing, classic)."""
-    transformer: list[tuple[int, str, dict[str, Any]]] = []
-    classic: list[tuple[int, str, dict[str, Any]]] = []
-    for node_idx, node_type, entry in _walk_modules_indexed(search_space):
-        bucket = classic if entry.get("module_name") in {"linear", "catboost"} else transformer
-        bucket.append((node_idx, node_type, entry))
-    return transformer, classic
-
-
-def _estimate_transformer_model(
-    *,
-    meta: ModelMeta,
-    entry: dict[str, Any],
-    node_type: str,
-    module: str,
-    name: str,
-    stats: DatasetStats,
-    hardware: HardwareProfile,
-    n_trials: int,
-    refit_after: bool,
-) -> _ModuleEstimate:
-    """One row of cost for a transformer module + a specific model checkpoint."""
-    mixed_precision = entry.get("dtype") in {"fp16", "bf16"}
-    mode = _TRANSFORMER_TRAINING_MODE.get(module, "inference")
-    batch_size = _max_int(entry.get("batch_size"), 32)
-    epochs = _max_int(entry.get("num_train_epochs"), 1 if mode == "inference" else 10)
-    seq_len = _max_int(entry.get("max_length"), _DEFAULT_SEQ_LEN)
-
-    vram = _vram_for_transformer(meta, mode, mixed_precision, batch_size=batch_size, seq_len=seq_len)
-    ram = _ram_for_module(meta, stats)
-
-    driver_max_batch: int | None = None
-    if hardware.vram_gb > 0:
-        driver_max_batch = _max_fitting_batch_size(
-            weight_vram_gb=_weights_vram_for_transformer(meta, mode),
-            vram_budget_gb=hardware.vram_gb,
-            per_sample_gb=_activations_gb_per_sample(
-                meta, seq_len, mixed_precision=mixed_precision, is_training=mode != "inference"
-            ),
-        )
-
-    time_h = _time_for_transformer(
-        n_trials=n_trials,
-        epochs=epochs,
-        batch_size=batch_size,
-        n_samples=stats.n_samples,
-    )
-    if mode != "inference":
-        time_h *= _refit_factor(refit_after=refit_after, n_trials=n_trials)
-
-    return _ModuleEstimate(
-        driver={
-            "node_type": node_type,
-            "module": module,
-            "model": name,
-            "mode": mode,
-            "vram_gb": round(vram, 2),
-            "ram_gb": round(ram, 2),
-            "time_hours": round(time_h, 2),
-            "batch_size": batch_size,
-            "max_batch_size": driver_max_batch,
-            "confidence": meta.confidence,
-        },
-        vram_gb=vram,
-        ram_gb=ram,
-        time_hours=time_h,
-        model_weights_gb=meta.weights_gb,
-    )
-
-
-def _estimate_classic_entry(
-    *,
-    entry: dict[str, Any],
-    node_type: str,
-    embedder_meta: ModelMeta | None,
-    embedder_dim: int,
-    stats: DatasetStats,
-    hardware: HardwareProfile,
-    n_trials: int,
-    refit_after: bool,
-) -> _ModuleEstimate | None:
-    """Cost row for a linear or catboost scorer (returns ``None`` for any other module)."""
-    module = entry.get("module_name", "?")
-    refit = _refit_factor(refit_after=refit_after, n_trials=n_trials)
-    # Both multinomial (multiclass) and one-vs-rest (multilabel) LR scale linearly in n_classes.
-    class_multiplier = max(1, stats.n_classes)
-
-    if module == "linear":
-        cv_multiplier = 1 if stats.multilabel else _LOGREG_CV_MULTIPLIER
-        ram = _ram_for_linear(stats=stats, embedder_dim=embedder_dim)
-        time_h = (
-            _time_for_linear(
-                n_trials=n_trials,
-                n_samples=stats.n_samples,
-                embedder_dim=embedder_dim,
-                max_iter=_max_int(entry.get("max_iter"), 100),
-                cv_multiplier=cv_multiplier,
-                class_multiplier=class_multiplier,
-            )
-            * refit
-        )
-        vram = 0.0
-        mode = "linear-cv" if cv_multiplier > 1 else "linear"
-    elif module == "catboost":
-        on_gpu = entry.get("task_type") == "GPU" and hardware.accelerator == "cuda"
-        # CatBoost MultiClass loss grows per-class trees only above binary; binary uses
-        # Logloss with one tree per iteration.
-        cb_class_mult = class_multiplier if stats.n_classes > _MULTICLASS_THRESHOLD or stats.multilabel else 1
-        iterations = _max_int(entry.get("iterations"), 1000)
-        depth = _max_int(entry.get("depth"), 6)
-        ram_total = _ram_for_catboost(stats=stats, n_features=embedder_dim, iterations=iterations, depth=depth)
-        time_h = (
-            _time_for_catboost(
-                n_trials=n_trials,
-                n_samples=stats.n_samples,
-                n_features=embedder_dim,
-                iterations=iterations,
-                depth=depth,
-                class_multiplier=cb_class_mult,
-                on_gpu=on_gpu,
-            )
-            * refit
-        )
-        vram, ram = (ram_total, 0.0) if on_gpu else (0.0, ram_total)
-        mode = "catboost-gpu" if on_gpu else "catboost"
-    else:
-        return None
-
-    return _ModuleEstimate(
-        driver={
-            "node_type": node_type,
-            "module": module,
-            "model": embedder_meta.name if embedder_meta else "(no embedder)",
-            "mode": mode,
-            "vram_gb": round(vram, 2),
-            "ram_gb": round(ram, 2),
-            "time_hours": round(time_h, 2),
-            "batch_size": None,
-            "max_batch_size": None,
-            "confidence": embedder_meta.confidence if embedder_meta else "heuristic",
-        },
-        vram_gb=vram,
-        ram_gb=ram,
-        time_hours=time_h,
-    )
-
-
-def _aggregate_disk(
-    estimate: ResourceEstimate,
-    seen_models: dict[str, ModelMeta],
-    node_max_weights: dict[int, float],
-    *,
-    dump_modules: bool,
-    n_trials: int,
-) -> None:
-    """Fold per-model download/cached sizes into ``estimate`` and apply dump-modules accounting."""
-    for meta in seen_models.values():
-        if meta.cached_locally:
-            estimate.disk_cached_gb += meta.disk_gb
-        else:
-            estimate.disk_download_gb += meta.disk_gb
-    if dump_modules:
-        # Each trial selects one variant per node, so per-trial dumped weights
-        # are bounded by the heaviest module in each node, summed across nodes.
-        estimate.disk_dump_gb = sum(node_max_weights.values()) * n_trials
-
-
-def _emit_resource_findings(
-    report: PreflightReport,
-    estimate: ResourceEstimate,
-    hardware: HardwareProfile,
-    *,
-    n_jobs: int,
-) -> None:
-    """Translate aggregated estimates into VRAM/RAM/disk/time findings on the report."""
-    parallel_gpu = n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}
-    effective_vram = estimate.vram_gb * n_jobs if parallel_gpu else estimate.vram_gb
-    # MPS shares one unified pool: parallel workers each allocate weights+activations
-    # in RAM, so peak RAM also scales with n_jobs on Apple Silicon.
-    effective_ram = estimate.ram_gb * n_jobs if n_jobs > 1 and hardware.accelerator == "mps" else estimate.ram_gb
-
-    if hardware.accelerator == "cpu" and effective_vram > 0:
-        report.add(
-            "resource",
-            Severity.TIGHT,
-            f"No GPU detected; transformer modules will be very slow (worst case ~{estimate.time_hours:.1f} h).",
-            metric="vram",
-        )
-    else:
-        msg = f"VRAM ~{effective_vram:.1f} GB"
-        if n_jobs > 1:
-            msg += f" (= per-trial {estimate.vram_gb:.1f} GB × {n_jobs} parallel trials)"
-        msg += f" vs available {hardware.vram_gb:.1f} GB"
-        report.add("resource", _classify_severity(effective_vram, hardware.vram_gb), msg, metric="vram")
-
-    report.add(
-        "resource",
-        _classify_severity(effective_ram, hardware.ram_gb),
-        f"RAM ~{effective_ram:.1f} GB vs available {hardware.ram_gb:.1f} GB",
-        metric="ram",
-    )
-
-    disk_total = estimate.disk_download_gb + estimate.disk_dump_gb
-    disk_msg = f"Disk ~{estimate.disk_download_gb:.1f} GB to download"
-    if estimate.disk_cached_gb > 0:
-        disk_msg += f", {estimate.disk_cached_gb:.1f} GB already cached"
-    if estimate.disk_dump_gb > 0:
-        disk_msg += f", +{estimate.disk_dump_gb:.1f} GB during training (dump_modules=True)"
-    disk_msg += f" vs {hardware.free_disk_gb:.0f} GB free"
-    report.add("resource", _classify_severity(disk_total, hardware.free_disk_gb), disk_msg, metric="disk")
-
-    if estimate.time_hours > 0:
-        report.add(
-            "resource",
-            Severity.AMPLE,
-            f"Time ~{estimate.time_hours:.1f} h (worst case, no HPO pruning)",
-            metric="time",
-        )
-
-
-def _resource_phase(
-    config: dict[str, Any],
-    stats: DatasetStats,
-    hardware: HardwareProfile,
-    report: PreflightReport,
-    *,
-    refit_after: bool = False,
-) -> None:
-    cfg = _validated_config(config)
-    n_trials = cfg.hpo_config.n_trials
-    n_jobs = cfg.hpo_config.n_jobs
-    dump_modules = cfg.logging_config.dump_modules
-
-    seen_models: dict[str, ModelMeta] = {}
-    global_embedder = _embedder_model_name(cfg.embedder_config)
-    if global_embedder:
-        seen_models[global_embedder] = resolve_model(global_embedder)
-
-    transformer_entries, classic_entries = _split_entries(cfg.search_space)
-
-    # First pass: transformer modules (also populates seen_models for the classic pass).
-    module_estimates: list[_ModuleEstimate] = []
-    node_max_weights: dict[int, float] = {}
-    for node_idx, node_type, entry in transformer_entries:
-        module = entry.get("module_name", "?")
-        model_names = _extract_model_names(entry)
-        if not model_names and global_embedder and module in {"knn", "mlknn"}:
-            model_names = [global_embedder]
-        for name in model_names:
-            meta = seen_models.setdefault(name, resolve_model(name))
-            me = _estimate_transformer_model(
-                meta=meta,
-                entry=entry,
-                node_type=node_type,
-                module=module,
-                name=name,
-                stats=stats,
-                hardware=hardware,
-                n_trials=n_trials,
-                refit_after=refit_after,
-            )
-            module_estimates.append(me)
-            # Track heaviest weight per node so dump_modules is bounded by one
-            # selected variant per node x n_trials, not the sum of all candidates.
-            node_max_weights[node_idx] = max(node_max_weights.get(node_idx, 0.0), me.model_weights_gb)
-
-    # Second pass: linear / catboost — cost depends on embedder_dim, not a checkpoint.
-    embedder_meta = _largest_embedder(seen_models)
-    embedder_dim = _embedder_dim(embedder_meta)
-    for _, node_type, entry in classic_entries:
-        classic_estimate = _estimate_classic_entry(
-            entry=entry,
-            node_type=node_type,
-            embedder_meta=embedder_meta,
-            embedder_dim=embedder_dim,
-            stats=stats,
-            hardware=hardware,
-            n_trials=n_trials,
-            refit_after=refit_after,
-        )
-        if classic_estimate is not None:
-            module_estimates.append(classic_estimate)
-
-    estimate = ResourceEstimate(parallel_factor=n_jobs)
-    for me in module_estimates:
-        estimate.vram_gb = max(estimate.vram_gb, me.vram_gb)
-        estimate.ram_gb = max(estimate.ram_gb, me.ram_gb)
-        estimate.time_hours += me.time_hours
-        estimate.drivers.append(me.driver)
-
-    _aggregate_disk(estimate, seen_models, node_max_weights, dump_modules=dump_modules, n_trials=n_trials)
-
-    # Flip low_confidence if any model fell back to the heuristic path (Hub
-    # unreachable, repo missing safetensors metadata, local-path checkpoint).
-    heuristic_models = [m.name for m in seen_models.values() if m.confidence == "heuristic"]
-    if heuristic_models:
-        report.low_confidence = True
-        report.notes.append(
-            f"Heuristic fallback used for {len(heuristic_models)} model(s) — sizes are BERT-base "
-            f"defaults: {', '.join(heuristic_models[:3])}{'...' if len(heuristic_models) > 3 else ''}",  # noqa: PLR2004
-        )
-
-    report.resource = estimate
-    _emit_resource_findings(report, estimate, hardware, n_jobs=n_jobs)
-
-
-def _config_phase(
-    config: dict[str, Any],
-    hardware: HardwareProfile,
-    report: PreflightReport,
-) -> None:
-    hpo = config.get("hpo_config") or {}
-    n_jobs = int(hpo.get("n_jobs", 1))
-
-    if n_jobs > 1 and hardware.accelerator in {"cuda", "mps"}:
-        report.add(
-            "config",
-            Severity.TIGHT,
-            f"hpo_config.n_jobs={n_jobs} on a single GPU multiplies VRAM demand by {n_jobs}×.",
-        )
-
-    uses_catboost_gpu = False
-    for _, entry in _walk_modules(config.get("search_space") or []):
-        if entry.get("module_name") == "catboost" and entry.get("task_type") == "GPU":
-            uses_catboost_gpu = True
-            break
-    if uses_catboost_gpu and hardware.accelerator != "cuda":
-        report.add(
-            "config",
-            Severity.TIGHT,
-            "CatBoost task_type=GPU configured but no CUDA detected — will fall back to CPU.",
-        )
-
-
-def _data_phase(
-    config: dict[str, Any],
-    stats: DatasetStats,
-    report: PreflightReport,
-) -> None:
-    # token-length truncation (heuristic — we use stats.p95_tokens vs configured max_length)
-    p95 = stats.p95_tokens or int(stats.avg_tokens * 2.5)
-    for _, entry in _walk_modules(config.get("search_space") or []):
-        max_len_value = entry.get("max_length")
-        if max_len_value is None:
-            continue
-        max_len = _max_int(max_len_value, 512)
-        if p95 > max_len:
-            severity = Severity.OVER if p95 > max_len * 1.5 else Severity.TIGHT
-            module_name = entry.get("module_name", "?")
-            report.add(
-                "data",
-                severity,
-                f"Train tokens p95~{p95} exceeds {module_name}.max_length={max_len}; expect silent truncation.",
-            )
-
-    # rare class x linear-CV (LogisticRegressionCV cv=3 needs >=3 samples/class;
-    # multilabel path uses one-vs-rest without CV so the failure can't occur there)
-    has_linear = any(e.get("module_name") == "linear" for _, e in _walk_modules(config.get("search_space") or []))
-    if has_linear and stats.rare_classes and not stats.multilabel:
-        report.add(
-            "data",
-            Severity.OVER,
-            (f"LogisticRegressionCV (cv=3) will fail: classes {stats.rare_classes[:5]} have <3 samples."),
-        )
-
-    # partial descriptions x description scorer
-    description_modules = {"description_bi", "description_cross", "description_llm"}
-    has_description = any(
-        e.get("module_name") in description_modules for _, e in _walk_modules(config.get("search_space") or [])
-    )
-    if has_description and stats.has_descriptions is False:
-        report.add(
-            "data",
-            Severity.OVER,
-            "description scorer present but intent descriptions are missing — fill them in or drop the scorer.",
-        )
-
-
-def run_preflight(
-    config: dict[str, Any],
-    stats: DatasetStats,
-    hardware: HardwareProfile,
-    *,
-    preset_name: str | None = None,
-    refit_after: bool = False,
-) -> PreflightReport:
-    """Run all three phases and return one report.
-
-    Args:
-        config: parsed preset / OptimizationConfig dict (top-level keys:
-            ``search_space``, ``hpo_config``, optional ``embedder_config``,
-            optional ``logging_config.dump_modules``).
-        stats: dataset statistics (real or placeholder).
-        hardware: detected hardware profile.
-        preset_name: optional friendly name for the report header.
-        refit_after: matches the ``Pipeline.fit(refit_after=...)`` argument.
-            When True, time estimates include the extra refit-on-full-data pass.
-
-    Returns:
-        PreflightReport with findings across resource/data/config phases.
-    """
-    report = PreflightReport(
-        preset_name=preset_name,
-        hardware={
-            "accelerator": hardware.accelerator,
-            "device_name": hardware.device_name,
-            "vram_gb": round(hardware.vram_gb, 2),
-            "ram_gb": round(hardware.ram_gb, 2),
-            "free_disk_gb": round(hardware.free_disk_gb, 2),
-            "device_class": hardware.device_class,
-        },
-        dataset={
-            "n_samples": stats.n_samples,
-            "n_classes": stats.n_classes,
-            "avg_tokens": stats.avg_tokens,
-            "p95_tokens": stats.p95_tokens,
-            "multilabel": stats.multilabel,
-            "source": stats.source,
-        },
-    )
-    report.notes.extend(hardware.notes)
-
-    _resource_phase(config, stats, hardware, report, refit_after=refit_after)
-    _data_phase(config, stats, report)
-    _config_phase(config, hardware, report)
-
-    return report
diff --git a/src/autointent/_advisor/_hub.py b/src/autointent/_advisor/_hub.py
index a5b6126a5..677e6045c 100644
--- a/src/autointent/_advisor/_hub.py
+++ b/src/autointent/_advisor/_hub.py
@@ -94,7 +94,7 @@ def _hub_metadata(model_name: str) -> ModelMeta | None:
         return None
     # Bytes-per-element for safetensors dtype strings. Used to convert the per-dtype
     # parameter counts (info.safetensors.parameters) into a weighted average
-    # bytes-per-param for mixed-precision repos.
+    # bytes-per-param when a checkpoint stores tensors in multiple dtypes.
     _dtype_bytes: dict[str, int] = {
         "F64": 8,
         "F32": 4,
diff --git a/src/autointent/_advisor/_workflows.py b/src/autointent/_advisor/workflows.py
similarity index 92%
rename from src/autointent/_advisor/_workflows.py
rename to src/autointent/_advisor/workflows.py
index 0bd7ee5bf..2331e225a 100644
--- a/src/autointent/_advisor/_workflows.py
+++ b/src/autointent/_advisor/workflows.py
@@ -20,9 +20,9 @@
 from autointent.custom_types import SearchSpacePreset
 from autointent.utils import load_preset
 
-from ._estimates import run_preflight
 from ._hardware import detect_hardware
 from ._report import DatasetStats, RecommendationResult, Severity
+from .runner import run_preflight
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -196,10 +196,11 @@ def recommend(
         ``RecommendationResult`` with the chosen preset name and full results list.
 
     Note:
-        Among feasible presets we pick the one with the largest estimated
-        ``time_hours`` (ties broken alphabetically). Higher-quality presets cost
-        more wall-time, so the slowest feasible preset is also the heaviest
-        preset that still fits the hardware — i.e. "use what you have".
+        Among feasible presets we pick the heaviest one that still fits the
+        hardware budget — "use what you have" semantics. This is a *cost*
+        ranking, not a quality ranking: a heavier preset is not strictly better
+        and may overfit on small datasets where a classic-* preset would win on
+        accuracy. Override ``presets=`` if you want a different ranking.
     """
     hardware = detect_hardware(vram_budget_gb=budget_vram_gb)
     stats = stats or DatasetStats.placeholder()
@@ -221,11 +222,9 @@ def recommend(
             )
         results.append((preset, report))
 
-    # Rank by Literal position (lower index = higher quality); presets the user
-    # passed via the ``presets`` override but not in BUNDLED_PRESETS sort last.
-    quality_rank = {name: i for i, name in enumerate(BUNDLED_PRESETS)}
+    cost_rank = {name: i for i, name in enumerate(BUNDLED_PRESETS)}
     feasible = [(name, r) for name, r in results if r.is_feasible]
-    feasible.sort(key=lambda pair: (quality_rank.get(pair[0], len(BUNDLED_PRESETS)), pair[0]))
+    feasible.sort(key=lambda pair: (cost_rank.get(pair[0], len(BUNDLED_PRESETS)), pair[0]))
     chosen = feasible[0][0] if feasible else None
 
     return RecommendationResult(chosen=chosen, results=results)
diff --git a/src/autointent/custom_types/_types.py b/src/autointent/custom_types/_types.py
index a54da368d..59e6b87a3 100644
--- a/src/autointent/custom_types/_types.py
+++ b/src/autointent/custom_types/_types.py
@@ -128,10 +128,14 @@ class Split:
     "zero-shot-encoders",
     "classic-light",
 ]
-"""Bundled search-space presets, listed in descending quality order.
-
-The order is consumed by ``autointent._advisor.recommend`` to pick the
-highest-quality feasible preset (lower index = higher quality)."""
+"""Bundled search-space presets, listed in descending resource-cost order.
+
+Heavier presets explore more / larger models and take longer to run. The order
+is a cost ranking, **not** a quality ranking: a heavier preset is not strictly
+better — e.g. ``transformers-heavy`` will overfit on tiny datasets where a
+classic-* preset wins on accuracy. ``autointent._advisor.recommend`` uses this
+ordering to pick the heaviest preset that still fits the hardware budget,
+which is a reasonable default but not always the right choice for the data."""
 
 
 class Document(BaseModel):
diff --git a/tests/advisor/test_estimates_internals.py b/tests/advisor/test_estimates_internals.py
index 3fa293b7e..3d4e4e526 100644
--- a/tests/advisor/test_estimates_internals.py
+++ b/tests/advisor/test_estimates_internals.py
@@ -138,18 +138,18 @@ def meta(self) -> ModelMeta:
         )
 
     def test_full_finetune_is_larger_than_lora_is_larger_than_inference(self, meta: ModelMeta) -> None:
-        inference = _vram_for_transformer(meta, "inference", mixed_precision=False)
-        lora = _vram_for_transformer(meta, "lora", mixed_precision=False)
-        full = _vram_for_transformer(meta, "full-finetune", mixed_precision=False)
+        inference = _vram_for_transformer(meta, "inference")
+        lora = _vram_for_transformer(meta, "lora")
+        full = _vram_for_transformer(meta, "full-finetune")
         assert inference < lora < full
 
     def test_inference_activations_are_smaller_than_training(self, meta: ModelMeta) -> None:
         """Inference doesn't store per-layer outputs for backward — activation memory
         should be many times smaller than training at the same batch_size."""
-        train_total = _vram_for_transformer(meta, "full-finetune", False, batch_size=64, seq_len=128)
-        train_weights = _vram_for_transformer(meta, "full-finetune", False, batch_size=0)
-        inf_total = _vram_for_transformer(meta, "inference", False, batch_size=64, seq_len=128)
-        inf_weights = _vram_for_transformer(meta, "inference", False, batch_size=0)
+        train_total = _vram_for_transformer(meta, "full-finetune", batch_size=64, seq_len=128)
+        train_weights = _vram_for_transformer(meta, "full-finetune", batch_size=0)
+        inf_total = _vram_for_transformer(meta, "inference", batch_size=64, seq_len=128)
+        inf_weights = _vram_for_transformer(meta, "inference", batch_size=0)
         train_acts = train_total - train_weights
         inf_acts = inf_total - inf_weights
         assert inf_acts > 0
@@ -157,21 +157,6 @@ def test_inference_activations_are_smaller_than_training(self, meta: ModelMeta)
         # 12-layer model: training activations should be at least ~5x inference.
         assert train_acts / inf_acts > 5
 
-    def test_amp_does_not_reduce_weight_side_vram(self, meta: ModelMeta) -> None:
-        """Weight-side AMP accounting: fp16 weights+grads (W) + fp32 master copy (W)
-        + fp32 Adam moments (2W) = 4W, identical to pure fp32. AMP's savings live
-        in activations, not the optimizer."""
-        full_fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False, batch_size=0)
-        full_amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=0)
-        assert full_amp == pytest.approx(full_fp32)
-
-    def test_amp_does_reduce_activation_side_vram(self, meta: ModelMeta) -> None:
-        """When a batch is configured, AMP halves activation bytes — total VRAM
-        with batch should be strictly smaller under AMP than fp32."""
-        fp32 = _vram_for_transformer(meta, "full-finetune", mixed_precision=False, batch_size=64, seq_len=128)
-        amp = _vram_for_transformer(meta, "full-finetune", mixed_precision=True, batch_size=64, seq_len=128)
-        assert amp < fp32
-
 
 def test_ram_scales_with_dataset_size() -> None:
     meta = ModelMeta(
@@ -497,13 +482,13 @@ def test_driver_records_current_and_max_batch(self) -> None:
         report = run_preflight(
             self._bert_cfg("microsoft/deberta-v3-large", batch_size=64),
             DatasetStats.placeholder(),
-            _profile(vram_gb=6.5),
+            _profile(vram_gb=7.5),
         )
         drivers = [d for d in report.resource.drivers if d["module"] == "bert"]
         assert drivers
         d = drivers[0]
         assert d["batch_size"] == 64
-        # vram_gb=6.5 against ~5 GB weights x 0.9 tight ratio -> little activation room, max < 64.
+        # vram_gb=7.5 against ~5.9 GB weights x 0.9 tight ratio -> little activation room, max < 64.
         assert d["max_batch_size"] is not None
         assert 0 < d["max_batch_size"] < 64