From 47f06678366d9cb25a377964c0e6fd57e407770c Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Mar 2026 16:37:19 +0000 Subject: [PATCH 1/2] Add adversarial weight regularisation pipeline Introduces a diagnostics package that detects high-influence survey records, generates synthetic offspring via TVAE, and recalibrates with entropy regularisation and weight capping to reduce output noise in population subgroup statistics. Components: - influence.py: reporting surface definition, per-record influence computation, Kish effective sample size, random reform sampling - generative_model.py: TVAE training on FRS input attributes, conditional sampling with varied conditioning fractions - offspring.py: adversarial detect-spawn-recalibrate loop - recalibrate.py: entropy-regularised weight optimisation with optional hard weight cap and zero-weight pruning - __main__.py: CLI with diagnose/train/regularise commands --- .../adversarial-weight-regularisation.added | 1 + policyengine_uk_data/diagnostics/__init__.py | 38 ++ policyengine_uk_data/diagnostics/__main__.py | 270 +++++++++++ .../diagnostics/generative_model.py | 386 +++++++++++++++ policyengine_uk_data/diagnostics/influence.py | 447 ++++++++++++++++++ policyengine_uk_data/diagnostics/offspring.py | 329 +++++++++++++ .../diagnostics/recalibrate.py | 165 +++++++ pyproject.toml | 2 + uv.lock | 367 +++++++++++++- 9 files changed, 1980 insertions(+), 25 deletions(-) create mode 100644 changelog.d/adversarial-weight-regularisation.added create mode 100644 policyengine_uk_data/diagnostics/__init__.py create mode 100644 policyengine_uk_data/diagnostics/__main__.py create mode 100644 policyengine_uk_data/diagnostics/generative_model.py create mode 100644 policyengine_uk_data/diagnostics/influence.py create mode 100644 policyengine_uk_data/diagnostics/offspring.py create mode 100644 policyengine_uk_data/diagnostics/recalibrate.py diff --git a/changelog.d/adversarial-weight-regularisation.added 
b/changelog.d/adversarial-weight-regularisation.added new file mode 100644 index 00000000..68088959 --- /dev/null +++ b/changelog.d/adversarial-weight-regularisation.added @@ -0,0 +1 @@ +Added adversarial weight regularisation pipeline: detects high-influence survey records, generates synthetic offspring via TVAE, and recalibrates with entropy regularisation and weight capping to reduce output noise in population subgroup statistics. diff --git a/policyengine_uk_data/diagnostics/__init__.py b/policyengine_uk_data/diagnostics/__init__.py new file mode 100644 index 00000000..06e85fa8 --- /dev/null +++ b/policyengine_uk_data/diagnostics/__init__.py @@ -0,0 +1,38 @@ +"""Adversarial weight regularisation for PolicyEngine UK. + +Detects high-influence survey records, generates synthetic offspring +to diffuse their weight, and recalibrates to population targets with +entropy regularisation. +""" + +from policyengine_uk_data.diagnostics.influence import ( + compute_influence_matrix, + find_high_influence_records, + compute_kish_effective_sample_size, + run_diagnostics, +) +from policyengine_uk_data.diagnostics.generative_model import ( + train_generative_model, + extract_household_features, + validate_generative_model, +) +from policyengine_uk_data.diagnostics.offspring import ( + run_adversarial_loop, +) +from policyengine_uk_data.diagnostics.recalibrate import ( + recalibrate_with_regularisation, + prune_zero_weight_records, +) + +__all__ = [ + "compute_influence_matrix", + "find_high_influence_records", + "compute_kish_effective_sample_size", + "run_diagnostics", + "train_generative_model", + "extract_household_features", + "validate_generative_model", + "run_adversarial_loop", + "recalibrate_with_regularisation", + "prune_zero_weight_records", +] diff --git a/policyengine_uk_data/diagnostics/__main__.py b/policyengine_uk_data/diagnostics/__main__.py new file mode 100644 index 00000000..ed895719 --- /dev/null +++ b/policyengine_uk_data/diagnostics/__main__.py @@ -0,0 
+1,270 @@ +"""CLI entry point for adversarial weight regularisation. + +Usage: + uv run python -m policyengine_uk_data.diagnostics [command] [options] + +Commands: + diagnose Run Phase 1 influence diagnostics (read-only) + train Train the generative model on FRS attributes + regularise Run the full adversarial loop (detect + spawn + recalibrate) +""" + +import argparse +import json +import logging +import sys + +import numpy as np + + +def cmd_diagnose(args): + """Run influence diagnostics on a dataset.""" + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.influence import run_diagnostics + + dataset = UKSingleYearDataset(file_path=args.dataset) + results = run_diagnostics( + dataset, + time_period=args.year, + n_reforms=args.n_reforms, + threshold=args.threshold, + seed=args.seed, + ) + + print("\n=== Weight distribution ===") + for k, v in results["weight_stats"].items(): + print(f" {k}: {v:,.1f}") + + print(f"\n=== Flagged records (threshold={args.threshold}) ===") + flagged = results["flagged_records"] + if flagged.empty: + print(" No records exceed the influence threshold.") + else: + print(f" {len(flagged)} records flagged") + print(flagged.head(20).to_string(index=False)) + + print("\n=== Kish effective sample size (top 20 worst) ===") + kish = results["kish_by_slice"] + sorted_kish = sorted(kish.items(), key=lambda x: x[1]) + for name, val in sorted_kish[:20]: + print(f" {name}: {val:,.0f}") + + if args.output: + # Save full results to JSON + serialisable = { + "weight_stats": results["weight_stats"], + "kish_by_slice": {k: float(v) for k, v in kish.items()}, + "flagged_records": flagged.to_dict(orient="records"), + } + with open(args.output, "w") as f: + json.dump(serialisable, f, indent=2) + print(f"\nFull results saved to {args.output}") + + +def cmd_train(args): + """Train the generative model.""" + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.generative_model 
import ( + train_generative_model, + extract_household_features, + validate_generative_model, + ) + import pickle + + dataset = UKSingleYearDataset(file_path=args.dataset) + model = train_generative_model( + dataset, + epochs=args.epochs, + seed=args.seed, + ) + + # Validate + features = extract_household_features(dataset) + validation = validate_generative_model(model, features) + + print("\n=== Generative model validation ===") + print("Marginal KS statistics (lower is better):") + for col, ks in sorted(validation["marginal_ks"].items(), key=lambda x: -x[1])[:10]: + print(f" {col}: {ks:.3f}") + + if validation["correlation_diff"] is not None: + print(f"Max correlation difference: {validation['correlation_diff']:.3f}") + + # Save model + with open(args.output, "wb") as f: + pickle.dump(model, f) + print(f"\nModel saved to {args.output}") + + +def cmd_regularise(args): + """Run the full adversarial weight regularisation pipeline.""" + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.offspring import ( + run_adversarial_loop, + ) + from policyengine_uk_data.diagnostics.recalibrate import ( + prune_zero_weight_records, + ) + import pickle + + dataset = UKSingleYearDataset(file_path=args.dataset) + + # Load or train generative model + if args.model: + with open(args.model, "rb") as f: + model = pickle.load(f) + else: + from policyengine_uk_data.diagnostics.generative_model import ( + train_generative_model, + ) + + print("No model provided, training generative model...") + model = train_generative_model(dataset, epochs=args.train_epochs) + + result = run_adversarial_loop( + dataset, + model, + time_period=args.year, + threshold=args.threshold, + max_rounds=args.max_rounds, + n_offspring=args.n_offspring, + weight_target=args.weight_target, + seed=args.seed, + ) + + expanded = result["expanded_dataset"] + + # Prune zero-weight records + pruned = prune_zero_weight_records(expanded, epsilon=1.0) + + # Save + 
pruned.save(args.output) + + weights = pruned.household.household_weight.values + print("\n=== Results ===") + print(f" Rounds completed: {result['rounds_completed']}") + print(f" Records added: {result['records_expanded']}") + print(f" Final dataset size: {len(pruned.household)} households") + print(f" Max weight: {weights.max():,.0f}") + print(f" Median weight: {np.median(weights):,.0f}") + print(f" Influence history: {[f'{x:.3f}' for x in result['influence_history']]}") + print(f"\nExpanded dataset saved to {args.output}") + + +def main(): + parser = argparse.ArgumentParser( + description="Adversarial weight regularisation for PolicyEngine UK", + ) + subparsers = parser.add_subparsers(dest="command") + + # diagnose + diag = subparsers.add_parser( + "diagnose", + help="Run influence diagnostics", + ) + diag.add_argument("dataset", help="Path to .h5 dataset") + diag.add_argument("--year", default="2025", help="Time period") + diag.add_argument( + "--n-reforms", + type=int, + default=50, + help="Number of random reforms for influence sampling", + ) + diag.add_argument( + "--threshold", + type=float, + default=0.05, + help="Influence threshold", + ) + diag.add_argument("--seed", type=int, default=42) + diag.add_argument("--output", "-o", help="Output JSON file for full results") + + # train + tr = subparsers.add_parser( + "train", + help="Train generative model", + ) + tr.add_argument("dataset", help="Path to .h5 dataset") + tr.add_argument( + "--epochs", + type=int, + default=300, + help="TVAE training epochs", + ) + tr.add_argument("--seed", type=int, default=42) + tr.add_argument( + "--output", + "-o", + default="generative_model.pkl", + help="Output pickle file", + ) + + # regularise + reg = subparsers.add_parser( + "regularise", + help="Run full adversarial loop", + ) + reg.add_argument("dataset", help="Path to .h5 dataset") + reg.add_argument( + "--model", + help="Path to trained generative model (.pkl)", + ) + reg.add_argument("--year", default="2025", 
help="Time period") + reg.add_argument( + "--threshold", + type=float, + default=0.05, + help="Influence threshold", + ) + reg.add_argument( + "--max-rounds", + type=int, + default=10, + help="Max adversarial rounds", + ) + reg.add_argument( + "--n-offspring", + type=int, + default=50, + help="Offspring per flagged record", + ) + reg.add_argument( + "--weight-target", + type=float, + default=None, + help="Target max weight for offspring splitting", + ) + reg.add_argument( + "--train-epochs", + type=int, + default=300, + help="TVAE epochs if training from scratch", + ) + reg.add_argument("--seed", type=int, default=42) + reg.add_argument( + "--output", + "-o", + default="regularised_dataset.h5", + help="Output .h5 file", + ) + + args = parser.parse_args() + if args.command is None: + parser.print_help() + sys.exit(1) + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + ) + + commands = { + "diagnose": cmd_diagnose, + "train": cmd_train, + "regularise": cmd_regularise, + } + commands[args.command](args) + + +if __name__ == "__main__": + main() diff --git a/policyengine_uk_data/diagnostics/generative_model.py b/policyengine_uk_data/diagnostics/generative_model.py new file mode 100644 index 00000000..2d70f11b --- /dev/null +++ b/policyengine_uk_data/diagnostics/generative_model.py @@ -0,0 +1,386 @@ +"""Generative model for household attributes. + +Trains a TVAE (Tabular Variational Autoencoder) on FRS input +attributes and provides conditional sampling for offspring +generation. The model learns the joint distribution of household +demographics, income, housing, and geographic variables so that +synthetic records are plausible completions of partial attribute +sets. + +Only *input* attributes are modelled. Tax-benefit *outputs* (tax +liability, benefit entitlement, net income) are recomputed by +running offspring through PolicyEngine's calculator. 
+""" + +import logging + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +# Input attributes to model. These are the FRS variables that +# define a household before PolicyEngine calculates anything. +PERSON_INPUT_ATTRS = [ + "age", + "gender", + "employment_income", + "self_employment_income", + "private_pension_income", + "savings_interest_income", + "dividend_income", + "property_income", + "hours_worked", + "employment_status", + "is_disabled_for_benefits", + "marital_status", +] + +HOUSEHOLD_INPUT_ATTRS = [ + "region", + "tenure_type", + "rent", + "council_tax", + "council_tax_band", + "accommodation_type", + "household_weight", +] + +BENUNIT_INPUT_ATTRS = [ + "would_claim_uc", + "is_married", +] + + +def extract_household_features( + dataset, + use_design_weights: bool = True, +) -> pd.DataFrame: + """Extract a flat feature table from a UKSingleYearDataset. + + Each row is one household. Person-level variables are aggregated + to household level (head's values for demographics, sums for + incomes). + + Args: + dataset: UKSingleYearDataset instance. + use_design_weights: if True, use the original grossing + weights (not calibrated) for training the generative + model. + + Returns: + DataFrame with one row per household. 
+ """ + person = dataset.person + household = dataset.household + benunit = dataset.benunit + + hh_ids = household.household_id.values + features = pd.DataFrame({"household_id": hh_ids}) + + # Household-level attributes + for attr in HOUSEHOLD_INPUT_ATTRS: + if attr in household.columns: + features[attr] = household[attr].values + + # Person-level: head's demographics + income sums + head_mask = person.is_household_head.astype(bool) + heads = person[head_mask].set_index("person_household_id") + + for attr in ["age", "gender", "employment_status", "marital_status"]: + if attr in heads.columns: + features[f"head_{attr}"] = heads[attr].reindex(hh_ids).values + + # Income sums across all persons in household + income_attrs = [ + "employment_income", + "self_employment_income", + "private_pension_income", + "savings_interest_income", + "dividend_income", + "property_income", + ] + for attr in income_attrs: + if attr in person.columns: + summed = ( + person.groupby("person_household_id")[attr] + .sum() + .reindex(hh_ids) + .fillna(0) + ) + features[f"hh_{attr}"] = summed.values + + # Household size + features["n_persons"] = ( + person.groupby("person_household_id") + .size() + .reindex(hh_ids) + .fillna(1) + .astype(int) + .values + ) + + # Number of children (age < 18) + if "age" in person.columns: + features["n_children"] = ( + person[person.age < 18] + .groupby("person_household_id") + .size() + .reindex(hh_ids) + .fillna(0) + .astype(int) + .values + ) + + # Hours worked (head) + if "hours_worked" in heads.columns: + features["head_hours_worked"] = ( + heads["hours_worked"].reindex(hh_ids).fillna(0).values + ) + + # Disability flag (any person in household) + if "is_disabled_for_benefits" in person.columns: + features["has_disabled_member"] = ( + person.groupby("person_household_id")["is_disabled_for_benefits"] + .max() + .reindex(hh_ids) + .fillna(0) + .astype(int) + .values + ) + + # Benunit: UC claim status (any benunit in household) + if "would_claim_uc" in 
benunit.columns: + person_bu = person[["person_household_id", "person_benunit_id"]] + bu_hh = person_bu.drop_duplicates("person_benunit_id").set_index( + "person_benunit_id" + )["person_household_id"] + uc_by_hh = benunit.set_index("benunit_id")["would_claim_uc"].reindex( + bu_hh.index + ) + uc_by_hh.index = bu_hh.values + features["any_uc_claim"] = ( + uc_by_hh.groupby(level=0).max().reindex(hh_ids).fillna(0).astype(int).values + ) + + return features + + +def identify_column_types( + df: pd.DataFrame, +) -> tuple[list[str], list[str]]: + """Split columns into categorical and continuous. + + Returns: + (categorical_columns, continuous_columns) + """ + categorical = [] + continuous = [] + skip = {"household_id", "household_weight"} + + for col in df.columns: + if col in skip: + continue + if df[col].dtype == object or df[col].nunique() < 20: + categorical.append(col) + else: + continuous.append(col) + + return categorical, continuous + + +def train_generative_model( + dataset, + epochs: int = 300, + seed: int = 42, +): + """Train a TVAE on household features. + + Args: + dataset: UKSingleYearDataset instance. + epochs: training epochs for the TVAE. + seed: random seed. + + Returns: + Trained TVAE model (sdv SingleTableSynthesizer). 
+ """ + from sdv.single_table import TVAESynthesizer + from sdv.metadata import Metadata + + features = extract_household_features(dataset) + categorical_cols, continuous_cols = identify_column_types(features) + + # Drop household_id for training + train_df = features.drop(columns=["household_id"]) + if "household_weight" in train_df.columns: + sample_weights = train_df["household_weight"].values.copy() + train_df = train_df.drop(columns=["household_weight"]) + else: + sample_weights = None + + # Build metadata + metadata = Metadata.detect_from_dataframe(data=train_df) + + model = TVAESynthesizer( + metadata=metadata, + epochs=epochs, + verbose=True, + ) + + # Weight the training data by design weights if available + if sample_weights is not None: + # Resample proportional to weights for training + rng = np.random.default_rng(seed) + probs = sample_weights / sample_weights.sum() + n_train = len(train_df) + indices = rng.choice(len(train_df), size=n_train, replace=True, p=probs) + train_df = train_df.iloc[indices].reset_index(drop=True) + + model.fit(train_df) + logger.info("TVAE trained on %d records", len(train_df)) + + return model + + +def sample_offspring( + model, + source_record: pd.Series, + n_samples: int = 50, + conditioning_fractions: list[float] | None = None, + seed: int = 42, +) -> pd.DataFrame: + """Generate synthetic offspring conditioned on a source record. + + For each sample, a random subset of the source record's + attributes are fixed, and the rest are sampled from the model. + The conditioning fraction varies across samples to explore both + close variants and broader alternatives. + + Args: + model: trained TVAE model. + source_record: Series of attribute values for the source + household. + n_samples: number of offspring to generate. + conditioning_fractions: list of fractions of attributes to + condition on. Defaults to a spread from 0.2 to 0.8. + seed: random seed. + + Returns: + DataFrame of synthetic offspring (n_samples rows). 
+ """ + rng = np.random.default_rng(seed) + + if conditioning_fractions is None: + conditioning_fractions = [0.2, 0.4, 0.5, 0.6, 0.8] + + all_cols = list(source_record.index) + skip_cols = {"household_id", "household_weight"} + usable_cols = [c for c in all_cols if c not in skip_cols] + + offspring = [] + samples_per_fraction = max(1, n_samples // len(conditioning_fractions)) + + for frac in conditioning_fractions: + n_cond = max(1, int(len(usable_cols) * frac)) + for _ in range(samples_per_fraction): + cond_cols = rng.choice(usable_cols, size=n_cond, replace=False).tolist() + conditions = {col: source_record[col] for col in cond_cols} + try: + from sdv.sampling import Condition + + condition = Condition( + num_rows=1, + column_values=conditions, + ) + sample = model.sample_from_conditions(conditions=[condition]) + offspring.append(sample) + except Exception: + # Fall back to unconditional sampling and manually + # override conditioned columns + sample = model.sample(num_rows=1) + for col, val in conditions.items(): + if col in sample.columns: + sample[col] = val + offspring.append(sample) + + if not offspring: + return pd.DataFrame() + + result = pd.concat(offspring, ignore_index=True) + + # Top up if we're short + if len(result) < n_samples: + extra = model.sample(num_rows=n_samples - len(result)) + result = pd.concat([result, extra], ignore_index=True) + + return result.head(n_samples) + + +def validate_generative_model( + model, + original_features: pd.DataFrame, + n_samples: int = 10_000, +) -> dict: + """Compare synthetic samples against original data. + + Args: + model: trained TVAE model. + original_features: the training data. + n_samples: number of synthetic samples to generate. 
+ + Returns: + Dict with validation metrics: + - marginal_ks: Kolmogorov-Smirnov stats for continuous cols + - categorical_tvd: total variation distance for cat cols + - correlation_diff: max absolute difference in correlation + matrix + """ + from scipy import stats + + synthetic = model.sample(num_rows=n_samples) + orig = original_features.drop( + columns=["household_id", "household_weight"], + errors="ignore", + ) + + categorical_cols, continuous_cols = identify_column_types(orig) + + # KS test for continuous columns + ks_stats = {} + for col in continuous_cols: + if col in synthetic.columns and col in orig.columns: + stat, _ = stats.ks_2samp( + orig[col].dropna().values, + synthetic[col].dropna().values, + ) + ks_stats[col] = float(stat) + + # Total variation distance for categorical columns + tvd = {} + for col in categorical_cols: + if col in synthetic.columns and col in orig.columns: + orig_dist = orig[col].value_counts(normalize=True) + synth_dist = synthetic[col].value_counts(normalize=True) + all_vals = set(orig_dist.index) | set(synth_dist.index) + tv = ( + sum(abs(orig_dist.get(v, 0) - synth_dist.get(v, 0)) for v in all_vals) + / 2 + ) + tvd[col] = float(tv) + + # Correlation matrix difference (continuous only) + shared_cont = [ + c for c in continuous_cols if c in synthetic.columns and c in orig.columns + ] + if len(shared_cont) >= 2: + orig_corr = orig[shared_cont].corr().values + synth_corr = synthetic[shared_cont].corr().values + corr_diff = float(np.nanmax(np.abs(orig_corr - synth_corr))) + else: + corr_diff = None + + return { + "marginal_ks": ks_stats, + "categorical_tvd": tvd, + "correlation_diff": corr_diff, + } diff --git a/policyengine_uk_data/diagnostics/influence.py b/policyengine_uk_data/diagnostics/influence.py new file mode 100644 index 00000000..e2672820 --- /dev/null +++ b/policyengine_uk_data/diagnostics/influence.py @@ -0,0 +1,447 @@ +"""Influence detector for survey record weights. 
+ +Computes per-record influence across a reporting surface of +(metric x slice) statistics. A record has high influence when it +contributes a large fraction of a slice-level aggregate, meaning +small perturbations to that record propagate into published outputs. + +The reporting surface is built from: + - metrics: net income, income tax, NI, universal credit, child + benefit, pension credit, council tax, housing benefit + - slices: income decile, region, age band, family type, tenure + +Influence is computed under a sample of policy reforms (random +parameter perturbations) so that structurally high-influence records +are identified regardless of which reform is being analysed. +""" + +import logging + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +# ── Reporting surface definition ──────────────────────────────────── + +METRICS = [ + "household_net_income", + "income_tax", + "national_insurance", + "universal_credit", + "child_benefit", + "pension_credit", + "council_tax", + "housing_benefit_reported", + "employment_income", + "self_employment_income", +] + +SLICE_DEFINITIONS = { + "income_decile": { + "variable": "household_net_income", + "bins": 10, + "labels": [f"decile_{i}" for i in range(1, 11)], + }, + "region": { + "variable": "region", + "categorical": True, + }, + "age_band": { + "variable": "age", + "bins": [0, 16, 25, 35, 45, 55, 65, 75, 100], + "labels": [ + "0-15", + "16-24", + "25-34", + "35-44", + "45-54", + "55-64", + "65-74", + "75+", + ], + }, + "tenure": { + "variable": "tenure_type", + "categorical": True, + }, +} + + +def _build_slice_assignments( + sim, + time_period: str, +) -> dict[str, np.ndarray]: + """Compute household-level slice assignments. + + Returns a dict mapping slice_name -> array of labels, one per + household. 
+ """ + slices = {} + + for name, defn in SLICE_DEFINITIONS.items(): + variable = defn["variable"] + + if defn.get("categorical"): + values = sim.calculate(variable, map_to="household") + slices[name] = np.asarray(values) + continue + + values = sim.calculate(variable, map_to="household").astype(float) + weights = sim.calculate("household_weight", map_to="household").astype(float) + + if "bins" in defn and isinstance(defn["bins"], int): + # Weighted quantile bins + n_bins = defn["bins"] + sorted_idx = np.argsort(values) + cum_weight = np.cumsum(weights[sorted_idx]) + total_weight = cum_weight[-1] + labels = np.empty(len(values), dtype=object) + for b in range(n_bins): + lo = b / n_bins * total_weight + hi = (b + 1) / n_bins * total_weight + mask_sorted = (cum_weight > lo) & (cum_weight <= hi) + if b == 0: + mask_sorted[0] = True + labels[sorted_idx[mask_sorted]] = defn["labels"][b] + slices[name] = labels + else: + bins = defn["bins"] + label_list = defn["labels"] + digitised = np.digitize(values, bins) - 1 + digitised = np.clip(digitised, 0, len(label_list) - 1) + slices[name] = np.array([label_list[d] for d in digitised]) + + return slices + + +def _compute_metric_values( + sim, + time_period: str, +) -> dict[str, np.ndarray]: + """Compute household-level metric values. + + Returns a dict mapping metric_name -> array of values, one per + household. + """ + result = {} + for metric in METRICS: + try: + result[metric] = np.asarray( + sim.calculate(metric, map_to="household"), + dtype=float, + ) + except Exception: + logger.debug("Metric %s not available, skipping", metric) + return result + + +def compute_influence_matrix( + sim, + time_period: str, + reform_sim=None, +) -> pd.DataFrame: + """Compute per-record influence across the reporting surface. + + Args: + sim: a policyengine_uk Microsimulation (baseline). + time_period: the period string (e.g. "2025"). + reform_sim: optional reform Microsimulation. 
When provided + the metric is the *change* between baseline and reform. + + Returns: + DataFrame with shape (n_households, n_statistics) where each + cell I[i,s] is the fractional influence of household i on + statistic s. + """ + weights = np.asarray( + sim.calculate("household_weight", map_to="household"), + dtype=float, + ) + slices = _build_slice_assignments(sim, time_period) + + if reform_sim is not None: + baseline_vals = _compute_metric_values(sim, time_period) + reform_vals = _compute_metric_values(reform_sim, time_period) + metric_values = { + m: reform_vals[m] - baseline_vals[m] + for m in baseline_vals + if m in reform_vals + } + else: + metric_values = _compute_metric_values(sim, time_period) + + records = [] + stat_names = [] + + for metric_name, values in metric_values.items(): + for slice_name, labels in slices.items(): + unique_labels = np.unique(labels) + for label in unique_labels: + if label is None or (isinstance(label, float) and np.isnan(label)): + continue + mask = labels == label + weighted_total = np.sum(weights[mask] * values[mask]) + denom = max(abs(weighted_total), 1e-10) + influence = np.abs(weights * values * mask) / denom + records.append(influence) + stat_names.append(f"{metric_name}/{slice_name}={label}") + + if not records: + return pd.DataFrame() + + matrix = np.column_stack(records) + return pd.DataFrame(matrix, columns=stat_names) + + +def find_high_influence_records( + influence_matrix: pd.DataFrame, + threshold: float = 0.05, +) -> pd.DataFrame: + """Identify records exceeding the influence threshold. + + Args: + influence_matrix: output of compute_influence_matrix. + threshold: max allowable influence fraction (default 5%). 
+ + Returns: + DataFrame with columns: + - record_idx: household index + - max_influence: maximum influence across all statistics + - worst_statistic: the statistic where influence is highest + - n_violations: number of statistics exceeding threshold + """ + if influence_matrix.empty: + return pd.DataFrame( + columns=[ + "record_idx", + "max_influence", + "worst_statistic", + "n_violations", + ] + ) + + max_influence = influence_matrix.max(axis=1) + worst_stat_idx = influence_matrix.values.argmax(axis=1) + worst_stat = influence_matrix.columns[worst_stat_idx] + n_violations = (influence_matrix > threshold).sum(axis=1) + + flagged_mask = max_influence > threshold + result = pd.DataFrame( + { + "record_idx": np.where(flagged_mask)[0], + "max_influence": max_influence[flagged_mask].values, + "worst_statistic": worst_stat[flagged_mask], + "n_violations": n_violations[flagged_mask].values, + } + ) + return result.sort_values("max_influence", ascending=False).reset_index(drop=True) + + +def compute_kish_effective_sample_size( + weights: np.ndarray, + slice_mask: np.ndarray | None = None, +) -> float: + """Compute Kish's effective sample size. + + n_eff = (sum w_i)^2 / sum(w_i^2) + + Args: + weights: array of household weights. + slice_mask: optional boolean mask to restrict to a subgroup. + + Returns: + Effective sample size. + """ + if slice_mask is not None: + w = weights[slice_mask] + else: + w = weights + w = w[w > 0] + if len(w) == 0: + return 0.0 + return float(np.sum(w) ** 2 / np.sum(w**2)) + + +def generate_random_reforms( + n_reforms: int = 50, + seed: int = 42, +) -> list[dict]: + """Generate random parameter perturbations for influence sampling. + + Each reform is a dict of parameter_path -> multiplier pairs. + The reforms perturb tax rates and benefit amounts by +-20%. + + Args: + n_reforms: number of reforms to generate. + seed: random seed. + + Returns: + List of reform specification dicts. 
+ """ + rng = np.random.default_rng(seed) + + # Parameters amenable to perturbation + rate_params = [ + "gov.hmrc.income_tax.rates.uk[0].rate", + "gov.hmrc.income_tax.rates.uk[1].rate", + "gov.hmrc.income_tax.rates.uk[2].rate", + "gov.hmrc.national_insurance.class_1.rates.employee.main.rate", + ] + amount_params = [ + "gov.hmrc.income_tax.allowances.personal_allowance.amount", + "gov.dwp.universal_credit.elements.standard_allowance.amount.single.over_25", + "gov.dwp.universal_credit.elements.child.amount.first", + ] + + reforms = [] + for _ in range(n_reforms): + reform = {} + # Perturb 2-4 parameters per reform + n_params = rng.integers(2, 5) + all_params = rate_params + amount_params + chosen = rng.choice( + len(all_params), + size=min(n_params, len(all_params)), + replace=False, + ) + for idx in chosen: + param = all_params[idx] + if param in rate_params: + # Rates: multiply by 0.8-1.2 + reform[param] = float(rng.uniform(0.8, 1.2)) + else: + # Amounts: multiply by 0.8-1.2 + reform[param] = float(rng.uniform(0.8, 1.2)) + reforms.append(reform) + + return reforms + + +def run_diagnostics( + dataset, + time_period: str = "2025", + n_reforms: int = 50, + threshold: float = 0.05, + seed: int = 42, +) -> dict: + """Run the full Phase 1 influence diagnostics. + + Args: + dataset: a UKSingleYearDataset. + time_period: calendar year as string. + n_reforms: number of random reforms for influence sampling. + threshold: max allowable influence fraction. + seed: random seed. 
+ + Returns: + Dict with keys: + - baseline_influence: DataFrame of influence matrix under + current law + - flagged_records: DataFrame of high-influence records + - weight_stats: dict of weight distribution statistics + - kish_by_slice: dict of Kish effective sample sizes + - reform_influence_summary: DataFrame summarising influence + across reforms + """ + from policyengine_uk import Microsimulation + + sim = Microsimulation(dataset=dataset) + sim.default_calculation_period = time_period + + weights = np.asarray( + sim.calculate("household_weight", map_to="household"), + dtype=float, + ) + + # Weight distribution statistics + weight_stats = { + "n_households": len(weights), + "mean": float(np.mean(weights)), + "median": float(np.median(weights)), + "p90": float(np.percentile(weights, 90)), + "p99": float(np.percentile(weights, 99)), + "max": float(np.max(weights)), + "min": float(np.min(weights[weights > 0])), + "skewness": float( + np.mean(((weights - np.mean(weights)) / np.std(weights)) ** 3) + ), + } + + # Baseline influence + logger.info("Computing baseline influence matrix...") + baseline_influence = compute_influence_matrix(sim, time_period) + flagged = find_high_influence_records(baseline_influence, threshold) + + # Kish effective sample size by slice + slices = _build_slice_assignments(sim, time_period) + kish_by_slice = {"overall": compute_kish_effective_sample_size(weights)} + for slice_name, labels in slices.items(): + for label in np.unique(labels): + if label is None: + continue + mask = labels == label + kish_by_slice[f"{slice_name}={label}"] = compute_kish_effective_sample_size( + weights, mask + ) + + # Reform-level influence sampling + reforms = generate_random_reforms(n_reforms, seed) + reform_max_influences = [] + + for i, reform_spec in enumerate(reforms): + logger.info( + "Computing influence for reform %d/%d...", + i + 1, + len(reforms), + ) + try: + reform_sim = _create_reform_sim(dataset, time_period, reform_spec) + infl = 
compute_influence_matrix(sim, time_period, reform_sim=reform_sim) + if not infl.empty: + max_per_record = infl.max(axis=1) + reform_max_influences.append(max_per_record) + except Exception as e: + logger.warning("Reform %d failed: %s", i, e) + + if reform_max_influences: + reform_matrix = pd.concat(reform_max_influences, axis=1).fillna(0) + reform_summary = pd.DataFrame( + { + "mean_max_influence": reform_matrix.mean(axis=1), + "max_max_influence": reform_matrix.max(axis=1), + "n_reforms_above_threshold": (reform_matrix > threshold).sum(axis=1), + } + ) + else: + reform_summary = pd.DataFrame() + + return { + "baseline_influence": baseline_influence, + "flagged_records": flagged, + "weight_stats": weight_stats, + "kish_by_slice": kish_by_slice, + "reform_influence_summary": reform_summary, + } + + +def _create_reform_sim(dataset, time_period, reform_spec): + """Create a Microsimulation with parameter perturbations applied.""" + from policyengine_uk import Microsimulation + + sim = Microsimulation(dataset=dataset) + sim.default_calculation_period = time_period + + for param_path, multiplier in reform_spec.items(): + try: + param = sim.tax_benefit_system.parameters.get_child(param_path) + current = param(time_period) + param.update( + period=f"year:{time_period}:1", + value=current * multiplier, + ) + except Exception: + pass + + sim.tax_benefit_system.reset_parameter_caches() + return sim diff --git a/policyengine_uk_data/diagnostics/offspring.py b/policyengine_uk_data/diagnostics/offspring.py new file mode 100644 index 00000000..3d6df43f --- /dev/null +++ b/policyengine_uk_data/diagnostics/offspring.py @@ -0,0 +1,329 @@ +"""Adversarial offspring generation. + +For each high-influence household record, generates synthetic +offspring via the generative model, runs them through PolicyEngine +to compute tax-benefit outputs, and assembles an expanded dataset +ready for recalibration. 
+""" + +import logging + +import numpy as np +import pandas as pd + +from policyengine_uk_data.diagnostics.influence import ( + compute_influence_matrix, + find_high_influence_records, +) +from policyengine_uk_data.diagnostics.generative_model import ( + extract_household_features, + sample_offspring, +) + +logger = logging.getLogger(__name__) + + +def _expand_household_to_dataset_records( + synthetic_hh: pd.Series, + source_dataset, + source_hh_idx: int, + new_hh_id_start: int, + weight_per_offspring: float, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Create person/benunit/household rows for one synthetic household. + + The synthetic household inherits the *structure* (number of + persons, benefit units, relationships) from the source household, + with attribute values replaced by the synthetic record where + applicable. + + Args: + synthetic_hh: Series of synthetic household-level features. + source_dataset: the original UKSingleYearDataset. + source_hh_idx: index of the source household in the dataset. + new_hh_id_start: starting household_id for the new record. + weight_per_offspring: weight to assign. + + Returns: + (person_df, benunit_df, household_df) for the new household. 
+ """ + orig_hh = source_dataset.household + orig_person = source_dataset.person + orig_benunit = source_dataset.benunit + + source_hh_id = orig_hh.household_id.iloc[source_hh_idx] + new_hh_id = new_hh_id_start + + # Copy the source household's structure + hh_row = orig_hh.iloc[[source_hh_idx]].copy() + hh_row["household_id"] = new_hh_id + hh_row["household_weight"] = weight_per_offspring + + # Override household-level attributes from the synthetic record + hh_attr_map = { + "region": "region", + "tenure_type": "tenure_type", + "rent": "rent", + "council_tax": "council_tax", + "council_tax_band": "council_tax_band", + "accommodation_type": "accommodation_type", + } + for synth_col, hh_col in hh_attr_map.items(): + if synth_col in synthetic_hh.index and hh_col in hh_row.columns: + hh_row[hh_col] = synthetic_hh[synth_col] + + # Copy persons, remapping IDs + person_mask = orig_person.person_household_id == source_hh_id + new_persons = orig_person[person_mask].copy() + new_persons["person_household_id"] = new_hh_id + + # Remap person IDs to avoid collisions + person_id_offset = new_hh_id * 1000 + new_persons["person_id"] = np.arange( + person_id_offset, + person_id_offset + len(new_persons), + ) + + # Override head's income attributes from the synthetic record + head_mask = new_persons.is_household_head.astype(bool) + income_map = { + "hh_employment_income": "employment_income", + "hh_self_employment_income": "self_employment_income", + "hh_private_pension_income": "private_pension_income", + "hh_savings_interest_income": "savings_interest_income", + "hh_dividend_income": "dividend_income", + "hh_property_income": "property_income", + } + for synth_col, person_col in income_map.items(): + if synth_col in synthetic_hh.index and person_col in new_persons.columns: + # Assign all household income to head (simplified) + new_persons.loc[head_mask, person_col] = max( + 0, float(synthetic_hh[synth_col]) + ) + + # Copy benefit units, remapping IDs + old_bu_ids = 
new_persons.person_benunit_id.unique() + bu_id_offset = new_hh_id * 100 + bu_id_map = {old: bu_id_offset + i for i, old in enumerate(old_bu_ids)} + new_persons["person_benunit_id"] = new_persons["person_benunit_id"].map(bu_id_map) + + bu_mask = orig_benunit.benunit_id.isin(old_bu_ids) + new_beunits = orig_benunit[bu_mask].copy() + new_beunits["benunit_id"] = new_beunits["benunit_id"].map(bu_id_map) + + return new_persons, new_beunits, hh_row + + +def generate_offspring_for_record( + dataset, + record_idx: int, + model, + features: pd.DataFrame, + n_offspring: int = 50, + weight_target: float | None = None, + seed: int = 42, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Generate synthetic offspring for a single household. + + Args: + dataset: UKSingleYearDataset. + record_idx: index of the household to split. + model: trained generative model (TVAE). + features: household features DataFrame (from + extract_household_features). + n_offspring: number of candidate offspring. + weight_target: desired max weight per offspring. If None, + uses p90 of the current weight distribution. + seed: random seed. + + Returns: + (person_df, benunit_df, household_df) for all offspring + combined. 
+ """ + weights = dataset.household.household_weight.values + source_weight = weights[record_idx] + + if weight_target is None: + weight_target = float(np.percentile(weights[weights > 0], 90)) + + k = max(2, int(np.ceil(source_weight / weight_target))) + n_candidates = max(n_offspring, k * 3) + + source_features = features.iloc[record_idx] + synthetic = sample_offspring( + model, + source_features, + n_samples=n_candidates, + seed=seed, + ) + + weight_per = source_weight / n_candidates + max_hh_id = dataset.household.household_id.max() + + all_persons = [] + all_beunits = [] + all_households = [] + + for i in range(len(synthetic)): + new_hh_id = int(max_hh_id + record_idx * 10_000 + i + 1) + try: + p, b, h = _expand_household_to_dataset_records( + synthetic.iloc[i], + dataset, + record_idx, + new_hh_id, + weight_per, + ) + all_persons.append(p) + all_beunits.append(b) + all_households.append(h) + except Exception as e: + logger.debug("Offspring %d failed: %s", i, e) + + if not all_persons: + return pd.DataFrame(), pd.DataFrame(), pd.DataFrame() + + return ( + pd.concat(all_persons, ignore_index=True), + pd.concat(all_beunits, ignore_index=True), + pd.concat(all_households, ignore_index=True), + ) + + +def run_adversarial_loop( + dataset, + model, + time_period: str = "2025", + threshold: float = 0.05, + max_rounds: int = 10, + n_offspring: int = 50, + weight_target: float | None = None, + seed: int = 42, +) -> dict: + """Run the full adversarial detect-spawn-recalibrate loop. + + Args: + dataset: UKSingleYearDataset to expand. + model: trained generative model (TVAE). + time_period: calendar year as string. + threshold: max allowable influence fraction. + max_rounds: maximum number of adversarial rounds. + n_offspring: offspring per flagged record. + weight_target: desired max weight. + seed: random seed. 
+ + Returns: + Dict with: + - expanded_dataset: the expanded UKSingleYearDataset + - rounds_completed: number of rounds run + - influence_history: list of max-influence per round + - records_expanded: total number of records added + """ + from policyengine_uk import Microsimulation + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.recalibrate import ( + recalibrate_with_regularisation, + ) + + working = dataset.copy() + features = extract_household_features(working) + influence_history = [] + total_added = 0 + + for round_num in range(max_rounds): + logger.info("Adversarial round %d/%d", round_num + 1, max_rounds) + + # Detect + sim = Microsimulation(dataset=working) + sim.default_calculation_period = time_period + infl = compute_influence_matrix(sim, time_period) + flagged = find_high_influence_records(infl, threshold) + + if flagged.empty: + logger.info( + "No records above threshold, stopping at round %d", + round_num + 1, + ) + break + + max_infl = flagged.max_influence.iloc[0] + influence_history.append(float(max_infl)) + logger.info( + "Round %d: %d flagged records, max influence %.3f", + round_num + 1, + len(flagged), + max_infl, + ) + + # Spawn offspring for the worst offender + worst_idx = int(flagged.record_idx.iloc[0]) + persons_new, beunits_new, hh_new = generate_offspring_for_record( + working, + worst_idx, + model, + features, + n_offspring=n_offspring, + weight_target=weight_target, + seed=seed + round_num, + ) + + if hh_new.empty: + logger.warning( + "No offspring generated for record %d, skipping", + worst_idx, + ) + continue + + # Remove source record and add offspring + orig_hh_id = working.household.household_id.iloc[worst_idx] + + new_person = pd.concat( + [ + working.person[working.person.person_household_id != orig_hh_id], + persons_new, + ], + ignore_index=True, + ) + new_benunit = pd.concat( + [ + working.benunit[ + ~working.benunit.benunit_id.isin( + working.person[ + 
working.person.person_household_id == orig_hh_id + ].person_benunit_id + ) + ], + beunits_new, + ], + ignore_index=True, + ) + new_household = pd.concat( + [ + working.household[working.household.household_id != orig_hh_id], + hh_new, + ], + ignore_index=True, + ) + + working = UKSingleYearDataset( + person=new_person, + benunit=new_benunit, + household=new_household, + fiscal_year=int(time_period), + ) + features = extract_household_features(working) + total_added += len(hh_new) + + # Recalibrate + logger.info("Recalibrating expanded dataset...") + working = recalibrate_with_regularisation( + working, + time_period=time_period, + ) + + return { + "expanded_dataset": working, + "rounds_completed": min(round_num + 1, max_rounds), + "influence_history": influence_history, + "records_expanded": total_added, + } diff --git a/policyengine_uk_data/diagnostics/recalibrate.py b/policyengine_uk_data/diagnostics/recalibrate.py new file mode 100644 index 00000000..20bf0c41 --- /dev/null +++ b/policyengine_uk_data/diagnostics/recalibrate.py @@ -0,0 +1,165 @@ +"""Recalibration with weight regularisation. + +Extends the existing calibration pipeline to add entropy +regularisation (penalising weight distributions that diverge from +a prior) and optional hard weight capping. This prevents the +calibration from concentrating weight on a few records, even when +the expanded dataset provides alternatives. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import numpy as np +import torch + +if TYPE_CHECKING: + from policyengine_uk.data import UKSingleYearDataset + +logger = logging.getLogger(__name__) + + +def recalibrate_with_regularisation( + dataset, + time_period: str = "2025", + entropy_lambda: float = 0.01, + weight_cap: float | None = 5_000.0, + epochs: int = 256, + lr: float = 0.05, +) -> UKSingleYearDataset: + """Recalibrate dataset weights with entropy regularisation. 
+
+    Minimises:
+        sum_t (hat_T_t(w) - T_t)^2 + lambda * sum_i w_i * log(w_i / w0_i)
+
+    where T_t are population targets, hat_T_t are weighted estimates,
+    w0_i are prior weights (uniform for offspring, design weights for
+    originals), and lambda controls regularisation strength.
+
+    Args:
+        dataset: UKSingleYearDataset to recalibrate.
+        time_period: calendar year as string.
+        entropy_lambda: entropy regularisation strength.
+        weight_cap: optional hard upper bound on any weight.
+        epochs: optimisation epochs.
+        lr: learning rate.
+
+    Returns:
+        Recalibrated UKSingleYearDataset (copy with updated weights).
+    """
+    from policyengine_uk_data.targets.build_loss_matrix import (
+        create_target_matrix,
+    )
+
+    dataset = dataset.copy()
+
+    matrix, targets = create_target_matrix(dataset, time_period=time_period)
+
+    if matrix.empty:
+        logger.warning("No targets available, returning unmodified dataset")
+        return dataset
+
+    initial_weights = dataset.household.household_weight.values.astype(float)
+    # Prior for the KL term: current weights floored at 1.0.  (Not
+    # normalised here -- both prior and candidate weights are turned
+    # into probability distributions inside the loss.)
+    w0 = np.maximum(initial_weights, 1.0)
+
+    # Optimise in log space so weights stay positive by construction.
+    log_w = torch.tensor(
+        np.log(np.maximum(initial_weights, 1e-6)),
+        dtype=torch.float32,
+        requires_grad=True,
+    )
+    M = torch.tensor(matrix.values, dtype=torch.float32)  # (n_households, n_targets)
+    T = torch.tensor(targets.values, dtype=torch.float32)  # (n_targets,)
+    w0_t = torch.tensor(w0, dtype=torch.float32)
+
+    optimizer = torch.optim.Adam([log_w], lr=lr)
+
+    def loss_fn():
+        w = torch.exp(log_w)
+
+        # Hard clamp at the cap (despite its position inside the loss,
+        # this is not a smooth cap).  NOTE(review): torch.clamp has zero
+        # gradient above weight_cap, so a weight pushed past the cap
+        # stops receiving target-loss gradient and can stall there --
+        # consider a smooth cap if capped weights fail to move.
+        if weight_cap is not None:
+            w = torch.clamp(w, max=weight_cap)
+
+        # Target matching: symmetric relative error -- takes the smaller
+        # of the two ratio errors, making the penalty symmetric under
+        # swapping prediction and target; the +1 shift keeps it finite
+        # for zero-valued targets.
+        pred = (w.unsqueeze(1) * M).sum(dim=0)
+        sre = torch.min(
+            ((1 + pred) / (1 + T) - 1) ** 2,
+            ((1 + T) / (1 + pred) - 1) ** 2,
+        )
+        target_loss = sre.mean()
+
+        # Entropy regularisation: KL divergence from prior
+        w_normed = w / w.sum()
+        w0_normed = w0_t / w0_t.sum()
+        # Avoid log(0) with small epsilon
+        kl = (w_normed * torch.log((w_normed + 1e-10) / (w0_normed + 1e-10))).sum()
+
+        return target_loss + entropy_lambda * kl
+
+    for epoch in range(epochs):
+        optimizer.zero_grad()
+        loss = loss_fn()
+        loss.backward()
+        optimizer.step()
+
+        if epoch % 50 == 0:
+            w_current = torch.exp(log_w).detach().numpy()
+            if weight_cap is not None:
+                w_current = np.clip(w_current, 0, weight_cap)
+            logger.info(
+                "Epoch %d: loss=%.6f, max_weight=%.0f, n_nonzero=%d",
+                epoch,
+                loss.item(),
+                w_current.max(),
+                (w_current > 1).sum(),
+            )
+
+    # Final weights: clip exactly as the loss did, so the stored weights
+    # match what the optimiser actually evaluated.
+    final_weights = torch.exp(log_w).detach().numpy()
+    if weight_cap is not None:
+        final_weights = np.clip(final_weights, 0, weight_cap)
+
+    dataset.household["household_weight"] = final_weights
+    return dataset
+
+
+def prune_zero_weight_records(
+    dataset,
+    epsilon: float = 1.0,
+) -> UKSingleYearDataset:
+    """Remove records with near-zero weight after recalibration.
+
+    Args:
+        dataset: UKSingleYearDataset with calibrated weights.
+        epsilon: weight threshold below which records are removed.
+
+    Returns:
+        Pruned UKSingleYearDataset.
+ """ + from policyengine_uk.data import UKSingleYearDataset + + keep_mask = dataset.household.household_weight > epsilon + keep_hh_ids = dataset.household.household_id[keep_mask].values + + person_keep = dataset.person.person_household_id.isin(keep_hh_ids) + keep_bu_ids = dataset.person[person_keep].person_benunit_id.unique() + benunit_keep = dataset.benunit.benunit_id.isin(keep_bu_ids) + + n_removed = (~keep_mask).sum() + logger.info( + "Pruned %d zero-weight records (%.1f%%), %d remain", + n_removed, + 100 * n_removed / len(keep_mask), + keep_mask.sum(), + ) + + return UKSingleYearDataset( + person=dataset.person[person_keep].reset_index(drop=True), + benunit=dataset.benunit[benunit_keep].reset_index(drop=True), + household=dataset.household[keep_mask].reset_index(drop=True), + fiscal_year=int(dataset.time_period), + ) diff --git a/pyproject.toml b/pyproject.toml index 7bad94c4..8f8c2a5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,8 @@ dependencies = [ "openpyxl", "pydantic>=2.0", "pyyaml", + "sdv>=1.0.0", + "scipy", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index fd7ec078..14d34e40 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,10 @@ version = 1 revision = 3 requires-python = ">=3.13" +resolution-markers = [ + "python_full_version >= '3.14'", + "python_full_version < '3.14'", +] [[package]] name = "accessible-pygments" @@ -107,7 +111,8 @@ dependencies = [ { name = "msgpack" }, { name = "ndindex" }, { name = "numexpr", marker = "platform_machine != 'wasm32'" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "platformdirs" }, { name = "py-cpuinfo", marker = "platform_machine != 'wasm32'" }, { name = "requests" }, @@ -131,6 +136,34 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/24/b5/05dd7a720d8cd5f523146a4e2ff5d051125b23c2395c5423a9b1c42a3889/blosc2-3.11.1-cp314-cp314t-win_amd64.whl", hash = "sha256:cd276ab00b9b6ea2810091879e4665150738b6d323e1f1970ccd62b58df7b9b6", size = 2355394, upload-time = "2025-11-16T16:02:11.061Z" }, ] +[[package]] +name = "boto3" +version = "1.42.69" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1f/f3/26d800e4efe85e7d59c63ac11d02ab2fafed371bede567af7258eb7e4c1c/boto3-1.42.69.tar.gz", hash = "sha256:e59846f4ff467b23bae4751948298db554dbdda0d72b09028d2cacbeff27e1ad", size = 112777, upload-time = "2026-03-16T20:35:30.77Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f3/39/54ad87436c637de9f7bf83ba2a28cf3b15409cbb849401837fcc37fbd794/boto3-1.42.69-py3-none-any.whl", hash = "sha256:6823a4b59aa578c7d98124280a9b6d83cea04bdb02525cbaa79370e5b6f7f631", size = 140556, upload-time = "2026-03-16T20:35:28.754Z" }, +] + +[[package]] +name = "botocore" +version = "1.42.69" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/d1/81a6e39c7d5419ba34bad8a1ac2c5360c26f21af698a481a8397d79134d1/botocore-1.42.69.tar.gz", hash = "sha256:0934f2d90403c5c8c2cba83e754a39d77edcad5885d04a79363edff3e814f55e", size = 14997632, upload-time = "2026-03-16T20:35:18.533Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/13/779f3427e17f9989fd0fa6651817c5f13b63e574f3541e460b8238883290/botocore-1.42.69-py3-none-any.whl", hash = "sha256:ef0e3d860a5d7bffc0ccb4911781c4c27d538557ed9a616ba1926c762d72e5f6", size = 14670334, upload-time = "2026-03-16T20:35:14.543Z" }, +] + [[package]] name = "build" version = "1.3.0" @@ -216,6 +249,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, ] +[[package]] +name = "cloudpickle" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = "sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -237,6 +279,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" }, ] +[[package]] +name = "copulas" +version = "0.14.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "plotly" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/a8/f15432ee5a691eafb7ecc5637c01843df92c0801aa83151c35d0fb190c92/copulas-0.14.1.tar.gz", hash = 
"sha256:adec8f65c98f16816bde5a03e9e7e7b3df91f3fb22ef9fd5023ebba8dd1628c8", size = 45007, upload-time = "2026-02-05T18:52:41.019Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/2c/7984ead5c59c7d3066d6c7b6a7839991a317ecfb3ab1f963900e14f3c339/copulas-0.14.1-py3-none-any.whl", hash = "sha256:6f010444385a304274e7587b45f56b860ca38b621529ad0f9bbd4024de91dd1d", size = 52663, upload-time = "2026-02-05T18:52:39.263Z" }, +] + +[[package]] +name = "ctgan" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "rdt" }, + { name = "torch" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/0e/50724b2e49ad83c2ebd00d8b57753dc07f256b27e64aca1306de5f4666b8/ctgan-0.12.1.tar.gz", hash = "sha256:e545c2b1a752affba3de2933a5f8037228e837f7a73f5593399b52cfe9611bc7", size = 27072, upload-time = "2026-02-13T03:22:40.909Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/c2/e4761d20a9704ba7595ea7d14dc4880c3cd0bd81b8ae588435536b7d8f19/ctgan-0.12.1-py3-none-any.whl", hash = "sha256:38a3b83432643caa8381c74c49e6a079166efa40f8f6c3b7204db44d6d2c8f18", size = 25490, upload-time = "2026-02-13T03:22:39.48Z" }, +] + [[package]] name = "datetime" version = "6.0" @@ -259,6 +334,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, ] +[[package]] +name = "deepecho" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + 
{ name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "torch" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/d7/68d071d98a2a921121f4e2f2a78ece38ce83dcdeb5dadad42d207b153e07/deepecho-0.8.1.tar.gz", hash = "sha256:7589d9b1be1a482a879caca7f674acf1195441de0c8ae020dd1c17a726472f86", size = 30733, upload-time = "2026-02-12T21:16:35.964Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/dd/43e447dbac86b38e7ac4afc38f24efc396b0bd380d172edf3aa2635e1364/deepecho-0.8.1-py3-none-any.whl", hash = "sha256:1706f85e479b8be5cedfbb14d9823eee5fddff9f3d13e73691241af7bd874e84", size = 28070, upload-time = "2026-02-12T21:16:34.582Z" }, +] + [[package]] name = "defusedxml" version = "0.7.1" @@ -304,6 +395,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, ] +[[package]] +name = "faker" +version = "40.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tzdata", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/94/dc/b68e5378e5a7db0ab776efcdd53b6fe374b29d703e156fd5bb4c5437069e/faker-40.11.0.tar.gz", hash = "sha256:7c419299103b13126bd02ec14bd2b47b946edb5a5eedf305e66a193b25f9a734", size = 1957570, upload-time = "2026-03-13T14:36:11.844Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/fa/a86c6ba66f0308c95b9288b1e3eaccd934b545646f63494a86f1ec2f8c8e/faker-40.11.0-py3-none-any.whl", hash = 
"sha256:0e9816c950528d2a37d74863f3ef389ea9a3a936cbcde0b11b8499942e25bf90", size = 1989457, upload-time = "2026-03-13T14:36:09.792Z" }, +] + [[package]] name = "filelock" version = "3.20.0" @@ -437,6 +540,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" }, ] +[[package]] +name = "graphviz" +version = "0.21" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/b3/3ac91e9be6b761a4b30d66ff165e54439dcd48b83f4e20d644867215f6ca/graphviz-0.21.tar.gz", hash = "sha256:20743e7183be82aaaa8ad6c93f8893c923bd6658a04c32ee115edb3c8a835f78", size = 200434, upload-time = "2025-06-15T09:35:05.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl", hash = "sha256:54f33de9f4f911d7e84e4191749cac8cc5653f815b06738c54db9a15ab8b1e42", size = 47300, upload-time = "2025-06-15T09:35:04.433Z" }, +] + [[package]] name = "greenlet" version = "3.2.4" @@ -446,7 +558,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = 
"https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -457,7 +568,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = 
"https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, - { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -479,7 +589,8 @@ name 
= "h5py" version = "3.15.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4d/6a/0d79de0b025aa85dc8864de8e97659c94cf3d23148394a954dc5ca52f8c8/h5py-3.15.1.tar.gz", hash = "sha256:c86e3ed45c4473564de55aa83b6fc9e5ead86578773dfbd93047380042e26b69", size = 426236, upload-time = "2025-10-16T10:35:27.404Z" } wheels = [ @@ -632,7 +743,8 @@ version = "2.5.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "ipython" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "pandas" }, ] sdist = { url = "https://files.pythonhosted.org/packages/02/a2/4652db589b5767ead6d1dd8016e94e6adc5ec9e9552ccd17cf1886900b04/itables-2.5.2.tar.gz", hash = "sha256:ec34bbacfbf4305570ea75b36970de442f924126f3701c323a5a46018de84c8a", size = 2356416, upload-time = "2025-09-02T20:14:41.722Z" } @@ -664,6 +776,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jmespath" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = 
"sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, +] + [[package]] name = "joblib" version = "1.5.2" @@ -687,7 +808,8 @@ name = "l0-python" version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "scipy" }, { name = "torch" }, ] @@ -799,7 +921,8 @@ version = "0.21.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "l0-python" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "optuna" }, { name = "pandas" }, { name = "torch" }, @@ -815,7 +938,8 @@ name = "microdf-python" version = "1.2.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "pandas" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/91/f0/9689f33e2524b0c0d1cdf0d556ad196bfbb2ec0292f4545f467a37b27773/microdf_python-1.2.2.tar.gz", hash = "sha256:7e5f6adc10b0469de0e6549789ede0a2e6c600d0f5c83eafffc009d1495a7933", size = 20395, upload-time = "2026-02-24T10:47:16.438Z" } @@ -829,7 +953,8 @@ version = "1.8.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "joblib" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "optuna" }, { name = "pandas" }, { name = "plotly" }, @@ -837,7 +962,8 @@ dependencies = [ { name = "pydantic" }, { name = "quantile-forest" }, { name = "requests" }, - { name = "scikit-learn" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "scipy" }, { name = "statsmodels" }, { name = "tqdm" }, @@ -945,7 +1071,8 @@ name = "numexpr" version = "2.14.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/cb/2f/fdba158c9dbe5caca9c3eca3eaffffb251f2fb8674bf8e2d0aed5f38d319/numexpr-2.14.1.tar.gz", hash = "sha256:4be00b1086c7b7a5c32e31558122b7b80243fe098579b170967da83f3152b48b", size = 119400, upload-time = "2025-10-13T16:17:27.351Z" } wheels = [ @@ -987,6 +1114,9 @@ wheels = [ name = 
"numpy" version = "2.1.3" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.14'", +] sdist = { url = "https://files.pythonhosted.org/packages/25/ca/1166b75c21abd1da445b97bf1fa2f14f423c6cfb4fc7c4ef31dccf9f6a94/numpy-2.1.3.tar.gz", hash = "sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761", size = 20166090, upload-time = "2024-11-02T17:48:55.832Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/4d/0b/620591441457e25f3404c8057eb924d04f161244cb8a3680d529419aa86e/numpy-2.1.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f", size = 20836263, upload-time = "2024-11-02T17:40:39.528Z" }, @@ -1011,6 +1141,59 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/09/a5ab407bd7f5f5599e6a9261f964ace03a73e7c6928de906981c31c38082/numpy-2.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4", size = 12644098, upload-time = "2024-11-02T17:46:07.941Z" }, ] +[[package]] +name = "numpy" +version = "2.4.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14'", +] +sdist = { url = "https://files.pythonhosted.org/packages/10/8b/c265f4823726ab832de836cdd184d0986dcf94480f81e8739692a7ac7af2/numpy-2.4.3.tar.gz", hash = "sha256:483a201202b73495f00dbc83796c6ae63137a9bdade074f7648b3e32613412dd", size = 20727743, upload-time = "2026-03-09T07:58:53.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/d0/1fe47a98ce0df229238b77611340aff92d52691bcbc10583303181abf7fc/numpy-2.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b346845443716c8e542d54112966383b448f4a3ba5c66409771b8c0889485dd3", size = 16665297, upload-time = "2026-03-09T07:56:52.296Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/d9/4e7c3f0e68dfa91f21c6fb6cf839bc829ec920688b1ce7ec722b1a6202fb/numpy-2.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2629289168f4897a3c4e23dc98d6f1731f0fc0fe52fb9db19f974041e4cc12b9", size = 14691853, upload-time = "2026-03-09T07:56:54.992Z" }, + { url = "https://files.pythonhosted.org/packages/3a/66/bd096b13a87549683812b53ab211e6d413497f84e794fb3c39191948da97/numpy-2.4.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:bb2e3cf95854233799013779216c57e153c1ee67a0bf92138acca0e429aefaee", size = 5198435, upload-time = "2026-03-09T07:56:57.184Z" }, + { url = "https://files.pythonhosted.org/packages/a2/2f/687722910b5a5601de2135c891108f51dfc873d8e43c8ed9f4ebb440b4a2/numpy-2.4.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:7f3408ff897f8ab07a07fbe2823d7aee6ff644c097cc1f90382511fe982f647f", size = 6546347, upload-time = "2026-03-09T07:56:59.531Z" }, + { url = "https://files.pythonhosted.org/packages/bf/ec/7971c4e98d86c564750393fab8d7d83d0a9432a9d78bb8a163a6dc59967a/numpy-2.4.3-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:decb0eb8a53c3b009b0962378065589685d66b23467ef5dac16cbe818afde27f", size = 15664626, upload-time = "2026-03-09T07:57:01.385Z" }, + { url = "https://files.pythonhosted.org/packages/7e/eb/7daecbea84ec935b7fc732e18f532073064a3816f0932a40a17f3349185f/numpy-2.4.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5f51900414fc9204a0e0da158ba2ac52b75656e7dce7e77fb9f84bfa343b4cc", size = 16608916, upload-time = "2026-03-09T07:57:04.008Z" }, + { url = "https://files.pythonhosted.org/packages/df/58/2a2b4a817ffd7472dca4421d9f0776898b364154e30c95f42195041dc03b/numpy-2.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6bd06731541f89cdc01b261ba2c9e037f1543df7472517836b78dfb15bd6e476", size = 17015824, upload-time = "2026-03-09T07:57:06.347Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/ca/627a828d44e78a418c55f82dd4caea8ea4a8ef24e5144d9e71016e52fb40/numpy-2.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:22654fe6be0e5206f553a9250762c653d3698e46686eee53b399ab90da59bd92", size = 18334581, upload-time = "2026-03-09T07:57:09.114Z" }, + { url = "https://files.pythonhosted.org/packages/cd/c0/76f93962fc79955fcba30a429b62304332345f22d4daec1cb33653425643/numpy-2.4.3-cp313-cp313-win32.whl", hash = "sha256:d71e379452a2f670ccb689ec801b1218cd3983e253105d6e83780967e899d687", size = 5958618, upload-time = "2026-03-09T07:57:11.432Z" }, + { url = "https://files.pythonhosted.org/packages/b1/3c/88af0040119209b9b5cb59485fa48b76f372c73068dbf9254784b975ac53/numpy-2.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:0a60e17a14d640f49146cb38e3f105f571318db7826d9b6fef7e4dce758faecd", size = 12312824, upload-time = "2026-03-09T07:57:13.586Z" }, + { url = "https://files.pythonhosted.org/packages/58/ce/3d07743aced3d173f877c3ef6a454c2174ba42b584ab0b7e6d99374f51ed/numpy-2.4.3-cp313-cp313-win_arm64.whl", hash = "sha256:c9619741e9da2059cd9c3f206110b97583c7152c1dc9f8aafd4beb450ac1c89d", size = 10221218, upload-time = "2026-03-09T07:57:16.183Z" }, + { url = "https://files.pythonhosted.org/packages/62/09/d96b02a91d09e9d97862f4fc8bfebf5400f567d8eb1fe4b0cc4795679c15/numpy-2.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7aa4e54f6469300ebca1d9eb80acd5253cdfa36f2c03d79a35883687da430875", size = 14819570, upload-time = "2026-03-09T07:57:18.564Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ca/0b1aba3905fdfa3373d523b2b15b19029f4f3031c87f4066bd9d20ef6c6b/numpy-2.4.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d1b90d840b25874cf5cd20c219af10bac3667db3876d9a495609273ebe679070", size = 5326113, upload-time = "2026-03-09T07:57:21.052Z" }, + { url = "https://files.pythonhosted.org/packages/c0/63/406e0fd32fcaeb94180fd6a4c41e55736d676c54346b7efbce548b94a914/numpy-2.4.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = 
"sha256:a749547700de0a20a6718293396ec237bb38218049cfce788e08fcb716e8cf73", size = 6646370, upload-time = "2026-03-09T07:57:22.804Z" }, + { url = "https://files.pythonhosted.org/packages/b6/d0/10f7dc157d4b37af92720a196be6f54f889e90dcd30dce9dc657ed92c257/numpy-2.4.3-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94f3c4a151a2e529adf49c1d54f0f57ff8f9b233ee4d44af623a81553ab86368", size = 15723499, upload-time = "2026-03-09T07:57:24.693Z" }, + { url = "https://files.pythonhosted.org/packages/66/f1/d1c2bf1161396629701bc284d958dc1efa3a5a542aab83cf11ee6eb4cba5/numpy-2.4.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22c31dc07025123aedf7f2db9e91783df13f1776dc52c6b22c620870dc0fab22", size = 16657164, upload-time = "2026-03-09T07:57:27.676Z" }, + { url = "https://files.pythonhosted.org/packages/1a/be/cca19230b740af199ac47331a21c71e7a3d0ba59661350483c1600d28c37/numpy-2.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:148d59127ac95979d6f07e4d460f934ebdd6eed641db9c0db6c73026f2b2101a", size = 17081544, upload-time = "2026-03-09T07:57:30.664Z" }, + { url = "https://files.pythonhosted.org/packages/b9/c5/9602b0cbb703a0936fb40f8a95407e8171935b15846de2f0776e08af04c7/numpy-2.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a97cbf7e905c435865c2d939af3d93f99d18eaaa3cabe4256f4304fb51604349", size = 18380290, upload-time = "2026-03-09T07:57:33.763Z" }, + { url = "https://files.pythonhosted.org/packages/ed/81/9f24708953cd30be9ee36ec4778f4b112b45165812f2ada4cc5ea1c1f254/numpy-2.4.3-cp313-cp313t-win32.whl", hash = "sha256:be3b8487d725a77acccc9924f65fd8bce9af7fac8c9820df1049424a2115af6c", size = 6082814, upload-time = "2026-03-09T07:57:36.491Z" }, + { url = "https://files.pythonhosted.org/packages/e2/9e/52f6eaa13e1a799f0ab79066c17f7016a4a8ae0c1aefa58c82b4dab690b4/numpy-2.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1ec84fd7c8e652b0f4aaaf2e6e9cc8eaa9b1b80a537e06b2e3a2fb176eedcb26", size = 12452673, 
upload-time = "2026-03-09T07:57:38.281Z" }, + { url = "https://files.pythonhosted.org/packages/c4/04/b8cece6ead0b30c9fbd99bb835ad7ea0112ac5f39f069788c5558e3b1ab2/numpy-2.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:120df8c0a81ebbf5b9020c91439fccd85f5e018a927a39f624845be194a2be02", size = 10290907, upload-time = "2026-03-09T07:57:40.747Z" }, + { url = "https://files.pythonhosted.org/packages/70/ae/3936f79adebf8caf81bd7a599b90a561334a658be4dcc7b6329ebf4ee8de/numpy-2.4.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:5884ce5c7acfae1e4e1b6fde43797d10aa506074d25b531b4f54bde33c0c31d4", size = 16664563, upload-time = "2026-03-09T07:57:43.817Z" }, + { url = "https://files.pythonhosted.org/packages/9b/62/760f2b55866b496bb1fa7da2a6db076bef908110e568b02fcfc1422e2a3a/numpy-2.4.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:297837823f5bc572c5f9379b0c9f3a3365f08492cbdc33bcc3af174372ebb168", size = 14702161, upload-time = "2026-03-09T07:57:46.169Z" }, + { url = "https://files.pythonhosted.org/packages/32/af/a7a39464e2c0a21526fb4fb76e346fb172ebc92f6d1c7a07c2c139cc17b1/numpy-2.4.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:a111698b4a3f8dcbe54c64a7708f049355abd603e619013c346553c1fd4ca90b", size = 5208738, upload-time = "2026-03-09T07:57:48.506Z" }, + { url = "https://files.pythonhosted.org/packages/29/8c/2a0cf86a59558fa078d83805589c2de490f29ed4fb336c14313a161d358a/numpy-2.4.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:4bd4741a6a676770e0e97fe9ab2e51de01183df3dcbcec591d26d331a40de950", size = 6543618, upload-time = "2026-03-09T07:57:50.591Z" }, + { url = "https://files.pythonhosted.org/packages/aa/b8/612ce010c0728b1c363fa4ea3aa4c22fe1c5da1de008486f8c2f5cb92fae/numpy-2.4.3-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54f29b877279d51e210e0c80709ee14ccbbad647810e8f3d375561c45ef613dd", size = 15680676, upload-time = "2026-03-09T07:57:52.34Z" }, + { url = 
"https://files.pythonhosted.org/packages/a9/7e/4f120ecc54ba26ddf3dc348eeb9eb063f421de65c05fc961941798feea18/numpy-2.4.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:679f2a834bae9020f81534671c56fd0cc76dd7e5182f57131478e23d0dc59e24", size = 16613492, upload-time = "2026-03-09T07:57:54.91Z" }, + { url = "https://files.pythonhosted.org/packages/2c/86/1b6020db73be330c4b45d5c6ee4295d59cfeef0e3ea323959d053e5a6909/numpy-2.4.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d84f0f881cb2225c2dfd7f78a10a5645d487a496c6668d6cc39f0f114164f3d0", size = 17031789, upload-time = "2026-03-09T07:57:57.641Z" }, + { url = "https://files.pythonhosted.org/packages/07/3a/3b90463bf41ebc21d1b7e06079f03070334374208c0f9a1f05e4ae8455e7/numpy-2.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d213c7e6e8d211888cc359bab7199670a00f5b82c0978b9d1c75baf1eddbeac0", size = 18339941, upload-time = "2026-03-09T07:58:00.577Z" }, + { url = "https://files.pythonhosted.org/packages/a8/74/6d736c4cd962259fd8bae9be27363eb4883a2f9069763747347544c2a487/numpy-2.4.3-cp314-cp314-win32.whl", hash = "sha256:52077feedeff7c76ed7c9f1a0428558e50825347b7545bbb8523da2cd55c547a", size = 6007503, upload-time = "2026-03-09T07:58:03.331Z" }, + { url = "https://files.pythonhosted.org/packages/48/39/c56ef87af669364356bb011922ef0734fc49dad51964568634c72a009488/numpy-2.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:0448e7f9caefb34b4b7dd2b77f21e8906e5d6f0365ad525f9f4f530b13df2afc", size = 12444915, upload-time = "2026-03-09T07:58:06.353Z" }, + { url = "https://files.pythonhosted.org/packages/9d/1f/ab8528e38d295fd349310807496fabb7cf9fe2e1f70b97bc20a483ea9d4a/numpy-2.4.3-cp314-cp314-win_arm64.whl", hash = "sha256:b44fd60341c4d9783039598efadd03617fa28d041fc37d22b62d08f2027fa0e7", size = 10494875, upload-time = "2026-03-09T07:58:08.734Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/ef/b7c35e4d5ef141b836658ab21a66d1a573e15b335b1d111d31f26c8ef80f/numpy-2.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0a195f4216be9305a73c0e91c9b026a35f2161237cf1c6de9b681637772ea657", size = 14822225, upload-time = "2026-03-09T07:58:11.034Z" }, + { url = "https://files.pythonhosted.org/packages/cd/8d/7730fa9278cf6648639946cc816e7cc89f0d891602584697923375f801ed/numpy-2.4.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:cd32fbacb9fd1bf041bf8e89e4576b6f00b895f06d00914820ae06a616bdfef7", size = 5328769, upload-time = "2026-03-09T07:58:13.67Z" }, + { url = "https://files.pythonhosted.org/packages/47/01/d2a137317c958b074d338807c1b6a383406cdf8b8e53b075d804cc3d211d/numpy-2.4.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:2e03c05abaee1f672e9d67bc858f300b5ccba1c21397211e8d77d98350972093", size = 6649461, upload-time = "2026-03-09T07:58:15.912Z" }, + { url = "https://files.pythonhosted.org/packages/5c/34/812ce12bc0f00272a4b0ec0d713cd237cb390666eb6206323d1cc9cedbb2/numpy-2.4.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d1ce23cce91fcea443320a9d0ece9b9305d4368875bab09538f7a5b4131938a", size = 15725809, upload-time = "2026-03-09T07:58:17.787Z" }, + { url = "https://files.pythonhosted.org/packages/25/c0/2aed473a4823e905e765fee3dc2cbf504bd3e68ccb1150fbdabd5c39f527/numpy-2.4.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c59020932feb24ed49ffd03704fbab89f22aa9c0d4b180ff45542fe8918f5611", size = 16655242, upload-time = "2026-03-09T07:58:20.476Z" }, + { url = "https://files.pythonhosted.org/packages/f2/c8/7e052b2fc87aa0e86de23f20e2c42bd261c624748aa8efd2c78f7bb8d8c6/numpy-2.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9684823a78a6cd6ad7511fc5e25b07947d1d5b5e2812c93fe99d7d4195130720", size = 17080660, upload-time = "2026-03-09T07:58:23.067Z" }, + { url = 
"https://files.pythonhosted.org/packages/f3/3d/0876746044db2adcb11549f214d104f2e1be00f07a67edbb4e2812094847/numpy-2.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0200b25c687033316fb39f0ff4e3e690e8957a2c3c8d22499891ec58c37a3eb5", size = 18380384, upload-time = "2026-03-09T07:58:25.839Z" }, + { url = "https://files.pythonhosted.org/packages/07/12/8160bea39da3335737b10308df4f484235fd297f556745f13092aa039d3b/numpy-2.4.3-cp314-cp314t-win32.whl", hash = "sha256:5e10da9e93247e554bb1d22f8edc51847ddd7dde52d85ce31024c1b4312bfba0", size = 6154547, upload-time = "2026-03-09T07:58:28.289Z" }, + { url = "https://files.pythonhosted.org/packages/42/f3/76534f61f80d74cc9cdf2e570d3d4eeb92c2280a27c39b0aaf471eda7b48/numpy-2.4.3-cp314-cp314t-win_amd64.whl", hash = "sha256:45f003dbdffb997a03da2d1d0cb41fbd24a87507fb41605c0420a3db5bd4667b", size = 12633645, upload-time = "2026-03-09T07:58:30.384Z" }, + { url = "https://files.pythonhosted.org/packages/1f/b6/7c0d4334c15983cec7f92a69e8ce9b1e6f31857e5ee3a413ac424e6bd63d/numpy-2.4.3-cp314-cp314t-win_arm64.whl", hash = "sha256:4d382735cecd7bcf090172489a525cd7d4087bc331f7df9f60ddc9a296cf208e", size = 10565454, upload-time = "2026-03-09T07:58:33.031Z" }, +] + [[package]] name = "nvidia-cublas-cu12" version = "12.8.4.1" @@ -1173,7 +1356,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "alembic" }, { name = "colorlog" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "packaging" }, { name = "pyyaml" }, { name = "sqlalchemy" }, @@ -1198,7 +1382,8 @@ name = "pandas" version = "2.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, 
marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -1256,7 +1441,8 @@ name = "patsy" version = "1.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/be/44/ed13eccdd0519eff265f44b670d46fbb0ec813e2274932dc1c0e48520f7d/patsy-1.0.2.tar.gz", hash = "sha256:cdc995455f6233e90e22de72c37fcadb344e7586fb83f06696f54d92f8ce74c0", size = 399942, upload-time = "2025-10-20T16:17:37.535Z" } wheels = [ @@ -1333,7 +1519,8 @@ dependencies = [ { name = "ipython" }, { name = "microdf-python" }, { name = "numexpr" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "pandas" }, { name = "plotly" }, { name = "psutil" }, @@ -1366,7 +1553,7 @@ wheels = [ [[package]] name = "policyengine-uk-data" -version = "1.40.3" +version = "1.44.0" source = { editable = "." 
} dependencies = [ { name = "google-auth" }, @@ -1385,6 +1572,8 @@ dependencies = [ { name = "requests" }, { name = "rich" }, { name = "ruff" }, + { name = "scipy" }, + { name = "sdv" }, { name = "tabulate" }, { name = "tqdm" }, ] @@ -1427,6 +1616,8 @@ requires-dist = [ { name = "rich", specifier = ">=13.0.0" }, { name = "ruff", specifier = ">=0.9.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, + { name = "scipy" }, + { name = "sdv", specifier = ">=1.0.0" }, { name = "tables", marker = "extra == 'dev'" }, { name = "tabulate" }, { name = "torch", marker = "extra == 'dev'" }, @@ -1716,8 +1907,10 @@ name = "quantile-forest" version = "1.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, - { name = "scikit-learn" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "scipy" }, ] sdist = { url = "https://files.pythonhosted.org/packages/62/6e/3f1493d4abcce71fdc82ed575475d3e02da7b03375129e84be2622e1532f/quantile_forest-1.4.1.tar.gz", hash = "sha256:713a23c69562b7551ba4a05c22ce9d0e90db6a73d043e760b29c331cb19dc552", size = 486249, upload-time = "2025-09-10T12:48:04.578Z" } @@ -1729,6 +1922,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f2/be/f77c6705e974b23353c43da1cd93e11fe0afc7e859c2d14f748d25cc0376/quantile_forest-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:fe33f6a8b63b3617568cc1254e1802a70ce3ac23897790f3be10f8db5257fe83", size = 685417, upload-time = "2025-09-10T12:47:57.346Z" }, ] 
+[[package]] +name = "rdt" +version = "1.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "faker" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "python-dateutil" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6b/2d/0218de90d3f995ecc2c86a1bd5c6f6a6c1a4109979389e4dbecd308d1b90/rdt-1.20.0.tar.gz", hash = "sha256:2f68e62f1a722cccea8b2ac44cfb2b1ade6e632f54684d84cf6e86c2c5fce773", size = 65388, upload-time = "2026-01-23T20:07:36.578Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/f2/0a95df18b5f549932228a196cace4571253b088f30430c0eec8662056256/rdt-1.20.0-py3-none-any.whl", hash = "sha256:e239edde36fd2bc1de51c119d93105a074adc8fdd6aa1941fa50af61ccb65dac", size = 74468, upload-time = "2026-01-23T20:07:34.107Z" }, +] + [[package]] name = "requests" version = "2.32.5" @@ -1803,15 +2015,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/4e/cd76eca6db6115604b7626668e891c9dd03330384082e33662fb0f113614/ruff-0.15.5-py3-none-win_arm64.whl", hash = "sha256:b498d1c60d2fe5c10c45ec3f698901065772730b411f164ae270bb6bfcc4740b", size = 10965572, upload-time = "2026-03-05T20:06:16.984Z" }, ] +[[package]] +name = "s3transfer" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" }, +] + [[package]] name = "scikit-learn" version = "1.7.2" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.14'", +] dependencies = [ - { name = "joblib" }, - { name = "numpy" }, - { name = "scipy" }, - { name = "threadpoolctl" }, + { name = "joblib", marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scipy", marker = "python_full_version < '3.14'" }, + { name = "threadpoolctl", marker = "python_full_version < '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/98/c2/a7855e41c9d285dfe86dc50b250978105dce513d6e459ea66a6aeb0e1e0c/scikit_learn-1.7.2.tar.gz", hash = "sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda", size = 7193136, upload-time = "2025-09-09T08:21:29.075Z" } wheels = [ @@ -1832,12 +2059,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/87/24f541b6d62b1794939ae6422f8023703bbf6900378b2b34e0b4384dfefd/scikit_learn-1.7.2-cp314-cp314-win_amd64.whl", hash = "sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8", size = 8820007, upload-time = "2025-09-09T08:21:26.713Z" }, ] +[[package]] +name = "scikit-learn" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= 
'3.14'", +] +dependencies = [ + { name = "joblib", marker = "python_full_version >= '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "scipy", marker = "python_full_version >= '3.14'" }, + { name = "threadpoolctl", marker = "python_full_version >= '3.14'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770, upload-time = "2025-12-10T07:08:03.251Z" }, + { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458, upload-time = "2025-12-10T07:08:05.336Z" }, + { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341, upload-time = "2025-12-10T07:08:07.732Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022, upload-time = "2025-12-10T07:08:09.862Z" }, + { url = 
"https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409, upload-time = "2025-12-10T07:08:12.028Z" }, + { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760, upload-time = "2025-12-10T07:08:13.688Z" }, + { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045, upload-time = "2025-12-10T07:08:15.215Z" }, + { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324, upload-time = "2025-12-10T07:08:17.561Z" }, + { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651, upload-time = "2025-12-10T07:08:19.952Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045, upload-time = "2025-12-10T07:08:22.11Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994, upload-time = "2025-12-10T07:08:23.943Z" }, + { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518, upload-time = "2025-12-10T07:08:25.71Z" }, + { url = "https://files.pythonhosted.org/packages/24/05/1af2c186174cc92dcab2233f327336058c077d38f6fe2aceb08e6ab4d509/scikit_learn-1.8.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3", size = 8528667, upload-time = "2025-12-10T07:08:27.541Z" }, + { url = "https://files.pythonhosted.org/packages/a8/25/01c0af38fe969473fb292bba9dc2b8f9b451f3112ff242c647fee3d0dfe7/scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7", size = 8066524, upload-time = "2025-12-10T07:08:29.822Z" }, + { url = "https://files.pythonhosted.org/packages/be/ce/a0623350aa0b68647333940ee46fe45086c6060ec604874e38e9ab7d8e6c/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6", size = 8657133, upload-time = "2025-12-10T07:08:31.865Z" }, + { url = "https://files.pythonhosted.org/packages/b8/cb/861b41341d6f1245e6ca80b1c1a8c4dfce43255b03df034429089ca2a2c5/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4", size = 8923223, upload-time = "2025-12-10T07:08:34.166Z" }, + { url = 
"https://files.pythonhosted.org/packages/76/18/a8def8f91b18cd1ba6e05dbe02540168cb24d47e8dcf69e8d00b7da42a08/scikit_learn-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6", size = 8096518, upload-time = "2025-12-10T07:08:36.339Z" }, + { url = "https://files.pythonhosted.org/packages/d1/77/482076a678458307f0deb44e29891d6022617b2a64c840c725495bee343f/scikit_learn-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242", size = 7754546, upload-time = "2025-12-10T07:08:38.128Z" }, + { url = "https://files.pythonhosted.org/packages/2d/d1/ef294ca754826daa043b2a104e59960abfab4cf653891037d19dd5b6f3cf/scikit_learn-1.8.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7", size = 8848305, upload-time = "2025-12-10T07:08:41.013Z" }, + { url = "https://files.pythonhosted.org/packages/5b/e2/b1f8b05138ee813b8e1a4149f2f0d289547e60851fd1bb268886915adbda/scikit_learn-1.8.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9", size = 8432257, upload-time = "2025-12-10T07:08:42.873Z" }, + { url = "https://files.pythonhosted.org/packages/26/11/c32b2138a85dcb0c99f6afd13a70a951bfdff8a6ab42d8160522542fb647/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f", size = 8678673, upload-time = "2025-12-10T07:08:45.362Z" }, + { url = "https://files.pythonhosted.org/packages/c7/57/51f2384575bdec454f4fe4e7a919d696c9ebce914590abf3e52d47607ab8/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9", size = 8922467, upload-time = "2025-12-10T07:08:47.408Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/4d/748c9e2872637a57981a04adc038dacaa16ba8ca887b23e34953f0b3f742/scikit_learn-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2", size = 8774395, upload-time = "2025-12-10T07:08:49.337Z" }, + { url = "https://files.pythonhosted.org/packages/60/22/d7b2ebe4704a5e50790ba089d5c2ae308ab6bb852719e6c3bd4f04c3a363/scikit_learn-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c", size = 8002647, upload-time = "2025-12-10T07:08:51.601Z" }, +] + [[package]] name = "scipy" version = "1.16.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" } wheels = [ @@ -1883,6 +2152,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/47/a494741db7280eae6dc033510c319e34d42dd41b7ac0c7ead39354d1a2b5/scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562", size = 26464127, upload-time = "2025-10-28T17:38:11.34Z" }, ] +[[package]] +name = "sdmetrics" +version = "0.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "copulas" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = 
"https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "plotly" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "scipy" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1a/94/8b09a3df8d3572104528a3a52b7f7ea6e2b304495ef3e34ce570e4b653c1/sdmetrics-0.27.2.tar.gz", hash = "sha256:afb5b5b7084b62dbcecb3fdaeeac67e1c4ba4ed05f760342336bf67ea4e5cc52", size = 137564, upload-time = "2026-02-26T22:14:54.696Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/eb/c764f388b4b86b2c48e4c7af5cceba577d57dfc35e100e5ad59b8b3e5174/sdmetrics-0.27.2-py3-none-any.whl", hash = "sha256:c6ebb17850716bd290e7fb39c6bc520b2752c85961e5bafd63ca1fb55aa0adc1", size = 201455, upload-time = "2026-02-26T22:14:52.621Z" }, +] + +[[package]] +name = "sdv" +version = "1.34.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "boto3" }, + { name = "botocore" }, + { name = "cloudpickle" }, + { name = "copulas" }, + { name = "ctgan" }, + { name = "deepecho" }, + { name = "graphviz" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "platformdirs" }, + { name = "pyyaml" }, + { name = "rdt" }, + { name = "sdmetrics" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/8f/1f31e3ec68ed9ab28a7882416c627ecd3c51b37b6852c250afb45f82606a/sdv-1.34.3.tar.gz", hash = "sha256:e85238b9bbde276a9386c9221b7749f35a4e02ba53467674ee8123afb85be474", size = 174054, 
upload-time = "2026-03-06T20:34:15.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/8a/4991c5351a384c9f8f405e9483f045b01243c9c947374fcc9d77b97d6299/sdv-1.34.3-py3-none-any.whl", hash = "sha256:19e89eb0f48baf971a30b2dd5fcb9272d9939a7206813277542da4c50d5d238e", size = 200876, upload-time = "2026-03-06T20:34:13.385Z" }, +] + [[package]] name = "setuptools" version = "80.9.0" @@ -2089,7 +2404,8 @@ name = "statsmodels" version = "0.14.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "packaging" }, { name = "pandas" }, { name = "patsy" }, @@ -2130,7 +2446,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "blosc2" }, { name = "numexpr" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "packaging" }, { name = "py-cpuinfo" }, { name = "typing-extensions" }, From db6a4b71d6267be486b581c7d42053297e89181b Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Mar 2026 16:39:32 +0000 Subject: [PATCH 2/2] Add weight diagnostics visualisation script Produces charts showing weight distribution, Kish effective sample sizes by population slice, high-influence records table, influence heatmap, and weight-vs-influence scatter plot. 
--- analysis/visualise_weight_diagnostics.py | 243 +++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 analysis/visualise_weight_diagnostics.py diff --git a/analysis/visualise_weight_diagnostics.py b/analysis/visualise_weight_diagnostics.py new file mode 100644 index 00000000..0aa556ed --- /dev/null +++ b/analysis/visualise_weight_diagnostics.py @@ -0,0 +1,243 @@ +"""Visualise weight distribution diagnostics for the enhanced FRS. + +Produces a set of charts showing: +1. Weight distribution histogram (before regularisation) +2. Per-slice Kish effective sample sizes +3. Top high-influence records +4. Influence heatmap (top records x statistics) +""" + +import json +import logging +import sys + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +DATASET_PATH = "policyengine_uk_data/storage/enhanced_frs_2023_24.h5" +OUTPUT_PREFIX = "analysis/weight_diagnostics" +TIME_PERIOD = "2025" +# Use fewer reforms for speed; increase for production +N_REFORMS = 10 +THRESHOLD = 0.05 + + +def main(): + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.influence import ( + compute_influence_matrix, + compute_kish_effective_sample_size, + find_high_influence_records, + _build_slice_assignments, + ) + from policyengine_uk import Microsimulation + + logger.info("Loading dataset from %s", DATASET_PATH) + dataset = UKSingleYearDataset(file_path=DATASET_PATH) + + sim = Microsimulation(dataset=dataset) + sim.default_calculation_period = TIME_PERIOD + + weights = np.asarray( + sim.calculate("household_weight", map_to="household"), + dtype=float, + ) + + # ── 1. 
Weight distribution ────────────────────────────────────── + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + ax = axes[0] + ax.hist(weights, bins=100, edgecolor="white", alpha=0.8, color="#2563eb") + ax.set_xlabel("Household weight") + ax.set_ylabel("Count") + ax.set_title("Weight distribution (all households)") + ax.axvline( + np.median(weights), + color="red", + linestyle="--", + label=f"Median: {np.median(weights):,.0f}", + ) + ax.axvline( + np.percentile(weights, 90), + color="orange", + linestyle="--", + label=f"P90: {np.percentile(weights, 90):,.0f}", + ) + ax.axvline( + np.percentile(weights, 99), + color="darkred", + linestyle="--", + label=f"P99: {np.percentile(weights, 99):,.0f}", + ) + ax.legend() + + ax = axes[1] + log_weights = np.log10(np.maximum(weights, 1)) + ax.hist(log_weights, bins=80, edgecolor="white", alpha=0.8, color="#7c3aed") + ax.set_xlabel("log₁₀(weight)") + ax.set_ylabel("Count") + ax.set_title("Weight distribution (log scale)") + + plt.tight_layout() + fig.savefig(f"{OUTPUT_PREFIX}_weight_dist.png", dpi=150, bbox_inches="tight") + logger.info("Saved weight distribution plot") + plt.close() + + # ── 2. 
Kish effective sample size by slice ────────────────────── + slices = _build_slice_assignments(sim, TIME_PERIOD) + kish_data = {"overall": compute_kish_effective_sample_size(weights)} + for slice_name, labels in slices.items(): + for label in np.unique(labels): + if label is None: + continue + mask = labels == label + n_actual = mask.sum() + n_eff = compute_kish_effective_sample_size(weights, mask) + kish_data[f"{slice_name}={label}"] = n_eff + + kish_df = pd.DataFrame( + {"slice": list(kish_data.keys()), "kish_n_eff": list(kish_data.values())} + ).sort_values("kish_n_eff") + + fig, ax = plt.subplots(figsize=(10, max(6, len(kish_df) * 0.3))) + colors = [ + "#ef4444" if v < 100 else "#f59e0b" if v < 500 else "#22c55e" + for v in kish_df.kish_n_eff + ] + ax.barh(kish_df.slice, kish_df.kish_n_eff, color=colors, edgecolor="white") + ax.set_xlabel("Kish effective sample size") + ax.set_title("Effective sample size by population slice") + ax.axvline(100, color="red", linestyle=":", alpha=0.5, label="n_eff = 100") + ax.axvline(500, color="orange", linestyle=":", alpha=0.5, label="n_eff = 500") + ax.legend() + plt.tight_layout() + fig.savefig(f"{OUTPUT_PREFIX}_kish.png", dpi=150, bbox_inches="tight") + logger.info("Saved Kish ESS plot") + plt.close() + + # ── 3. 
Influence matrix ───────────────────────────────────────── + logger.info("Computing baseline influence matrix...") + infl = compute_influence_matrix(sim, TIME_PERIOD) + flagged = find_high_influence_records(infl, THRESHOLD) + + if not flagged.empty: + # Top flagged records table + fig, ax = plt.subplots(figsize=(12, max(4, len(flagged.head(20)) * 0.4))) + ax.axis("off") + table_data = flagged.head(20).copy() + table_data["max_influence"] = table_data["max_influence"].map( + lambda x: f"{x:.3f}" + ) + table = ax.table( + cellText=table_data.values, + colLabels=table_data.columns, + cellLoc="center", + loc="center", + ) + table.auto_set_font_size(False) + table.set_fontsize(8) + table.auto_set_column_width(col=list(range(len(table_data.columns)))) + ax.set_title( + f"Top {min(20, len(flagged))} high-influence records " + f"(threshold={THRESHOLD})", + fontsize=12, + pad=20, + ) + plt.tight_layout() + fig.savefig( + f"{OUTPUT_PREFIX}_flagged_records.png", dpi=150, bbox_inches="tight" + ) + logger.info("Saved flagged records table") + plt.close() + + # Influence heatmap for top records + top_n = min(15, len(flagged)) + top_indices = flagged.record_idx.iloc[:top_n].values + + # Select columns with highest max influence + col_maxes = infl.max(axis=0).sort_values(ascending=False) + top_cols = col_maxes.head(30).index + heatmap_data = infl.iloc[top_indices][top_cols] + + fig, ax = plt.subplots(figsize=(16, max(4, top_n * 0.5))) + im = ax.imshow(heatmap_data.values, aspect="auto", cmap="YlOrRd") + ax.set_yticks(range(top_n)) + ax.set_yticklabels([f"HH #{idx}" for idx in top_indices], fontsize=7) + ax.set_xticks(range(len(top_cols))) + ax.set_xticklabels( + [c.split("/")[-1][:25] for c in top_cols], + rotation=90, + fontsize=6, + ) + ax.set_title("Influence heatmap: top records × top statistics") + plt.colorbar(im, ax=ax, label="Influence fraction") + plt.tight_layout() + fig.savefig(f"{OUTPUT_PREFIX}_heatmap.png", dpi=150, bbox_inches="tight") + logger.info("Saved influence 
heatmap") + plt.close() + else: + logger.info("No records exceed influence threshold — no flagged records plot") + + # ── 4. Weight vs influence scatter ────────────────────────────── + max_infl_per_record = infl.max(axis=1) if not infl.empty else pd.Series(dtype=float) + + if not max_infl_per_record.empty: + fig, ax = plt.subplots(figsize=(10, 6)) + sc = ax.scatter( + weights, + max_infl_per_record.values, + alpha=0.3, + s=5, + c=np.log10(np.maximum(weights, 1)), + cmap="viridis", + ) + ax.set_xlabel("Household weight") + ax.set_ylabel("Max influence across all statistics") + ax.set_title("Weight vs maximum influence") + ax.axhline(THRESHOLD, color="red", linestyle="--", label=f"Threshold={THRESHOLD}") + ax.set_xscale("log") + ax.legend() + plt.colorbar(sc, ax=ax, label="log₁₀(weight)") + plt.tight_layout() + fig.savefig(f"{OUTPUT_PREFIX}_scatter.png", dpi=150, bbox_inches="tight") + logger.info("Saved weight vs influence scatter") + plt.close() + + # ── 5. Summary statistics ─────────────────────────────────────── + summary = { + "n_households": int(len(weights)), + "weight_mean": float(np.mean(weights)), + "weight_median": float(np.median(weights)), + "weight_p90": float(np.percentile(weights, 90)), + "weight_p99": float(np.percentile(weights, 99)), + "weight_max": float(np.max(weights)), + "weight_skewness": float( + np.mean(((weights - np.mean(weights)) / np.std(weights)) ** 3) + ), + "kish_overall": float(kish_data["overall"]), + "n_flagged_records": int(len(flagged)) if not flagged.empty else 0, + "threshold": THRESHOLD, + } + + with open(f"{OUTPUT_PREFIX}_summary.json", "w") as f: + json.dump(summary, f, indent=2) + logger.info("Saved summary to %s_summary.json", OUTPUT_PREFIX) + + # Print summary + print("\n" + "=" * 60) + print("WEIGHT DIAGNOSTICS SUMMARY") + print("=" * 60) + for k, v in summary.items(): + if isinstance(v, float): + print(f" {k}: {v:,.2f}") + else: + print(f" {k}: {v}") + print("=" * 60) + + +if __name__ == "__main__": + main()